From d215c872bbdf8ba733f9a9fbd586374b841fdfb5 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 23 Mar 2019 17:55:57 +0100 Subject: Add a new separate module for grid transfers/interpolation. --- meson.build | 2 + transfer.c | 346 +++++++++++++++++++++++++++++++++++++++++++++ transfer.h | 58 ++++++++ transfer_interp.asm | 74 ++++++++++ transfer_interp_template.c | 64 +++++++++ 5 files changed, 544 insertions(+) create mode 100644 transfer.c create mode 100644 transfer.h create mode 100644 transfer_interp.asm create mode 100644 transfer_interp_template.c diff --git a/meson.build b/meson.build index 3d5b035..ce5e57b 100644 --- a/meson.build +++ b/meson.build @@ -12,6 +12,7 @@ lib_src = [ 'mg2d.c', 'ndarray.c', 'residual_calc.c', + 'transfer.c', ] lib_obj = [] @@ -82,6 +83,7 @@ if nasm.found() nasm_sources = files( 'cpuid.asm', 'residual_calc.asm', + 'transfer_interp.asm', ) lib_obj += nasm_gen.process(nasm_sources) diff --git a/transfer.c b/transfer.c new file mode 100644 index 0000000..6c8f614 --- /dev/null +++ b/transfer.c @@ -0,0 +1,346 @@ +/* + * Grid transfer operators + * Copyright 2019 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "cpu.h" +#include "ndarray.h" +#include "transfer.h" + +typedef struct GridTransfer { + size_t priv_data_size; + int (*init)(GridTransferContext *ctx); + void (*free)(GridTransferContext *ctx); + int (*transfer)(GridTransferContext *ctx, NDArray *dst, const NDArray *src); +} GridTransfer; + +typedef struct GridTransferLagrange { + unsigned int stencil; + + void (*transfer_cont) (double *dst, ptrdiff_t dst_len, + const double *src, ptrdiff_t src_stride, + const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y); + void (*transfer_generic)(double *dst, ptrdiff_t dst_len, + const double *src, ptrdiff_t src_stride, + const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y, + ptrdiff_t dst_stride0, ptrdiff_t src_stride0); + + const NDArray *src; + NDArray *dst; + + ptrdiff_t *idx[2]; + double *fact[2]; +} GridTransferLagrange; + +#if HAVE_EXTERNAL_ASM +void mg2di_transfer_interp_line_cont_4_fma3(double *dst, ptrdiff_t dst_len, + const double *src, ptrdiff_t src_stride, + const ptrdiff_t *idx_x, + const double *fact_x, const double *fact_y); +#endif + +#define STENCIL 2 +#define CONTIGUOUS 0 +#include "transfer_interp_template.c" +#undef CONTIGUOUS + +#define CONTIGUOUS 1 +#include "transfer_interp_template.c" +#undef CONTIGUOUS +#undef STENCIL + +#define STENCIL 4 +#define CONTIGUOUS 0 +#include "transfer_interp_template.c" +#undef CONTIGUOUS + +#define CONTIGUOUS 1 +#include "transfer_interp_template.c" +#undef CONTIGUOUS +#undef STENCIL + +// generate the interpolation source indices and weights +static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) +{ + GridTransferLagrange *priv = ctx->priv; + const double step_dst = ctx->dst.step[dim]; + const double step_src = ctx->src.step[dim]; + const double scale = step_dst / step_src; + + double *coord_src; + + coord_src = calloc(priv->stencil, sizeof(*coord_src)); + if (!coord_src) + return -ENOMEM; + + for (ptrdiff_t idx_dst = 0; idx_dst < ctx->dst.size[dim]; idx_dst++) { + const ptrdiff_t idx_src = scale * idx_dst - (priv->stencil / 2 - 1); + const double coord_dst = idx_dst * step_dst; + + double *fact = priv->fact[dim] + priv->stencil * idx_dst; + + for (int i = 0; i < priv->stencil; i++) + coord_src[i] = (idx_src + i) * step_src; + + for (int i = 0; i < priv->stencil; i++) { + fact[i] = 1.0; + for (int j = 0; j < priv->stencil; j++) { + if (i == j) + continue; + + fact[i] *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]); + } + } + + priv->idx[dim][idx_dst] = idx_src; + } + + free(coord_src); + + return 0; +} + +static int transfer_lagrange_init(GridTransferContext *ctx) +{ + GridTransferLagrange *priv = ctx->priv; + int ret; + + switch (ctx->op) { + case GRID_TRANSFER_LAGRANGE_1: + priv->stencil = 2; + priv->transfer_cont = interp_transfer_line_cont_2; + priv->transfer_generic = interp_transfer_line_generic_2; + break; + case GRID_TRANSFER_LAGRANGE_3: + priv->transfer_cont = interp_transfer_line_cont_4; + priv->transfer_generic = interp_transfer_line_generic_4; + priv->stencil = 4; + +#if HAVE_EXTERNAL_ASM + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { + priv->transfer_cont = mg2di_transfer_interp_line_cont_4_fma3; + } +#endif + break; + default: + return -EINVAL; + } + + for (int dim = 0; dim < 2; dim++) { + priv->idx[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->idx[dim])); + if (!priv->idx[dim]) + return -ENOMEM; + + priv->fact[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->fact[dim]) * priv->stencil); + if (!priv->fact[dim]) + return -ENOMEM; + + ret = transfer_factors_lagrange(ctx, dim); + if (ret < 0) + return ret; + } + + return 0; +} + +static void transfer_lagrange_free(GridTransferContext *ctx) +{ + GridTransferLagrange *priv = ctx->priv; + + for (int dim = 0; dim < 2; dim++) { + free(priv->idx[dim]); + free(priv->fact[dim]); + } +} + +static int transfer_interp_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + GridTransferLagrange *priv = arg; + const NDArray *src = priv->src; + NDArray *dst = priv->dst; + + const ptrdiff_t idx_y = priv->idx[0][job_idx]; + const double *fact_y = priv->fact[0] + priv->stencil * job_idx; + + if (dst->stride[1] == 1 && src->stride[1] == 1) { + priv->transfer_cont(dst->data + job_idx * dst->stride[0], dst->shape[1], + src->data + idx_y * src->stride[0], src->stride[0], + priv->idx[1], priv->fact[1], fact_y); + } else { + priv->transfer_generic(dst->data + job_idx * dst->stride[0], dst->shape[1], + src->data + idx_y * src->stride[0], src->stride[0], + priv->idx[1], priv->fact[1], fact_y, dst->stride[1], src->stride[1]); + } + + return 0; +} + +static int transfer_lagrange_transfer(GridTransferContext *ctx, + NDArray *dst, const NDArray *src) +{ + GridTransferLagrange *priv = ctx->priv; + + priv->src = src; + priv->dst = dst; + + tp_execute(ctx->tp, ctx->dst.size[0], transfer_interp_task, priv); + + priv->src = NULL; + priv->dst = NULL; + + return 0; +} + +static const GridTransfer transfer_lagrange = { + .priv_data_size = sizeof(GridTransferLagrange), + .init = transfer_lagrange_init, + .free = transfer_lagrange_free, + .transfer = transfer_lagrange_transfer, +}; + +typedef struct FWThreadData { + NDArray *dst; + const NDArray *src; +} FWThreadData; + +static int fw_1_transfer_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + FWThreadData *td = arg; + + const double *sd = td->src->data + job_idx * 2 * td->src->stride[0]; + const ptrdiff_t ss0 = td->src->stride[0]; + const ptrdiff_t ss1 = td->src->stride[1]; + + double *dd = td->dst->data + job_idx * td->dst->stride[0]; + const ptrdiff_t ds1 = td->dst->stride[1]; + + for (size_t x = 0; x < td->dst->shape[1]; x++) { + *dd = 0.25 * sd[0] + 0.125 * (sd[ss1] + sd[-ss1] + sd[ss0] + sd[-ss0]) + + 0.0625 * (sd[ss0 + ss1] + sd[ss0 - ss1] + sd[-ss0 + ss1] + sd[-ss0 - ss1]); + dd += ds1; + sd += ss1 * 2; + } + + return 0; +} +static int transfer_fw_1_transfer(GridTransferContext *ctx, + NDArray *dst, const NDArray *src) +{ + FWThreadData td = { dst, src }; + + tp_execute(ctx->tp, ctx->dst.size[0], fw_1_transfer_task, &td); + + return 0; +} + +static const GridTransfer transfer_fw_1 = { + .priv_data_size = sizeof(GridTransferLagrange), + .transfer = transfer_fw_1_transfer, +}; + +static const GridTransfer *transfer_ops[] = { + [GRID_TRANSFER_LAGRANGE_1] = &transfer_lagrange, + [GRID_TRANSFER_LAGRANGE_3] = &transfer_lagrange, + [GRID_TRANSFER_FW_1] = &transfer_fw_1, +}; + +int mg2di_gt_transfer(GridTransferContext *ctx, + NDArray *dst, const NDArray *src) +{ + const GridTransfer *t = transfer_ops[ctx->op]; + + if (dst->dims != 2 || src->dims != 2 || + dst->shape[0] != ctx->dst.size[0] || dst->shape[1] != ctx->dst.size[1] || + src->shape[0] != ctx->src.size[0] || src->shape[1] != ctx->src.size[1]) + return -EINVAL; + + return t->transfer(ctx, dst, src); +} + +GridTransferContext *mg2di_gt_alloc(enum GridTransferOperator op) +{ + GridTransferContext *ctx; + const GridTransfer *t; + + if (op < 0 || op > ARRAY_ELEMS(transfer_ops)) + return NULL; + t = transfer_ops[op]; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return NULL; + + if (t->priv_data_size) { + ctx->priv = calloc(1, t->priv_data_size); + if (!ctx->priv) { + free(ctx); + return NULL; + } + } + ctx->op = op; + + return ctx; +} + +int mg2di_gt_init(GridTransferContext *ctx) +{ + const GridTransfer *t = transfer_ops[ctx->op]; + int ret; + + if (!ctx->tp) + return -EINVAL; + + /* TODO check that the destination grid is not outside the source one */ + + if (t->init) { + ret = t->init(ctx); + if (ret < 0) + return ret; + } + + return 0; +} + +void mg2di_gt_free(GridTransferContext **pctx) +{ + GridTransferContext *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + const GridTransfer *t = transfer_ops[ctx->op]; + + if (t->free) + t->free(ctx); + } + + free(ctx->priv); + + free(ctx); + *pctx = NULL; +} diff --git a/transfer.h b/transfer.h new file mode 100644 index 0000000..f9d0870 --- /dev/null +++ b/transfer.h @@ -0,0 +1,58 @@ +/* + * Grid transfer operators + * Copyright 2018 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef MG2D_TRANSFER_H +#define MG2D_TRANSFER_H + +#include +#include + +#include "ndarray.h" + +enum GridTransferOperator { + GRID_TRANSFER_LAGRANGE_1, + GRID_TRANSFER_LAGRANGE_3, + GRID_TRANSFER_FW_1, +}; + +typedef struct RegularGrid2D { + size_t size[2]; + double step[2]; +} RegularGrid2D; + +typedef struct GridTransferContext { + void *priv; + + TPContext *tp; + int cpuflags; + + enum GridTransferOperator op; + + RegularGrid2D src; + RegularGrid2D dst; +} GridTransferContext; + +GridTransferContext *mg2di_gt_alloc(enum GridTransferOperator op); +int mg2di_gt_init(GridTransferContext *ctx); + +void mg2di_gt_free(GridTransferContext **ctx); + +int mg2di_gt_transfer(GridTransferContext *ctx, + NDArray *dst, const NDArray *src); + +#endif // MG2D_TRANSFER_H diff --git a/transfer_interp.asm b/transfer_interp.asm new file mode 100644 index 0000000..a6ae60f --- /dev/null +++ b/transfer_interp.asm @@ -0,0 +1,74 @@ +; +; SIMD for interpolation +; Copyright 2019 Anton Khirnov +; +; This program is free software: you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation, either version 3 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program. If not, see . +;/ + +%include "config.asm" +%include "x86inc.asm" + +SECTION .text + +INIT_YMM fma3 +cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\ + idx_x_val + shl src_strideq, 3 + shl dst_lenq, 3 + + add dstq, dst_lenq + add idx_xq, dst_lenq + lea fact_xq, [fact_xq + 4 * dst_lenq] + neg dst_lenq + ; from now on, the register that held the line size is used as the offset into data arrays + %define offsetq dst_lenq + + movu m0, [fact_yq] + vpermq m1, m0, 01010101b ; fact y + 1 -> m1 + vpermq m2, m0, 10101010b ; fact y + 2 -> m2 + vpermq m3, m0, 11111111b ; fact y + 3 -> m3 + vpermq m0, m0, 00000000b ; fact y + 0 -> m0 + +.loop: + mov idx_x_valq, [idx_xq + offsetq] + shl idx_x_valq, 3 + + xorpd m4, m4 + + movu m5, [fact_xq + 4 * offsetq] + + mulpd m6, m5, [srcq + idx_x_valq] + mulpd m6, m0 + + add idx_x_valq, src_strideq + mulpd m7, m5, [srcq + idx_x_valq] + vfmadd231pd m6, m7, m1 + + add idx_x_valq, src_strideq + mulpd m7, m5, [srcq + idx_x_valq] + vfmadd231pd m6, m7, m2 + + add idx_x_valq, src_strideq + mulpd m7, m5, [srcq + idx_x_valq] + vfmadd231pd m6, m7, m3 + + haddpd m6, m6 + vpermq m6, m6, 00001000b + haddpd m6, m6 + + movq [dstq + offsetq], xm6 + add offsetq, 8 + js .loop + + RET diff --git a/transfer_interp_template.c b/transfer_interp_template.c new file mode 100644 index 0000000..65ae98b --- /dev/null +++ b/transfer_interp_template.c @@ -0,0 +1,64 @@ +/* + * Polynomial interpolation template + * Copyright 2019 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#if CONTIGUOUS +# define CONT cont +#else +# define CONT generic +#endif + +#define JOIN3(a, b, c) a ## _ ## b ## _ ## c +#define FUNC2(name, cont, stencil) JOIN3(name, cont, stencil) +#define FUNC(name) FUNC2(name, CONT, STENCIL) + +static void +FUNC(interp_transfer_line)(double *dst, ptrdiff_t dst_len, + const double *src, ptrdiff_t src_stride, + const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y +#if !CONTIGUOUS + , ptrdiff_t dst_stride0, ptrdiff_t src_stride0 +# define SSTRIDE1 src_stride0 +# define DSTRIDE1 dst_stride0 +#else +# define SSTRIDE1 1 +# define DSTRIDE1 1 +#endif + ) +{ + for (size_t x = 0; x < dst_len; x++) { + const double *src_data = src + SSTRIDE1 * idx_x[x]; + + double val = 0.0; + + for (ptrdiff_t idx0 = 0; idx0 < STENCIL; idx0++) { + double tmp = 0.0; + for (ptrdiff_t idx1 = 0; idx1 < STENCIL; idx1++) + tmp += src_data[idx0 * src_stride + idx1 ] * fact_x[STENCIL * x + idx1]; + val += tmp * fact_y[idx0]; + } + + dst[x * DSTRIDE1] = val; + } +} + +#undef SSTRIDE1 +#undef DSTRIDE1 +#undef CONT +#undef JOIN3 +#undef FUNC2 +#undef FUNC -- cgit v1.2.3