Add a new separate module for grid transfers/interpolation.

author: Anton Khirnov <anton@khirnov.net> 2019-03-23 17:55:57 +0100
committer: Anton Khirnov <anton@khirnov.net> 2019-03-23 17:55:57 +0100
commit: d215c872bbdf8ba733f9a9fbd586374b841fdfb5 (patch)
tree: fb0072a8dc35179ef35d86c6fbb057b6c504445c
parent: bd178d67da6a8c30b3ccbd020be1b00f42eceb53 (diff)
5 files changed, 544 insertions, 0 deletions
diff --git a/meson.build b/meson.build
index 3d5b035..ce5e57b 100644
--- a/meson.build
+++ b/meson.build
@@ -12,6 +12,7 @@ lib_src = [
     'mg2d.c',
     'ndarray.c',
     'residual_calc.c',
+    'transfer.c',
 ]
 lib_obj = []
 
@@ -82,6 +83,7 @@ if nasm.found()
     nasm_sources = files(
         'cpuid.asm',
         'residual_calc.asm',
+        'transfer_interp.asm',
     )
 
     lib_obj += nasm_gen.process(nasm_sources)
diff --git a/transfer.c b/transfer.c
new file mode 100644
index 0000000..6c8f614
--- /dev/null
+++ b/transfer.c
@@ -0,0 +1,346 @@
+/*
+ * Grid transfer operators
+ * Copyright 2019 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <threadpool.h>
+
+#include "common.h"
+#include "cpu.h"
+#include "ndarray.h"
+#include "transfer.h"
+
+typedef struct GridTransfer {
+    size_t priv_data_size;
+    int    (*init)(GridTransferContext *ctx);
+    void   (*free)(GridTransferContext *ctx);
+    int    (*transfer)(GridTransferContext *ctx, NDArray *dst, const NDArray *src);
+} GridTransfer;
+
+typedef struct GridTransferLagrange {
+    unsigned int stencil;
+
+    void (*transfer_cont)   (double *dst, ptrdiff_t dst_len,
+                             const double *src, ptrdiff_t src_stride,
+                             const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y);
+    void (*transfer_generic)(double *dst, ptrdiff_t dst_len,
+                             const double *src, ptrdiff_t src_stride,
+                             const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y,
+                             ptrdiff_t dst_stride0, ptrdiff_t src_stride0);
+
+    const NDArray *src;
+    NDArray       *dst;
+
+    ptrdiff_t *idx[2];
+    double    *fact[2];
+} GridTransferLagrange;
+
+#if HAVE_EXTERNAL_ASM
+void mg2di_transfer_interp_line_cont_4_fma3(double *dst, ptrdiff_t dst_len,
+                                            const double *src, ptrdiff_t src_stride,
+                                            const ptrdiff_t *idx_x,
+                                            const double *fact_x, const double *fact_y);
+#endif
+
+#define STENCIL 2
+#define CONTIGUOUS 0
+#include "transfer_interp_template.c"
+#undef CONTIGUOUS
+
+#define CONTIGUOUS 1
+#include "transfer_interp_template.c"
+#undef CONTIGUOUS
+#undef STENCIL
+
+#define STENCIL 4
+#define CONTIGUOUS 0
+#include "transfer_interp_template.c"
+#undef CONTIGUOUS
+
+#define CONTIGUOUS 1
+#include "transfer_interp_template.c"
+#undef CONTIGUOUS
+#undef STENCIL
+
+// generate the interpolation source indices and weights
+static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim)
+{
+    GridTransferLagrange *priv = ctx->priv;
+    const double step_dst = ctx->dst.step[dim];
+    const double step_src = ctx->src.step[dim];
+    const double scale    = step_dst / step_src;
+
+    double *coord_src;
+
+    coord_src = calloc(priv->stencil, sizeof(*coord_src));
+    if (!coord_src)
+        return -ENOMEM;
+
+    for (ptrdiff_t idx_dst = 0; idx_dst < ctx->dst.size[dim]; idx_dst++) {
+        const ptrdiff_t idx_src = scale * idx_dst - (priv->stencil / 2 - 1);
+        const double coord_dst  = idx_dst * step_dst;
+
+        double *fact = priv->fact[dim] + priv->stencil * idx_dst;
+
+        for (int i = 0; i < priv->stencil; i++)
+            coord_src[i] = (idx_src + i) * step_src;
+
+        for (int i = 0; i < priv->stencil; i++) {
+            fact[i] = 1.0;
+            for (int j = 0; j < priv->stencil; j++) {
+                if (i == j)
+                    continue;
+
+                fact[i] *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]);
+            }
+        }
+
+        priv->idx[dim][idx_dst] = idx_src;
+    }
+
+    free(coord_src);
+
+    return 0;
+}
+
+static int transfer_lagrange_init(GridTransferContext *ctx)
+{
+    GridTransferLagrange *priv = ctx->priv;
+    int ret;
+
+    switch (ctx->op) {
+    case GRID_TRANSFER_LAGRANGE_1:
+        priv->stencil = 2;
+        priv->transfer_cont    = interp_transfer_line_cont_2;
+        priv->transfer_generic = interp_transfer_line_generic_2;
+        break;
+    case GRID_TRANSFER_LAGRANGE_3:
+        priv->transfer_cont    = interp_transfer_line_cont_4;
+        priv->transfer_generic = interp_transfer_line_generic_4;
+        priv->stencil = 4;
+
+#if HAVE_EXTERNAL_ASM
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
+            priv->transfer_cont    = mg2di_transfer_interp_line_cont_4_fma3;
+        }
+#endif
+        break;
+    default:
+        return -EINVAL;
+    }
+
+    for (int dim = 0; dim < 2; dim++) {
+        priv->idx[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->idx[dim]));
+        if (!priv->idx[dim])
+            return -ENOMEM;
+
+        priv->fact[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->fact[dim]) * priv->stencil);
+        if (!priv->fact[dim])
+            return -ENOMEM;
+
+        ret = transfer_factors_lagrange(ctx, dim);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+static void transfer_lagrange_free(GridTransferContext *ctx)
+{
+    GridTransferLagrange *priv = ctx->priv;
+
+    for (int dim = 0; dim < 2; dim++) {
+        free(priv->idx[dim]);
+        free(priv->fact[dim]);
+    }
+}
+
+static int transfer_interp_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+    GridTransferLagrange *priv = arg;
+    const NDArray         *src = priv->src;
+    NDArray               *dst = priv->dst;
+
+    const ptrdiff_t     idx_y  = priv->idx[0][job_idx];
+    const double       *fact_y = priv->fact[0] + priv->stencil * job_idx;
+
+    if (dst->stride[1] == 1 && src->stride[1] == 1) {
+        priv->transfer_cont(dst->data + job_idx * dst->stride[0], dst->shape[1],
+                            src->data + idx_y * src->stride[0], src->stride[0],
+                            priv->idx[1], priv->fact[1], fact_y);
+    } else {
+        priv->transfer_generic(dst->data + job_idx * dst->stride[0], dst->shape[1],
+                               src->data + idx_y * src->stride[0], src->stride[0],
+                               priv->idx[1], priv->fact[1], fact_y, dst->stride[1], src->stride[1]);
+    }
+
+    return 0;
+}
+
+static int transfer_lagrange_transfer(GridTransferContext *ctx,
+                                      NDArray *dst, const NDArray *src)
+{
+    GridTransferLagrange *priv = ctx->priv;
+
+    priv->src = src;
+    priv->dst = dst;
+
+    tp_execute(ctx->tp, ctx->dst.size[0], transfer_interp_task, priv);
+
+    priv->src = NULL;
+    priv->dst = NULL;
+
+    return 0;
+}
+
+static const GridTransfer transfer_lagrange = {
+    .priv_data_size = sizeof(GridTransferLagrange),
+    .init           = transfer_lagrange_init,
+    .free           = transfer_lagrange_free,
+    .transfer       = transfer_lagrange_transfer,
+};
+
+typedef struct FWThreadData {
+    NDArray *dst;
+    const NDArray *src;
+} FWThreadData;
+
+static int fw_1_transfer_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+    FWThreadData *td = arg;
+
+    const double    *sd = td->src->data + job_idx * 2 * td->src->stride[0];
+    const ptrdiff_t ss0 = td->src->stride[0];
+    const ptrdiff_t ss1 = td->src->stride[1];
+
+    double          *dd = td->dst->data + job_idx * td->dst->stride[0];
+    const ptrdiff_t ds1 = td->dst->stride[1];
+
+    for (size_t x = 0; x < td->dst->shape[1]; x++) {
+        *dd = 0.25 * sd[0] + 0.125 * (sd[ss1] + sd[-ss1] + sd[ss0] + sd[-ss0])
+              + 0.0625 * (sd[ss0 + ss1] + sd[ss0 - ss1] + sd[-ss0 + ss1] + sd[-ss0 - ss1]);
+        dd += ds1;
+        sd += ss1 * 2;
+    }
+
+    return 0;
+}
+static int transfer_fw_1_transfer(GridTransferContext *ctx,
+                                  NDArray *dst, const NDArray *src)
+{
+    FWThreadData td = { dst, src };
+
+    tp_execute(ctx->tp, ctx->dst.size[0], fw_1_transfer_task, &td);
+
+    return 0;
+}
+
+static const GridTransfer transfer_fw_1 = {
+    .priv_data_size = sizeof(GridTransferLagrange),
+    .transfer       = transfer_fw_1_transfer,
+};
+
+static const GridTransfer *transfer_ops[] = {
+    [GRID_TRANSFER_LAGRANGE_1] = &transfer_lagrange,
+    [GRID_TRANSFER_LAGRANGE_3] = &transfer_lagrange,
+    [GRID_TRANSFER_FW_1]       = &transfer_fw_1,
+};
+
+int mg2di_gt_transfer(GridTransferContext *ctx,
+                      NDArray *dst, const NDArray *src)
+{
+    const GridTransfer *t = transfer_ops[ctx->op];
+
+    if (dst->dims != 2 || src->dims != 2 ||
+        dst->shape[0] != ctx->dst.size[0] || dst->shape[1] != ctx->dst.size[1] ||
+        src->shape[0] != ctx->src.size[0] || src->shape[1] != ctx->src.size[1])
+        return -EINVAL;
+
+    return t->transfer(ctx, dst, src);
+}
+
+GridTransferContext *mg2di_gt_alloc(enum GridTransferOperator op)
+{
+    GridTransferContext *ctx;
+    const GridTransfer *t;
+
+    if (op < 0 || op > ARRAY_ELEMS(transfer_ops))
+        return NULL;
+    t = transfer_ops[op];
+
+    ctx = calloc(1, sizeof(*ctx));
+    if (!ctx)
+        return NULL;
+
+    if (t->priv_data_size) {
+        ctx->priv = calloc(1, t->priv_data_size);
+        if (!ctx->priv) {
+            free(ctx);
+            return NULL;
+        }
+    }
+    ctx->op = op;
+
+    return ctx;
+}
+
+int mg2di_gt_init(GridTransferContext *ctx)
+{
+    const GridTransfer *t = transfer_ops[ctx->op];
+    int ret;
+
+    if (!ctx->tp)
+        return -EINVAL;
+
+    /* TODO check that the destination grid is not outside the source one */
+
+    if (t->init) {
+        ret = t->init(ctx);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+void mg2di_gt_free(GridTransferContext **pctx)
+{
+    GridTransferContext *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    if (ctx->priv) {
+        const GridTransfer *t = transfer_ops[ctx->op];
+
+        if (t->free)
+            t->free(ctx);
+    }
+
+    free(ctx->priv);
+
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/transfer.h b/transfer.h
new file mode 100644
index 0000000..f9d0870
--- /dev/null
+++ b/transfer.h
@@ -0,0 +1,58 @@
+/*
+ * Grid transfer operators
+ * Copyright 2018 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MG2D_TRANSFER_H
+#define MG2D_TRANSFER_H
+
+#include <stddef.h>
+#include <threadpool.h>
+
+#include "ndarray.h"
+
+enum GridTransferOperator {
+    GRID_TRANSFER_LAGRANGE_1,
+    GRID_TRANSFER_LAGRANGE_3,
+    GRID_TRANSFER_FW_1,
+};
+
+typedef struct RegularGrid2D {
+    size_t size[2];
+    double step[2];
+} RegularGrid2D;
+
+typedef struct GridTransferContext {
+    void *priv;
+
+    TPContext *tp;
+    int cpuflags;
+
+    enum GridTransferOperator op;
+
+    RegularGrid2D src;
+    RegularGrid2D dst;
+} GridTransferContext;
+
+GridTransferContext *mg2di_gt_alloc(enum GridTransferOperator op);
+int mg2di_gt_init(GridTransferContext *ctx);
+
+void mg2di_gt_free(GridTransferContext **ctx);
+
+int mg2di_gt_transfer(GridTransferContext *ctx,
+                      NDArray *dst, const NDArray *src);
+
+#endif // MG2D_TRANSFER_H
diff --git a/transfer_interp.asm b/transfer_interp.asm
new file mode 100644
index 0000000..a6ae60f
--- /dev/null
+++ b/transfer_interp.asm
@@ -0,0 +1,74 @@
+;
+; SIMD for interpolation
+; Copyright 2019 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+;/
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .text
+
+INIT_YMM fma3
+cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
+                                              idx_x_val
+    shl src_strideq, 3
+    shl dst_lenq,    3
+
+    add dstq,    dst_lenq
+    add idx_xq,  dst_lenq
+    lea fact_xq, [fact_xq + 4 * dst_lenq]
+    neg dst_lenq
+    ; from now on, the register that held the line size is used as the offset into data arrays
+    %define offsetq dst_lenq
+
+    movu m0, [fact_yq]
+    vpermq  m1, m0, 01010101b                           ; fact y + 1 -> m1
+    vpermq  m2, m0, 10101010b                           ; fact y + 2 -> m2
+    vpermq  m3, m0, 11111111b                           ; fact y + 3 -> m3
+    vpermq  m0, m0, 00000000b                           ; fact y + 0 -> m0
+
+.loop:
+    mov idx_x_valq, [idx_xq + offsetq]
+    shl idx_x_valq, 3
+
+    xorpd m4, m4
+
+    movu  m5, [fact_xq + 4 * offsetq]
+
+    mulpd m6, m5, [srcq + idx_x_valq]
+    mulpd m6, m0
+
+    add idx_x_valq, src_strideq
+    mulpd m7, m5, [srcq + idx_x_valq]
+    vfmadd231pd m6, m7, m1
+
+    add idx_x_valq, src_strideq
+    mulpd m7, m5, [srcq + idx_x_valq]
+    vfmadd231pd m6, m7, m2
+
+    add idx_x_valq, src_strideq
+    mulpd m7, m5, [srcq + idx_x_valq]
+    vfmadd231pd m6, m7, m3
+
+    haddpd m6, m6
+    vpermq m6, m6, 00001000b
+    haddpd m6, m6
+
+    movq [dstq + offsetq], xm6
+    add offsetq, 8
+    js .loop
+
+    RET
diff --git a/transfer_interp_template.c b/transfer_interp_template.c
new file mode 100644
index 0000000..65ae98b
--- /dev/null
+++ b/transfer_interp_template.c
@@ -0,0 +1,64 @@
+/*
+ * Polynomial interpolation template
+ * Copyright 2019 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if CONTIGUOUS
+# define CONT cont
+#else
+# define CONT generic
+#endif
+
+#define JOIN3(a, b, c) a ## _ ## b ## _ ## c
+#define FUNC2(name, cont, stencil) JOIN3(name, cont, stencil)
+#define FUNC(name) FUNC2(name, CONT, STENCIL)
+
+static void
+FUNC(interp_transfer_line)(double *dst, ptrdiff_t dst_len,
+                           const double *src, ptrdiff_t src_stride,
+                           const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y
+#if !CONTIGUOUS
+                           , ptrdiff_t dst_stride0, ptrdiff_t src_stride0
+# define SSTRIDE1 src_stride0
+# define DSTRIDE1 dst_stride0
+#else
+# define SSTRIDE1 1
+# define DSTRIDE1 1
+#endif
+                           )
+{
+    for (size_t x = 0; x < dst_len; x++) {
+        const double *src_data = src + SSTRIDE1 * idx_x[x];
+
+        double val = 0.0;
+
+        for (ptrdiff_t idx0 = 0; idx0 < STENCIL; idx0++) {
+            double tmp = 0.0;
+            for (ptrdiff_t idx1 = 0; idx1 < STENCIL; idx1++)
+                tmp += src_data[idx0 * src_stride + idx1 ] * fact_x[STENCIL * x + idx1];
+            val += tmp * fact_y[idx0];
+        }
+
+        dst[x * DSTRIDE1] = val;
+    }
+}
+
+#undef SSTRIDE1
+#undef DSTRIDE1
+#undef CONT
+#undef JOIN3
+#undef FUNC2
+#undef FUNC
author	Anton Khirnov <anton@khirnov.net>	2019-03-23 17:55:57 +0100
committer	Anton Khirnov <anton@khirnov.net>	2019-03-23 17:55:57 +0100
commit	d215c872bbdf8ba733f9a9fbd586374b841fdfb5 (patch)
tree	fb0072a8dc35179ef35d86c6fbb057b6c504445c
parent	bd178d67da6a8c30b3ccbd020be1b00f42eceb53 (diff)