From e1d1bdc0d20cc379f2cf31fca84f6179eba0cb36 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sat, 23 Mar 2019 19:34:48 +0100
Subject: mg2d: use appropriate full-weighted restriction for 3rd order FDs

---
 transfer.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

(limited to 'transfer.c')

diff --git a/transfer.c b/transfer.c
index 046d96d..232d6e1 100644
--- a/transfer.c
+++ b/transfer.c
@@ -229,18 +229,26 @@ typedef struct FWThreadData {
 
 static int fw_1_transfer_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
 {
+    static const double factors[] = { 0.5, 1.0, 0.5 };
+    static const size_t order = ARRAY_ELEMS(factors) / 2;
+
     FWThreadData *td = arg;
 
-    const double    *sd = td->src->data + job_idx * 2 * td->src->stride[0];
     const ptrdiff_t ss0 = td->src->stride[0];
     const ptrdiff_t ss1 = td->src->stride[1];
+    const double    *sd = td->src->data + (job_idx * 2 - order) * ss0 - order * ss1;
 
     double          *dd = td->dst->data + job_idx * td->dst->stride[0];
     const ptrdiff_t ds1 = td->dst->stride[1];
 
     for (size_t x = 0; x < td->dst->shape[1]; x++) {
-        *dd = 0.25 * sd[0] + 0.125 * (sd[ss1] + sd[-ss1] + sd[ss0] + sd[-ss0])
-              + 0.0625 * (sd[ss0 + ss1] + sd[ss0 - ss1] + sd[-ss0 + ss1] + sd[-ss0 - ss1]);
+        double val = 0.0;
+
+        for (int i = 0; i < ARRAY_ELEMS(factors); i++)
+            for (int j = 0; j < ARRAY_ELEMS(factors); j++)
+                val += sd[i * ss0 + j * ss1] * factors[i] * factors[j];
+
+        *dd = 0.25 * val;
         dd += ds1;
         sd += ss1 * 2;
     }
@@ -262,10 +270,53 @@ static const GridTransfer transfer_fw_1 = {
     .transfer       = transfer_fw_1_transfer,
 };
 
+static int fw_3_transfer_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+    static const double factors[] = { -1.0 / 16, 9.0 / 16, 1.0, 9.0 / 16, -1.0 / 16 };
+    static const size_t order = ARRAY_ELEMS(factors) / 2;
+    FWThreadData *td = arg;
+
+    const ptrdiff_t ss0 = td->src->stride[0];
+    const ptrdiff_t ss1 = td->src->stride[1];
+    const double    *sd = td->src->data + (job_idx * 2 - order) * ss0 - order * ss1;
+
+    double          *dd = td->dst->data + job_idx * td->dst->stride[0];
+    const ptrdiff_t ds1 = td->dst->stride[1];
+
+    for (size_t x = 0; x < td->dst->shape[1]; x++) {
+        double val = 0.0;
+
+        for (int i = 0; i < ARRAY_ELEMS(factors); i++)
+            for (int j = 0; j < ARRAY_ELEMS(factors); j++)
+                val += sd[i * ss0 + j * ss1] * factors[i] * factors[j];
+
+        *dd = 0.25 * val;
+        dd += ds1;
+        sd += ss1 * 2;
+    }
+
+    return 0;
+}
+static int transfer_fw_3_transfer(GridTransferContext *ctx,
+                                  NDArray *dst, const NDArray *src)
+{
+    FWThreadData td = { dst, src };
+
+    tp_execute(ctx->tp, ctx->dst.size[0], fw_3_transfer_task, &td);
+
+    return 0;
+}
+
+static const GridTransfer transfer_fw_3 = {
+    .priv_data_size = sizeof(GridTransferLagrange),
+    .transfer       = transfer_fw_3_transfer,
+};
+
 static const GridTransfer *transfer_ops[] = {
     [GRID_TRANSFER_LAGRANGE_1] = &transfer_lagrange,
     [GRID_TRANSFER_LAGRANGE_3] = &transfer_lagrange,
     [GRID_TRANSFER_FW_1]       = &transfer_fw_1,
+    [GRID_TRANSFER_FW_3]       = &transfer_fw_3,
 };
 
 int mg2di_gt_transfer(GridTransferContext *ctx,
-- 
cgit v1.2.3