egs: add higher-order finite difference operatorsfd8

author: Anton Khirnov <anton@khirnov.net> 2019-04-09 09:45:16 +0200
committer: Anton Khirnov <anton@khirnov.net> 2020-01-26 20:21:39 +0100
commit: c2740d169b81ae4f113bad26a5462c0d3947cc48 (patch)
tree: 801929156a0a1472780071b514f76dfe6c32fa28
parent: 62c92eea4ec4859fff5002931e2d7d562b3deb5d (diff)
6 files changed, 754 insertions, 6 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index fd6e455..3f0cb46 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -44,6 +44,8 @@
 static const double relax_factors[FD_STENCIL_MAX] = {
     [0] = 1.0 / 5,
     [1] = 1.0 / 7,
+    [2] = 1.0 / 7,
+    [3] = 1.0 / 7,
 };
 
 typedef struct EGSRelaxInternal {
@@ -140,6 +142,22 @@ static const double fd_denoms[][MG2D_DIFF_COEFF_NB] = {
         [MG2D_DIFF_COEFF_02] = 12.0,
         [MG2D_DIFF_COEFF_11] = 144.0,
     },
+    {
+        [MG2D_DIFF_COEFF_00] = 1.0,
+        [MG2D_DIFF_COEFF_10] = 60.0,
+        [MG2D_DIFF_COEFF_01] = 60.0,
+        [MG2D_DIFF_COEFF_20] = 90.0,
+        [MG2D_DIFF_COEFF_02] = 90.0,
+        [MG2D_DIFF_COEFF_11] = 3600.0,
+    },
+    {
+        [MG2D_DIFF_COEFF_00] = 1.0,
+        [MG2D_DIFF_COEFF_10] = 840.0,
+        [MG2D_DIFF_COEFF_01] = 840.0,
+        [MG2D_DIFF_COEFF_20] = 5040.0,
+        [MG2D_DIFF_COEFF_02] = 5040.0,
+        [MG2D_DIFF_COEFF_11] = SQR(840.0),
+    },
 };
 
 static void boundaries_sync(EGSContext *ctx, NDArray *a_dst)
@@ -227,7 +245,9 @@ static void boundaries_apply_reflect(double *dst, const ptrdiff_t dst_stride[2],
 /* FIXME we currently use 1st-order forward differences in all cases,
  * since centered and/or higher-order FDs seem to be unstable
  * this may be suboptimal and should be investigated closer */
-static const double falloff_bnd_fd_factors[][FD_STENCIL_MAX * 2 + 1] = {
+static const double falloff_bnd_fd_factors[][2] = {
+    {  1.0 / 2.0, -1.0 / 2.0, },
+    {  1.0 / 2.0, -1.0 / 2.0, },
     {  1.0 / 2.0, -1.0 / 2.0, },
     {  1.0 / 2.0, -1.0 / 2.0, },
     //{ -1.0, 8.0, 0.0, -8.0, 1.0 },
@@ -248,7 +268,7 @@ static void boundaries_apply_falloff(double *dst, const ptrdiff_t dst_stride[2],
 
             double val = row[-(ptrdiff_t)ctx->fd_stencil * dst_stride[1]] / r * (x / r);
 
-            for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++)
+            for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors[0]); i++)
                 val += dst[(bnd_layer - i) * dst_stride[1] + bnd_idx * dst_stride[0]] * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / ctx->step[0];
 
             *row = -val * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]);
@@ -574,6 +594,201 @@ static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_str
     mat_row[(-2 - 2 * row_stride) * mat_stride] +=   1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
 }
 
+static void fill_mat_s3(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
+                        NDArray **diff_coeffs, ptrdiff_t idx_src)
+{
+    mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
+
+    mat_row[ 3 * mat_stride] +=   1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[ 2 * mat_stride] +=  -9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[ 1 * mat_stride] +=  45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-1 * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-2 * mat_stride] +=   9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-3 * mat_stride] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+
+    mat_row[ 3 * row_stride * mat_stride] +=   1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[ 2 * row_stride * mat_stride] +=  -9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[ 1 * row_stride * mat_stride] +=  45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-1 * row_stride * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-2 * row_stride * mat_stride] +=   9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-3 * row_stride * mat_stride] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+
+    mat_row[ 3 * mat_stride] +=    1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 2 * mat_stride] +=  -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 1 * mat_stride] +=  135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 0 * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-1 * mat_stride] +=  135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-2 * mat_stride] +=  -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-3 * mat_stride] +=    1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+
+    mat_row[ 3 * row_stride * mat_stride] +=    1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 2 * row_stride * mat_stride] +=  -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 1 * row_stride * mat_stride] +=  135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 0 * row_stride * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-1 * row_stride * mat_stride] +=  135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-2 * row_stride * mat_stride] +=  -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-3 * row_stride * mat_stride] +=    1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+
+    mat_row[( 3 + 3 * row_stride) * mat_stride] += (1.0) * (  1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 + 2 * row_stride) * mat_stride] += (1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 + 1 * row_stride) * mat_stride] += (1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 1 * row_stride) * mat_stride] += (1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 2 * row_stride) * mat_stride] += (1.0) * (  9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 3 * row_stride) * mat_stride] += (1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[( 2 + 3 * row_stride) * mat_stride] += (-9.0) * (  1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 + 2 * row_stride) * mat_stride] += (-9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 + 1 * row_stride) * mat_stride] += (-9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 1 * row_stride) * mat_stride] += (-9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 2 * row_stride) * mat_stride] += (-9.0) * (  9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 3 * row_stride) * mat_stride] += (-9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[( 1 + 3 * row_stride) * mat_stride] += (45.0) * (  1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 + 2 * row_stride) * mat_stride] += (45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 + 1 * row_stride) * mat_stride] += (45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 1 * row_stride) * mat_stride] += (45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 2 * row_stride) * mat_stride] += (45.0) * (  9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 3 * row_stride) * mat_stride] += (45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-1 + 3 * row_stride) * mat_stride] += (-45.0) * (  1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 + 2 * row_stride) * mat_stride] += (-45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 + 1 * row_stride) * mat_stride] += (-45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 1 * row_stride) * mat_stride] += (-45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 2 * row_stride) * mat_stride] += (-45.0) * (  9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 3 * row_stride) * mat_stride] += (-45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-2 + 3 * row_stride) * mat_stride] += (9.0) * (  1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 + 2 * row_stride) * mat_stride] += (9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 + 1 * row_stride) * mat_stride] += (9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 1 * row_stride) * mat_stride] += (9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 2 * row_stride) * mat_stride] += (9.0) * (  9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 3 * row_stride) * mat_stride] += (9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-3 + 3 * row_stride) * mat_stride] += (-1.0) * (  1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 + 2 * row_stride) * mat_stride] += (-1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 + 1 * row_stride) * mat_stride] += (-1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 1 * row_stride) * mat_stride] += (-1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 2 * row_stride) * mat_stride] += (-1.0) * (  9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 3 * row_stride) * mat_stride] += (-1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+}
+
+static void fill_mat_s4(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
+                        NDArray **diff_coeffs, ptrdiff_t idx_src)
+{
+    mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
+
+    mat_row[ 4 * mat_stride] +=   -3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[ 3 * mat_stride] +=   32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[ 2 * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[ 1 * mat_stride] +=  672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-1 * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-2 * mat_stride] +=  168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-3 * mat_stride] +=  -32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+    mat_row[-4 * mat_stride] +=    3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+
+    mat_row[ 4 * row_stride * mat_stride] +=   -3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[ 3 * row_stride * mat_stride] +=   32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[ 2 * row_stride * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[ 1 * row_stride * mat_stride] +=  672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-1 * row_stride * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-2 * row_stride * mat_stride] +=  168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-3 * row_stride * mat_stride] +=  -32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+    mat_row[-4 * row_stride * mat_stride] +=    3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+
+    mat_row[ 4 * mat_stride] +=     -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 3 * mat_stride] +=    128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 2 * mat_stride] +=  -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 1 * mat_stride] +=   8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[ 0 * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-1 * mat_stride] +=   8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-2 * mat_stride] +=  -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-3 * mat_stride] +=    128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+    mat_row[-4 * mat_stride] +=     -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+
+    mat_row[ 4 * row_stride * mat_stride] +=     -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 3 * row_stride * mat_stride] +=    128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 2 * row_stride * mat_stride] +=  -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 1 * row_stride * mat_stride] +=   8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[ 0 * row_stride * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-1 * row_stride * mat_stride] +=   8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-2 * row_stride * mat_stride] +=  -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-3 * row_stride * mat_stride] +=    128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+    mat_row[-4 * row_stride * mat_stride] +=     -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+
+    mat_row[( 4 + 4 * row_stride) * mat_stride] += (-3.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 + 3 * row_stride) * mat_stride] += (-3.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 + 2 * row_stride) * mat_stride] += (-3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 + 1 * row_stride) * mat_stride] += (-3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 - 1 * row_stride) * mat_stride] += (-3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 - 2 * row_stride) * mat_stride] += (-3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 - 3 * row_stride) * mat_stride] += (-3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 4 - 4 * row_stride) * mat_stride] += (-3.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[( 3 + 4 * row_stride) * mat_stride] += (32.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 + 3 * row_stride) * mat_stride] += (32.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 + 2 * row_stride) * mat_stride] += (32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 + 1 * row_stride) * mat_stride] += (32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 1 * row_stride) * mat_stride] += (32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 2 * row_stride) * mat_stride] += (32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 3 * row_stride) * mat_stride] += (32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 3 - 4 * row_stride) * mat_stride] += (32.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[( 2 + 4 * row_stride) * mat_stride] += (-168.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 + 3 * row_stride) * mat_stride] += (-168.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 + 2 * row_stride) * mat_stride] += (-168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 + 1 * row_stride) * mat_stride] += (-168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 1 * row_stride) * mat_stride] += (-168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 2 * row_stride) * mat_stride] += (-168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 3 * row_stride) * mat_stride] += (-168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 2 - 4 * row_stride) * mat_stride] += (-168.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[( 1 + 4 * row_stride) * mat_stride] += (672.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 + 3 * row_stride) * mat_stride] += (672.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 + 2 * row_stride) * mat_stride] += (672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 + 1 * row_stride) * mat_stride] += (672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 1 * row_stride) * mat_stride] += (672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 2 * row_stride) * mat_stride] += (672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 3 * row_stride) * mat_stride] += (672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[( 1 - 4 * row_stride) * mat_stride] += (672.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-1 + 4 * row_stride) * mat_stride] += (-672.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 + 3 * row_stride) * mat_stride] += (-672.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 + 2 * row_stride) * mat_stride] += (-672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 + 1 * row_stride) * mat_stride] += (-672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 1 * row_stride) * mat_stride] += (-672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 2 * row_stride) * mat_stride] += (-672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 3 * row_stride) * mat_stride] += (-672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-1 - 4 * row_stride) * mat_stride] += (-672.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-2 + 4 * row_stride) * mat_stride] += (168.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 + 3 * row_stride) * mat_stride] += (168.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 + 2 * row_stride) * mat_stride] += (168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 + 1 * row_stride) * mat_stride] += (168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 1 * row_stride) * mat_stride] += (168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 2 * row_stride) * mat_stride] += (168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 3 * row_stride) * mat_stride] += (168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-2 - 4 * row_stride) * mat_stride] += (168.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-3 + 4 * row_stride) * mat_stride] += (-32.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 + 3 * row_stride) * mat_stride] += (-32.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 + 2 * row_stride) * mat_stride] += (-32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 + 1 * row_stride) * mat_stride] += (-32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 1 * row_stride) * mat_stride] += (-32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 2 * row_stride) * mat_stride] += (-32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 3 * row_stride) * mat_stride] += (-32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-3 - 4 * row_stride) * mat_stride] += (-32.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+    mat_row[(-4 + 4 * row_stride) * mat_stride] += (3.0) * (  -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 + 3 * row_stride) * mat_stride] += (3.0) * (  32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 + 2 * row_stride) * mat_stride] += (3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 + 1 * row_stride) * mat_stride] += (3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 - 1 * row_stride) * mat_stride] += (3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 - 2 * row_stride) * mat_stride] += (3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 - 3 * row_stride) * mat_stride] += (3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+    mat_row[(-4 - 4 * row_stride) * mat_stride] += (3.0) * (   3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+}
+
 static void mat_fill_row(EGSContext *ctx, double *scratch_line,
                          ptrdiff_t idx1, ptrdiff_t idx0)
 {
@@ -707,7 +922,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line,
                         const double r = sqrt(SQR(x) + SQR(y));
                         double val = dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]];
 
-                        for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++) {
+                        for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors[0]); i++) {
                             dst[dir * (bnd_layer - i) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -=
                                 val * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0];
                         }
@@ -1021,6 +1236,8 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
     switch (ctx->fd_stencil) {
     case 1: priv->e.fill_mat = fill_mat_s1; break;
     case 2: priv->e.fill_mat = fill_mat_s2; break;
+    case 3: priv->e.fill_mat = fill_mat_s3; break;
+    case 4: priv->e.fill_mat = fill_mat_s4; break;
     default:
         mg2di_log(&ctx->logger, 0, "Invalid finite difference stencil: %zd\n",
                   ctx->fd_stencil);
diff --git a/mg2d.c b/mg2d.c
index 3255ce7..8905206 100644
--- a/mg2d.c
+++ b/mg2d.c
@@ -860,6 +860,8 @@ static int mg_levels_init(MG2DContext *ctx)
             switch (ctx->fd_stencil) {
             case 1: op_interp = GRID_TRANSFER_LAGRANGE_3; break;
             case 2: op_interp = GRID_TRANSFER_LAGRANGE_5; break;
+            case 3: op_interp = GRID_TRANSFER_LAGRANGE_7; break;
+            case 4: op_interp = GRID_TRANSFER_LAGRANGE_7; break;
             default: return -ENOSYS;
             }
 
diff --git a/residual_calc.asm b/residual_calc.asm
index 5eea31c..471f864 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -32,8 +32,14 @@
 
 SECTION .rodata
 
-const8:  times 8 dq  8.0
-const30: times 8 dq 30.0
+const5:    times 8 dq  5.0
+const8:    times 8 dq  8.0
+const9:    times 8 dq  9.0
+const10:   times 8 dq 10.0
+const13_5: times 8 dq 13.5
+const30:   times 8 dq 30.0
+const45:   times 8 dq 45.0
+const245:  times 8 dq 245.0
 
 SECTION .text
 
@@ -282,3 +288,224 @@ RESIDUAL_CALC 2, 0
 cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
 RESIDUAL_CALC 2, 1
+
+%macro DX3 3
+    movu     %1,  [%3 + ELEM_SIZE * 1]   ; out =  u[x+1]
+    subpd    %1,  [%3 - ELEM_SIZE * 1]   ;       -u[x-1]
+
+    movu     %2,  [%3 + ELEM_SIZE * 2]   ; tmp =  u[x+2]
+    subpd    %2,  [%3 - ELEM_SIZE * 2]   ;       -u[x-2]
+
+    vfmsub132pd %1, %2, [const5]         ; out = 5 * out - tmp
+
+    movu     %2,  [%3 + ELEM_SIZE * 3]   ; tmp =  u[x+3]
+    subpd    %2,  [%3 - ELEM_SIZE * 3]   ;       -u[x-3]
+
+    vfmadd132pd %1, %2, [const9]         ; out = 9 * out + tmp
+%endmacro
+
+%macro RES_ADD_DIFF_X_3 0
+    ; first derivative
+    DX3 m7, m8, uq + offsetq
+    vfmadd231pd m0, m7, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
+
+    ; second derivative
+    movu     m7,  [uq + offsetq + ELEM_SIZE * 1]   ; m7 =  u[x+1]
+    addpd    m7,  [uq + offsetq - ELEM_SIZE * 1]   ;      +u[x-1]
+
+    movu     m8,  [uq + offsetq + ELEM_SIZE * 2]   ; m8 =  u[x+2]
+    addpd    m8,  [uq + offsetq - ELEM_SIZE * 2]   ;      +u[x-2]
+
+    vfmsub132pd m7, m8, [const10]                  ; m7 = 10 * m7 - m8
+
+    movu     m8,  [uq + offsetq + ELEM_SIZE * 3]   ; m8 =  u[x+3]
+    addpd    m8,  [uq + offsetq - ELEM_SIZE * 3]   ;      -u[x-3]
+
+    vfmadd132pd  m7, m8, [const13_5]               ; m7 = 13.5 * m7 + m8
+    vfnmadd231pd m7, m6, [const245]                ; m7 = m7 - 245 * u[0]
+    vfmadd231pd  m0, m7, [diff_coeffs20q + offsetq]; res += d_x u * diff_coeffs20
+%endmacro
+
+%macro RES_ADD_DIFF_Y_3 0
+    ; first derivative
+    lea u_upq,   [uq + strideq]
+    mov u_downq, uq
+    sub u_downq, strideq
+    movu     m7,  [u_upq   + offsetq]              ; m7 =  u[y+1]
+    subpd    m7,  [u_downq + offsetq]              ;      -u[y-1]
+
+    add u_upq,   strideq
+    sub u_downq, strideq
+    movu     m8,  [u_upq   + offsetq]              ; m8 =  u[y+2]
+    subpd    m8,  [u_downq + offsetq]              ;      -u[y-2]
+
+    vfmsub132pd m7, m8, [const5]                   ; m7 = 5 * m7 - m8
+
+    add u_upq,   strideq
+    sub u_downq, strideq
+    movu     m8,  [u_upq   + offsetq]              ; m8 =  u[y+3]
+    subpd    m8,  [u_downq + offsetq]              ;      -u[y-3]
+
+    vfmadd132pd m7, m8, [const9]                   ; m7 = 9 * m7 + m8
+    vfmadd231pd m0, m7, [diff_coeffs01q + offsetq] ; res += d_x u * diff_coeffs01
+
+    ; second derivative
+    lea u_upq,   [uq + strideq]
+    mov u_downq, uq
+    sub u_downq, strideq
+    movu     m7,  [u_upq   + offsetq]              ; m7 =  u[y+1]
+    addpd    m7,  [u_downq + offsetq]              ;      +u[y-1]
+
+    add u_upq,   strideq
+    sub u_downq, strideq
+    movu     m8,  [u_upq   + offsetq]              ; m8 =  u[y+2]
+    addpd    m8,  [u_downq + offsetq]              ;      +u[y-2]
+
+    vfmsub132pd m7, m8, [const10]                  ; m7 = 10 * m7 - m8
+
+    add u_upq,   strideq
+    sub u_downq, strideq
+    movu     m8,  [u_upq   + offsetq]              ; m8 =  u[x+3]
+    addpd    m8,  [u_downq + offsetq]              ;      -u[x-3]
+
+    vfmadd132pd  m7, m8, [const13_5]               ; m7 = 13.5 * m7 + m8
+    vfnmadd231pd m7, m6, [const245]                ; m7 = m7 - 245 * u[0]
+    vfmadd231pd  m0, m7, [diff_coeffs02q + offsetq]; res += d_y u * diff_coeffs02
+%endmacro
+
+%macro RES_ADD_DIFF_MIXED_3 0
+    lea u_upq, [uq + 2 * strideq]
+    add u_upq, strideq
+    DX3 m6, m7, u_upq + offsetq                 ; m6 = dx u[y + 3]
+
+    sub u_upq, strideq
+    DX3 m7, m8, u_upq + offsetq                 ; m7 = dx u[y + 2]
+    vfnmadd231pd m6, m7, [const9]               ; m6 = m6 - 9 * m7
+
+    sub u_upq, strideq
+    DX3 m7, m8, u_upq + offsetq                 ; m7 = dx u[y + 1]
+    vfmadd231pd m6, m7, [const45]               ; m6 = m6 + 45 * m7
+
+    sub u_upq, strideq
+    sub u_upq, strideq
+    DX3 m7, m8, u_upq + offsetq                 ; m7 = dx u[y - 1]
+    vfnmadd231pd m6, m7, [const45]              ; m6 = m6 - 45 * m7
+
+    sub u_upq, strideq
+    DX3 m7, m8, u_upq + offsetq                 ; m7 = dx u[y - 2]
+    vfmadd231pd m6, m7, [const9]               ; m6 = m6 + 9 * m7
+
+    sub u_upq, strideq
+    DX3 m7, m8, u_upq + offsetq                 ; m7 = dx u[y - 3]
+    subpd m6, m7
+    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]
+%endmacro
+
+%macro RESIDUAL_CALC_3 2
+%if %2
+    vpermq m2, m1, 0
+%endif
+    vpermq m1, m0, 0
+
+    ; compute the mask for absolute value
+    pcmpeqq m13, m13
+    psrlq   m13, 1
+    movu    m12, [res_maxq]
+
+    ; load pointers to the equation coefficients
+    %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
+    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
+    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
+    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
+    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
+    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
+    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
+
+    ; setup the data pointers and the loop counter
+    shl strideq, 3
+    shl linesizeq, 3
+    add dstq,           linesizeq
+    add uq,             linesizeq
+    add rhsq,           linesizeq
+    add diff_coeffs00q, linesizeq
+    add diff_coeffs01q, linesizeq
+    add diff_coeffs10q, linesizeq
+    add diff_coeffs11q, linesizeq
+    add diff_coeffs02q, linesizeq
+    add diff_coeffs20q, linesizeq
+    neg linesizeq
+    ; from now on, the register that held linesize is used as the offset into data arrays
+    %define offsetq linesizeq
+.loop:
+    xorpd m0, m0
+    subpd m0, [rhsq + offsetq]                          ; res  = -rhs
+
+    ; plain value
+    movu        m6, [uq + offsetq]                      ; m6 = u[x]
+    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]      ; res += u * diff_coeffs00
+%if %2
+    mulpd       m3, m6, m2
+%endif
+
+    RES_ADD_DIFF_X_3
+    RES_ADD_DIFF_Y_3
+    RES_ADD_DIFF_MIXED_3
+
+    andpd m6, m0, m13                                   ; m6 = abs(res)
+    mulpd m0, m1
+%if %2
+    addpd m0, m3
+%endif
+
+    ; store the result
+    add offsetq, mmsize
+    jg .store_partial
+
+    ; store full block
+    movu [dstq + offsetq - mmsize], m0
+    maxpd m12, m6
+    js .loop
+    jmp .finish
+
+.store_partial:
+    sub offsetq, ELEM_SIZE
+    jz .store3
+    sub offsetq, ELEM_SIZE
+    jz .store2
+
+.store1:
+    ; offsetq is now mmsize-2 after the write position
+    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+
+    vpermq m6, m6, 0
+    maxpd m12, m6
+
+    jmp .finish
+.store2:
+    ; offsetq is now mmsize-2 after the write position
+    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+    mova  xm6, xm6
+    maxpd m12, m6
+    jmp .finish
+.store3:
+    ; offsetq is now mmsize-1 after the write position
+    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
+    vextractf128 xm0, m0, 1
+    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0
+
+    vpermq m6, m6, 10100100b
+    maxpd m12, m6
+
+.finish:
+    movu [res_maxq], m12
+    RET
+%endmacro
+
+INIT_YMM fma3
+cglobal residual_calc_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
+RESIDUAL_CALC_3 3, 0
+
+cglobal residual_add_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
+RESIDUAL_CALC_3 3, 1
diff --git a/residual_calc.c b/residual_calc.c
index 6d58c10..d92986b 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -87,8 +87,62 @@ void mg2di_residual_add_line_s2_fma3(size_t linesize, double *dst, double *dst_m
                                      ptrdiff_t stride, const double *u, const double *rhs,
                                      const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
                                      double res_mult, double u_mult);
+void mg2di_residual_calc_line_s3_fma3(size_t linesize, double *dst, double *dst_max,
+                                      ptrdiff_t stride, const double *u, const double *rhs,
+                                      const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                                      double res_mult);
+void mg2di_residual_add_line_s3_fma3(size_t linesize, double *dst, double *dst_max,
+                                     ptrdiff_t stride, const double *u, const double *rhs,
+                                     const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                                     double res_mult, double u_mult);
 #endif
 
+static const double fd_coeffs[][2][FD_STENCIL_MAX * 2 + 1] = {
+    {
+        { -1.0,  0.0, 1.0 },
+        { 1.0,  -2.0, 1.0 },
+    },
+    {
+        {  1.0, -8.0,   0.0,  8.0, -1.0 },
+        { -1.0, 16.0, -30.0, 16.0, -1.0 },
+    },
+    {
+        { -1.0,   9.0, -45.0,    0.0,  45.0,  -9.0, 1.0 },
+        {  1.0, -13.5, 135.0, -245.0, 135.0, -13.5, 1.0 },
+    },
+    {
+        {  3.0, -32.0,   168.0, -672.0,      0.0,  672.0,  -168.0,  32.0, -3.0 },
+        { -9.0, 128.0, -1008.0, 8064.0, -14350.0, 8064.0, -1008.0, 128.0, -9.0 },
+    },
+};
+
+#define STENCIL 1
+#define ADD 0
+#include "residual_calc_template.c"
+#undef ADD
+#undef STENCIL
+
+#define STENCIL 2
+#define ADD 0
+#include "residual_calc_template.c"
+#undef ADD
+#undef STENCIL
+
+#define STENCIL 3
+#define ADD 0
+#include "residual_calc_template.c"
+#undef ADD
+#undef STENCIL
+
+#define STENCIL 4
+#define ADD 0
+#include "residual_calc_template.c"
+#undef ADD
+#define ADD 1
+#include "residual_calc_template.c"
+#undef ADD
+#undef STENCIL
+
 static void
 derivatives_calc_s1(double *dst, const double *u, ptrdiff_t stride)
 {
@@ -149,6 +203,81 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride)
                                -1.0 * valxm2yp2 +  8.0 * valxm2yp1 -  8.0 * valxm2ym1 + 1.0 * valxm2ym2);
 }
 
+static void
+derivatives_calc_s3(double *dst, const double *u, ptrdiff_t stride)
+{
+    const double val    = u[0];
+
+    const double valxp1 = u[ 1];
+    const double valxp2 = u[ 2];
+    const double valxp3 = u[ 3];
+    const double valxm1 = u[-1];
+    const double valxm2 = u[-2];
+    const double valxm3 = u[-3];
+    const double valyp1 = u[ 1 * stride];
+    const double valyp2 = u[ 2 * stride];
+    const double valyp3 = u[ 3 * stride];
+    const double valym1 = u[-1 * stride];
+    const double valym2 = u[-2 * stride];
+    const double valym3 = u[-3 * stride];
+
+    const double valxp3yp1 = u[ 3 + 1 * stride];
+    const double valxp3yp2 = u[ 3 + 2 * stride];
+    const double valxp3yp3 = u[ 3 + 3 * stride];
+    const double valxp3ym1 = u[ 3 - 1 * stride];
+    const double valxp3ym2 = u[ 3 - 2 * stride];
+    const double valxp3ym3 = u[ 3 - 3 * stride];
+
+    const double valxp2yp1 = u[ 2 + 1 * stride];
+    const double valxp2yp2 = u[ 2 + 2 * stride];
+    const double valxp2yp3 = u[ 2 + 3 * stride];
+    const double valxp2ym1 = u[ 2 - 1 * stride];
+    const double valxp2ym2 = u[ 2 - 2 * stride];
+    const double valxp2ym3 = u[ 2 - 3 * stride];
+
+    const double valxp1yp1 = u[ 1 + 1 * stride];
+    const double valxp1yp2 = u[ 1 + 2 * stride];
+    const double valxp1yp3 = u[ 1 + 3 * stride];
+    const double valxp1ym1 = u[ 1 - 1 * stride];
+    const double valxp1ym2 = u[ 1 - 2 * stride];
+    const double valxp1ym3 = u[ 1 - 3 * stride];
+
+    const double valxm1yp1 = u[-1 + 1 * stride];
+    const double valxm1yp2 = u[-1 + 2 * stride];
+    const double valxm1yp3 = u[-1 + 3 * stride];
+    const double valxm1ym1 = u[-1 - 1 * stride];
+    const double valxm1ym2 = u[-1 - 2 * stride];
+    const double valxm1ym3 = u[-1 - 3 * stride];
+
+    const double valxm2yp1 = u[-2 + 1 * stride];
+    const double valxm2yp2 = u[-2 + 2 * stride];
+    const double valxm2yp3 = u[-2 + 3 * stride];
+    const double valxm2ym1 = u[-2 - 1 * stride];
+    const double valxm2ym2 = u[-2 - 2 * stride];
+    const double valxm2ym3 = u[-2 - 3 * stride];
+
+    const double valxm3yp1 = u[-3 + 1 * stride];
+    const double valxm3yp2 = u[-3 + 2 * stride];
+    const double valxm3yp3 = u[-3 + 3 * stride];
+    const double valxm3ym1 = u[-3 - 1 * stride];
+    const double valxm3ym2 = u[-3 - 2 * stride];
+    const double valxm3ym3 = u[-3 - 3 * stride];
+
+    dst[MG2D_DIFF_COEFF_00] = val;
+    dst[MG2D_DIFF_COEFF_10] = (1.0 * valxp3 - 9.0 * valxp2 + 45.0 * valxp1 - 45.0 * valxm1 + 9.0 * valxm2 - 1.0 * valxm3);
+    dst[MG2D_DIFF_COEFF_01] = (1.0 * valyp3 - 9.0 * valyp2 + 45.0 * valyp1 - 45.0 * valym1 + 9.0 * valym2 - 1.0 * valym3);
+
+    dst[MG2D_DIFF_COEFF_20] = (1.0 * valxp3 - 13.5 * valxp2 + 135.0 * valxp1 - 245.0 * val + 135.0 * valxm1 - 13.5 * valxm2 + 1.0 * valxm3);
+    dst[MG2D_DIFF_COEFF_02] = (1.0 * valyp3 - 13.5 * valyp2 + 135.0 * valyp1 - 245.0 * val + 135.0 * valym1 - 13.5 * valym2 + 1.0 * valym3);
+
+    dst[MG2D_DIFF_COEFF_11] = (   1.0 * (1.0 * valxp3yp3 - 9.0 * valxp3yp2 + 45.0 * valxp3yp1 - 45.0 * valxp3ym1 + 9.0 * valxp3ym2 - 1.0 * valxp3ym3)
+                               -  9.0 * (1.0 * valxp2yp3 - 9.0 * valxp2yp2 + 45.0 * valxp2yp1 - 45.0 * valxp2ym1 + 9.0 * valxp2ym2 - 1.0 * valxp2ym3)
+                               + 45.0 * (1.0 * valxp1yp3 - 9.0 * valxp1yp2 + 45.0 * valxp1yp1 - 45.0 * valxp1ym1 + 9.0 * valxp1ym2 - 1.0 * valxp1ym3)
+                               - 45.0 * (1.0 * valxm1yp3 - 9.0 * valxm1yp2 + 45.0 * valxm1yp1 - 45.0 * valxm1ym1 + 9.0 * valxm1ym2 - 1.0 * valxm1ym3)
+                               +  9.0 * (1.0 * valxm2yp3 - 9.0 * valxm2yp2 + 45.0 * valxm2yp1 - 45.0 * valxm2ym1 + 9.0 * valxm2ym2 - 1.0 * valxm2ym3)
+                               -  1.0 * (1.0 * valxm3yp3 - 9.0 * valxm3yp2 + 45.0 * valxm3yp1 - 45.0 * valxm3ym1 + 9.0 * valxm3ym2 - 1.0 * valxm3ym3));
+}
+
 static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
                                     ptrdiff_t stride, const double *u, const double *rhs,
                                     const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
@@ -245,6 +374,54 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max
     *dst_max = MAX(*dst_max, res_max);
 }
 
+static void residual_calc_line_s3_c(size_t linesize, double *dst, double *dst_max,
+                                    ptrdiff_t stride, const double *u, const double *rhs,
+                                    const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                                    double res_mult)
+{
+    double res_max = 0.0, res_abs;
+    for (size_t i = 0; i < linesize; i++) {
+        double u_vals[MG2D_DIFF_COEFF_NB];
+        double res;
+
+        derivatives_calc_s3(u_vals, u + i, stride);
+
+        res = -rhs[i];
+        for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
+            res += u_vals[j] * diff_coeffs[j][i];
+        dst[i] = res_mult * res;
+
+        res_abs = fabs(res);
+        res_max = MAX(res_max, res_abs);
+    }
+
+    *dst_max = MAX(*dst_max, res_max);
+}
+
+static void residual_add_line_s3_c(size_t linesize, double *dst, double *dst_max,
+                                    ptrdiff_t stride, const double *u, const double *rhs,
+                                    const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                                    double res_mult, double u_mult)
+{
+    double res_max = 0.0, res_abs;
+    for (size_t i = 0; i < linesize; i++) {
+        double u_vals[MG2D_DIFF_COEFF_NB];
+        double res;
+
+        derivatives_calc_s3(u_vals, u + i, stride);
+
+        res = -rhs[i];
+        for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
+            res += u_vals[j] * diff_coeffs[j][i];
+        dst[i] = u_mult * u[i] + res_mult * res;
+
+        res_abs = fabs(res);
+        res_max = MAX(res_max, res_abs);
+    }
+
+    *dst_max = MAX(*dst_max, res_max);
+}
+
 static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
 {
     ResidualCalcInternal *priv = arg;
@@ -340,6 +517,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
     case 1:
         priv->residual_calc_line = residual_calc_line_s1_c;
         priv->residual_add_line  = residual_add_line_s1_c;
+        //priv->residual_calc_line = residual_calc_line_1;
 #if HAVE_EXTERNAL_ASM
         if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
             priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3;
@@ -351,6 +529,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
     case 2:
         priv->residual_calc_line = residual_calc_line_s2_c;
         priv->residual_add_line  = residual_add_line_s2_c;
+        //priv->residual_calc_line = residual_calc_line_2;
 #if HAVE_EXTERNAL_ASM
         if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
             priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3;
@@ -359,6 +538,23 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
         }
 #endif
         break;
+    case 3:
+        priv->residual_calc_line = residual_calc_line_s3_c;
+        priv->residual_add_line  = residual_add_line_s3_c;
+#if HAVE_EXTERNAL_ASM
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
+            priv->residual_calc_line = mg2di_residual_calc_line_s3_fma3;
+            priv->residual_add_line  = mg2di_residual_add_line_s3_fma3;
+            priv->calc_blocksize     = 4;
+        }
+#endif
+        break;
+    case 4:
+        priv->residual_calc_line = residual_line_calc_4;
+        priv->residual_add_line  = residual_line_add_4;
+        break;
+    default:
+        return -ENOSYS;
     }
 
     priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize;
diff --git a/residual_calc_template.c b/residual_calc_template.c
new file mode 100644
index 0000000..779ac24
--- /dev/null
+++ b/residual_calc_template.c
@@ -0,0 +1,105 @@
+/*
+ * Residual calculation template
+ * Copyright 2019 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if ADD
+# define ACT add
+#else
+#define  ACT calc
+#endif
+
+#define JOIN3(a, b, c) a ## _ ## b ## _ ## c
+#define FUNC3(name, act, stencil) JOIN3(name, act, stencil)
+#define FUNC(name) FUNC3(name, ACT, STENCIL)
+
+static void
+FUNC(derivatives_calc)(double *dst, const double *u, ptrdiff_t stride)
+{
+    dst[MG2D_DIFF_COEFF_00] = u[0];
+
+    {
+        double val = 0.0;
+        for (int i = -STENCIL; i <= STENCIL; i++)
+            val += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[i];
+        dst[MG2D_DIFF_COEFF_10] = val;
+    }
+    {
+        double val = 0.0;
+        for (int i = -STENCIL; i <= STENCIL; i++)
+            val += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[i * stride];
+        dst[MG2D_DIFF_COEFF_01] = val;
+    }
+    {
+        double val = 0.0;
+        for (int i = -STENCIL; i <= STENCIL; i++)
+            val += fd_coeffs[STENCIL - 1][1][i + STENCIL] * u[i];
+        dst[MG2D_DIFF_COEFF_20] = val;
+    }
+    {
+        double val = 0.0;
+        for (int i = -STENCIL; i <= STENCIL; i++)
+            val += fd_coeffs[STENCIL - 1][1][i + STENCIL] * u[i * stride];
+        dst[MG2D_DIFF_COEFF_02] = val;
+    }
+    {
+        double val = 0.0;
+        for (int j = -STENCIL; j <= STENCIL; j++) {
+            double tmp = 0.0;
+            for (int i = -STENCIL; i <= STENCIL; i++)
+                tmp += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[j * stride + i];
+            val += fd_coeffs[STENCIL - 1][0][j + STENCIL] * tmp;
+        }
+        dst[MG2D_DIFF_COEFF_11] = val;
+    }
+}
+static void
+FUNC(residual_line)(size_t linesize, double *dst, double *dst_max,
+                    ptrdiff_t stride, const double *u, const double *rhs,
+                    const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                    double res_mult
+#if ADD
+                    , double u_mult
+#endif
+                        )
+{
+    double res_max = 0.0, res_abs;
+    for (size_t i = 0; i < linesize; i++) {
+        double u_vals[MG2D_DIFF_COEFF_NB];
+        double res;
+
+        FUNC(derivatives_calc)(u_vals, u + i, stride);
+
+        res = -rhs[i];
+        for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
+            res += u_vals[j] * diff_coeffs[j][i];
+        dst[i] = res_mult * res
+#if ADD
+            + u_mult * u[i]
+#endif
+            ;
+
+        res_abs = fabs(res);
+        res_max = MAX(res_max, res_abs);
+    }
+
+    *dst_max = MAX(*dst_max, res_max);
+}
+
+#undef FUNC
+#undef FUNC2
+#undef JOIN2
+#undef ACT
diff --git a/transfer.c b/transfer.c
index a791467..01b9efc 100644
--- a/transfer.c
+++ b/transfer.c
@@ -129,9 +129,10 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim)
     for (ptrdiff_t idx_dst = 0; idx_dst < ctx->dst.size[dim]; idx_dst++) {
         const ptrdiff_t idx_src = (ptrdiff_t)(scale * (idx_dst + ctx->dst.start[dim])) - ctx->src.start[dim] - (priv->stencil / 2 - 1);
         const double coord_dst  = (idx_dst + ctx->dst.start[dim]) * step_dst;
-
         double *fact = priv->fact[dim] + priv->stencil * idx_dst;
 
+        //idx_src = MAX(MIN(idx_src, ctx->src.size[dim] - priv->stencil - 1), 0);
+
         for (int i = 0; i < priv->stencil; i++)
             coord_src[i] = (idx_src + ctx->src.start[dim] + i) * step_src;
author	Anton Khirnov <anton@khirnov.net>	2019-04-09 09:45:16 +0200
committer	Anton Khirnov <anton@khirnov.net>	2020-01-26 20:21:39 +0100
commit	c2740d169b81ae4f113bad26a5462c0d3947cc48 (patch)
tree	801929156a0a1472780071b514f76dfe6c32fa28
parent	62c92eea4ec4859fff5002931e2d7d562b3deb5d (diff)