summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2019-04-09 09:45:16 +0200
committerAnton Khirnov <anton@khirnov.net>2019-06-14 11:05:39 +0200
commit373dbd15f216c2ce97396af37ea3f7cc5d39455c (patch)
tree9fec3609181410c7584bced57b118c23499a90eb
parent33d18edb8db8cad947b7a06206d4000ad0bfa619 (diff)
egs: add higher-order finite difference operatorstmp
-rw-r--r--ell_grid_solve.c225
-rw-r--r--mg2d.c4
-rw-r--r--residual_calc.c145
-rw-r--r--residual_calc_template.c90
-rw-r--r--transfer.c8
5 files changed, 464 insertions, 8 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index dfc48c6..e334f58 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -111,6 +111,22 @@ static const double fd_denoms[][MG2D_DIFF_COEFF_NB] = {
[MG2D_DIFF_COEFF_02] = 12.0,
[MG2D_DIFF_COEFF_11] = 144.0,
},
+ {
+ [MG2D_DIFF_COEFF_00] = 1.0,
+ [MG2D_DIFF_COEFF_10] = 60.0,
+ [MG2D_DIFF_COEFF_01] = 60.0,
+ [MG2D_DIFF_COEFF_20] = 90.0,
+ [MG2D_DIFF_COEFF_02] = 90.0,
+ [MG2D_DIFF_COEFF_11] = 3600.0,
+ },
+ {
+ [MG2D_DIFF_COEFF_00] = 1.0,
+ [MG2D_DIFF_COEFF_10] = 840.0,
+ [MG2D_DIFF_COEFF_01] = 840.0,
+ [MG2D_DIFF_COEFF_20] = 5040.0,
+ [MG2D_DIFF_COEFF_02] = 5040.0,
+ [MG2D_DIFF_COEFF_11] = SQR(840.0),
+ },
};
static void residual_calc(EGSContext *ctx)
@@ -165,7 +181,9 @@ static void boundaries_apply_reflect(double *dst, const ptrdiff_t dst_stride[2],
/* FIXME we currently use 1st-order forward differences in all cases,
* since centered and/or higher-order FDs seem to be unstable
* this may be suboptimal and should be investigated closer */
-static const double falloff_bnd_fd_factors[][FD_STENCIL_MAX * 2 + 1] = {
+static const double falloff_bnd_fd_factors[][2] = {
+ { 1.0 / 2.0, -1.0 / 2.0, },
+ { 1.0 / 2.0, -1.0 / 2.0, },
{ 1.0 / 2.0, -1.0 / 2.0, },
{ 1.0 / 2.0, -1.0 / 2.0, },
//{ -1.0, 8.0, 0.0, -8.0, 1.0 },
@@ -185,7 +203,7 @@ static void boundaries_apply_falloff(double *dst, const ptrdiff_t dst_stride[2],
double val = row[-(ptrdiff_t)ctx->fd_stencil * dst_stride[1]] / r * (x / r);
- for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++)
+ for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors); i++)
val += dst[(bnd_layer - i) * dst_stride[1] + bnd_idx * dst_stride[0]] * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / ctx->step[0];
*row = -val * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]);
@@ -405,6 +423,203 @@ static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_str
mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
}
+static void fill_mat_s3(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
+ NDArray **diff_coeffs, double *fd_factors,
+ ptrdiff_t idx_src)
+{
+ mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
+
+ mat_row[ 3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 2 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 1 * mat_stride] += 45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-1 * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-2 * mat_stride] += 9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-3 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+
+ mat_row[ 3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 2 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 1 * row_stride * mat_stride] += 45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-1 * row_stride * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-2 * row_stride * mat_stride] += 9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-3 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+
+ mat_row[ 3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 2 * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 1 * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 0 * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-1 * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-2 * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+
+ mat_row[ 3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 2 * row_stride * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 1 * row_stride * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 0 * row_stride * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-1 * row_stride * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-2 * row_stride * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+
+ mat_row[( 3 + 3 * row_stride) * mat_stride] += (1.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 + 2 * row_stride) * mat_stride] += (1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 + 1 * row_stride) * mat_stride] += (1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 1 * row_stride) * mat_stride] += (1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 2 * row_stride) * mat_stride] += (1.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 3 * row_stride) * mat_stride] += (1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[( 2 + 3 * row_stride) * mat_stride] += (-9.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 + 2 * row_stride) * mat_stride] += (-9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 + 1 * row_stride) * mat_stride] += (-9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 1 * row_stride) * mat_stride] += (-9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 2 * row_stride) * mat_stride] += (-9.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 3 * row_stride) * mat_stride] += (-9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[( 1 + 3 * row_stride) * mat_stride] += (45.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 2 * row_stride) * mat_stride] += (45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 1 * row_stride) * mat_stride] += (45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 1 * row_stride) * mat_stride] += (45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 2 * row_stride) * mat_stride] += (45.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 3 * row_stride) * mat_stride] += (45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-1 + 3 * row_stride) * mat_stride] += (-45.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 + 2 * row_stride) * mat_stride] += (-45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 + 1 * row_stride) * mat_stride] += (-45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 1 * row_stride) * mat_stride] += (-45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 2 * row_stride) * mat_stride] += (-45.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 3 * row_stride) * mat_stride] += (-45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-2 + 3 * row_stride) * mat_stride] += (9.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 + 2 * row_stride) * mat_stride] += (9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 + 1 * row_stride) * mat_stride] += (9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 1 * row_stride) * mat_stride] += (9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 2 * row_stride) * mat_stride] += (9.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 3 * row_stride) * mat_stride] += (9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-3 + 3 * row_stride) * mat_stride] += (-1.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 + 2 * row_stride) * mat_stride] += (-1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 + 1 * row_stride) * mat_stride] += (-1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 1 * row_stride) * mat_stride] += (-1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 2 * row_stride) * mat_stride] += (-1.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 3 * row_stride) * mat_stride] += (-1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+}
+
+static void fill_mat_s4(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
+ NDArray **diff_coeffs, double *fd_factors,
+ ptrdiff_t idx_src)
+{
+ mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
+
+ mat_row[ 4 * mat_stride] += -3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 3 * mat_stride] += 32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 2 * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 1 * mat_stride] += 672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-1 * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-2 * mat_stride] += 168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-3 * mat_stride] += -32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[-4 * mat_stride] += 3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+
+ mat_row[ 4 * row_stride * mat_stride] += -3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 3 * row_stride * mat_stride] += 32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 2 * row_stride * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 1 * row_stride * mat_stride] += 672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-1 * row_stride * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-2 * row_stride * mat_stride] += 168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-3 * row_stride * mat_stride] += -32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[-4 * row_stride * mat_stride] += 3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+
+ mat_row[ 4 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 3 * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 2 * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 1 * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 0 * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-1 * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-2 * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-3 * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[-4 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+
+ mat_row[ 4 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 3 * row_stride * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 2 * row_stride * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 1 * row_stride * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 0 * row_stride * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-1 * row_stride * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-2 * row_stride * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-3 * row_stride * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[-4 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+
+ mat_row[( 4 + 4 * row_stride) * mat_stride] += (-3.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 + 3 * row_stride) * mat_stride] += (-3.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 + 2 * row_stride) * mat_stride] += (-3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 + 1 * row_stride) * mat_stride] += (-3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 - 1 * row_stride) * mat_stride] += (-3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 - 2 * row_stride) * mat_stride] += (-3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 - 3 * row_stride) * mat_stride] += (-3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 4 - 4 * row_stride) * mat_stride] += (-3.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[( 3 + 4 * row_stride) * mat_stride] += (32.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 + 3 * row_stride) * mat_stride] += (32.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 + 2 * row_stride) * mat_stride] += (32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 + 1 * row_stride) * mat_stride] += (32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 1 * row_stride) * mat_stride] += (32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 2 * row_stride) * mat_stride] += (32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 3 * row_stride) * mat_stride] += (32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 3 - 4 * row_stride) * mat_stride] += (32.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[( 2 + 4 * row_stride) * mat_stride] += (-168.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 + 3 * row_stride) * mat_stride] += (-168.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 + 2 * row_stride) * mat_stride] += (-168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 + 1 * row_stride) * mat_stride] += (-168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 1 * row_stride) * mat_stride] += (-168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 2 * row_stride) * mat_stride] += (-168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 3 * row_stride) * mat_stride] += (-168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 2 - 4 * row_stride) * mat_stride] += (-168.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[( 1 + 4 * row_stride) * mat_stride] += (672.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 3 * row_stride) * mat_stride] += (672.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 2 * row_stride) * mat_stride] += (672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 1 * row_stride) * mat_stride] += (672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 1 * row_stride) * mat_stride] += (672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 2 * row_stride) * mat_stride] += (672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 3 * row_stride) * mat_stride] += (672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 - 4 * row_stride) * mat_stride] += (672.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-1 + 4 * row_stride) * mat_stride] += (-672.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 + 3 * row_stride) * mat_stride] += (-672.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 + 2 * row_stride) * mat_stride] += (-672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 + 1 * row_stride) * mat_stride] += (-672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 1 * row_stride) * mat_stride] += (-672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 2 * row_stride) * mat_stride] += (-672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 3 * row_stride) * mat_stride] += (-672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-1 - 4 * row_stride) * mat_stride] += (-672.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-2 + 4 * row_stride) * mat_stride] += (168.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 + 3 * row_stride) * mat_stride] += (168.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 + 2 * row_stride) * mat_stride] += (168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 + 1 * row_stride) * mat_stride] += (168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 1 * row_stride) * mat_stride] += (168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 2 * row_stride) * mat_stride] += (168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 3 * row_stride) * mat_stride] += (168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-2 - 4 * row_stride) * mat_stride] += (168.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-3 + 4 * row_stride) * mat_stride] += (-32.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 + 3 * row_stride) * mat_stride] += (-32.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 + 2 * row_stride) * mat_stride] += (-32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 + 1 * row_stride) * mat_stride] += (-32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 1 * row_stride) * mat_stride] += (-32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 2 * row_stride) * mat_stride] += (-32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 3 * row_stride) * mat_stride] += (-32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-3 - 4 * row_stride) * mat_stride] += (-32.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[(-4 + 4 * row_stride) * mat_stride] += (3.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 + 3 * row_stride) * mat_stride] += (3.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 + 2 * row_stride) * mat_stride] += (3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 + 1 * row_stride) * mat_stride] += (3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 - 1 * row_stride) * mat_stride] += (3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 - 2 * row_stride) * mat_stride] += (3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 - 3 * row_stride) * mat_stride] += (3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[(-4 - 4 * row_stride) * mat_stride] += (3.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+}
+
static void mat_fill_row(EGSContext *ctx, double *scratch_line,
ptrdiff_t idx1, ptrdiff_t idx0)
{
@@ -538,7 +753,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line,
const double r = sqrt(SQR(x) + SQR(y));
double val = dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]];
- for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++) {
+ for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors); i++) {
dst[dir * (bnd_layer - i) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -=
val * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0];
}
@@ -732,6 +947,8 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
switch (ctx->fd_stencil) {
case 1: e->fill_mat = fill_mat_s1; break;
case 2: e->fill_mat = fill_mat_s2; break;
+ case 3: e->fill_mat = fill_mat_s3; break;
+ case 4: e->fill_mat = fill_mat_s4; break;
default:
mg2di_log(&ctx->logger, 0, "Invalid finite difference stencil: %zd\n",
ctx->fd_stencil);
@@ -873,7 +1090,7 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
if (!e->mat || !e->mat_f || !e->rhs || !e->rhs_mat || !e->rhs_factor || !e->x || !e->ipiv)
return -ENOMEM;
- ret = mg2di_bicgstab_context_alloc(&e->bicgstab, e->N, 64);
+ ret = mg2di_bicgstab_context_alloc(&e->bicgstab, e->N, 24);
if (ret < 0)
return ret;
diff --git a/mg2d.c b/mg2d.c
index 11a8158..b4a9124 100644
--- a/mg2d.c
+++ b/mg2d.c
@@ -224,7 +224,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level)
finish:
res_new = level->solver->residual_max;
if (!isfinite(res_new) ||
- (res_new > 1e2 * DBL_EPSILON && res_old / res_new <= 1e-1)) {
+ (res_new > 1e2 * DBL_EPSILON && res_old / res_new <= 1e-2)) {
mg2di_log(&ctx->priv->logger, MG2D_LOG_ERROR,
"The relaxation step at level %d has diverged: %g -> %g\n",
level->depth, res_old, res_new);
@@ -399,6 +399,8 @@ static int mg_levels_init(MG2DContext *ctx)
switch (ctx->fd_stencil) {
case 1: op_interp = GRID_TRANSFER_LAGRANGE_3; break;
case 2: op_interp = GRID_TRANSFER_LAGRANGE_5; break;
+ case 3: op_interp = GRID_TRANSFER_LAGRANGE_7; break;
+ case 4: op_interp = GRID_TRANSFER_LAGRANGE_7; break;
default: return -ENOSYS;
}
diff --git a/residual_calc.c b/residual_calc.c
index 24ff088..3f1a91b 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -65,6 +65,41 @@ void mg2di_residual_calc_line_s2_avx2(size_t linesize, double *dst, double *dst_
const double *fd_factors);
#endif
+static const double fd_coeffs[][2][FD_STENCIL_MAX * 2 + 1] = {
+ {
+ { -1.0, 0.0, 1.0 },
+ { 1.0, -2.0, 1.0 },
+ },
+ {
+ { 1.0, -8.0, 0.0, 8.0, -1.0 },
+ { -1.0, 16.0, -30.0, 16.0, -1.0 },
+ },
+ {
+ { -1.0, 9.0, -45.0, 0.0, 45.0, -9.0, 1.0 },
+ { 1.0, -13.5, 135.0, -245.0, 135.0, -13.5, 1.0 },
+ },
+ {
+ { 3.0, -32.0, 168.0, -672.0, 0.0, 672.0, -168.0, 32.0, -3.0 },
+ { -9.0, 128.0, -1008.0, 8064.0, -14350.0, 8064.0, -1008.0, 128.0, -9.0 },
+ },
+};
+
+#define STENCIL 1
+#include "residual_calc_template.c"
+#undef STENCIL
+
+#define STENCIL 2
+#include "residual_calc_template.c"
+#undef STENCIL
+
+#define STENCIL 3
+#include "residual_calc_template.c"
+#undef STENCIL
+
+#define STENCIL 4
+#include "residual_calc_template.c"
+#undef STENCIL
+
static void
derivatives_calc_s1(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
{
@@ -125,6 +160,82 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd
-1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[MG2D_DIFF_COEFF_11];
}
+static void
+derivatives_calc_s3(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+{
+ const double val = u[0];
+
+ const double valxp1 = u[ 1];
+ const double valxp2 = u[ 2];
+ const double valxp3 = u[ 3];
+ const double valxm1 = u[-1];
+ const double valxm2 = u[-2];
+ const double valxm3 = u[-3];
+ const double valyp1 = u[ 1 * stride];
+ const double valyp2 = u[ 2 * stride];
+ const double valyp3 = u[ 3 * stride];
+ const double valym1 = u[-1 * stride];
+ const double valym2 = u[-2 * stride];
+ const double valym3 = u[-3 * stride];
+
+ const double valxp3yp1 = u[ 3 + 1 * stride];
+ const double valxp3yp2 = u[ 3 + 2 * stride];
+ const double valxp3yp3 = u[ 3 + 3 * stride];
+ const double valxp3ym1 = u[ 3 - 1 * stride];
+ const double valxp3ym2 = u[ 3 - 2 * stride];
+ const double valxp3ym3 = u[ 3 - 3 * stride];
+
+ const double valxp2yp1 = u[ 2 + 1 * stride];
+ const double valxp2yp2 = u[ 2 + 2 * stride];
+ const double valxp2yp3 = u[ 2 + 3 * stride];
+ const double valxp2ym1 = u[ 2 - 1 * stride];
+ const double valxp2ym2 = u[ 2 - 2 * stride];
+ const double valxp2ym3 = u[ 2 - 3 * stride];
+
+ const double valxp1yp1 = u[ 1 + 1 * stride];
+ const double valxp1yp2 = u[ 1 + 2 * stride];
+ const double valxp1yp3 = u[ 1 + 3 * stride];
+ const double valxp1ym1 = u[ 1 - 1 * stride];
+ const double valxp1ym2 = u[ 1 - 2 * stride];
+ const double valxp1ym3 = u[ 1 - 3 * stride];
+
+ const double valxm1yp1 = u[-1 + 1 * stride];
+ const double valxm1yp2 = u[-1 + 2 * stride];
+ const double valxm1yp3 = u[-1 + 3 * stride];
+ const double valxm1ym1 = u[-1 - 1 * stride];
+ const double valxm1ym2 = u[-1 - 2 * stride];
+ const double valxm1ym3 = u[-1 - 3 * stride];
+
+ const double valxm2yp1 = u[-2 + 1 * stride];
+ const double valxm2yp2 = u[-2 + 2 * stride];
+ const double valxm2yp3 = u[-2 + 3 * stride];
+ const double valxm2ym1 = u[-2 - 1 * stride];
+ const double valxm2ym2 = u[-2 - 2 * stride];
+ const double valxm2ym3 = u[-2 - 3 * stride];
+
+ const double valxm3yp1 = u[-3 + 1 * stride];
+ const double valxm3yp2 = u[-3 + 2 * stride];
+ const double valxm3yp3 = u[-3 + 3 * stride];
+ const double valxm3ym1 = u[-3 - 1 * stride];
+ const double valxm3ym2 = u[-3 - 2 * stride];
+ const double valxm3ym3 = u[-3 - 3 * stride];
+
+ dst[MG2D_DIFF_COEFF_00] = val;
+ dst[MG2D_DIFF_COEFF_10] = (1.0 * valxp3 - 9.0 * valxp2 + 45.0 * valxp1 - 45.0 * valxm1 + 9.0 * valxm2 - 1.0 * valxm3) * fd_factors[MG2D_DIFF_COEFF_10];
+ dst[MG2D_DIFF_COEFF_01] = (1.0 * valyp3 - 9.0 * valyp2 + 45.0 * valyp1 - 45.0 * valym1 + 9.0 * valym2 - 1.0 * valym3) * fd_factors[MG2D_DIFF_COEFF_01];
+
+ dst[MG2D_DIFF_COEFF_20] = (1.0 * valxp3 - 13.5 * valxp2 + 135.0 * valxp1 - 245.0 * val + 135.0 * valxm1 - 13.5 * valxm2 + 1.0 * valxm3) * fd_factors[MG2D_DIFF_COEFF_20];
+ dst[MG2D_DIFF_COEFF_02] = (1.0 * valyp3 - 13.5 * valyp2 + 135.0 * valyp1 - 245.0 * val + 135.0 * valym1 - 13.5 * valym2 + 1.0 * valym3) * fd_factors[MG2D_DIFF_COEFF_02];
+
+ dst[MG2D_DIFF_COEFF_11] = fd_factors[MG2D_DIFF_COEFF_11] * (
+ 1.0 * (1.0 * valxp3yp3 - 9.0 * valxp3yp2 + 45.0 * valxp3yp1 - 45.0 * valxp3ym1 + 9.0 * valxp3ym2 - 1.0 * valxp3ym3)
+ - 9.0 * (1.0 * valxp2yp3 - 9.0 * valxp2yp2 + 45.0 * valxp2yp1 - 45.0 * valxp2ym1 + 9.0 * valxp2ym2 - 1.0 * valxp2ym3)
+ + 45.0 * (1.0 * valxp1yp3 - 9.0 * valxp1yp2 + 45.0 * valxp1yp1 - 45.0 * valxp1ym1 + 9.0 * valxp1ym2 - 1.0 * valxp1ym3)
+ - 45.0 * (1.0 * valxm1yp3 - 9.0 * valxm1yp2 + 45.0 * valxm1yp1 - 45.0 * valxm1ym1 + 9.0 * valxm1ym2 - 1.0 * valxm1ym3)
+ + 9.0 * (1.0 * valxm2yp3 - 9.0 * valxm2yp2 + 45.0 * valxm2yp1 - 45.0 * valxm2ym1 + 9.0 * valxm2ym2 - 1.0 * valxm2ym3)
+ - 1.0 * (1.0 * valxm3yp3 - 9.0 * valxm3yp2 + 45.0 * valxm3yp1 - 45.0 * valxm3ym1 + 9.0 * valxm3ym2 - 1.0 * valxm3ym3));
+}
+
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
@@ -173,6 +284,30 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
*dst_max = MAX(*dst_max, res_max);
}
+static void residual_calc_line_s3_c(size_t linesize, double *dst, double *dst_max,
+ ptrdiff_t stride, const double *u, const double *rhs,
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *fd_factors)
+{
+ double res_max = 0.0, res_abs;
+ for (size_t i = 0; i < linesize; i++) {
+ double u_vals[MG2D_DIFF_COEFF_NB];
+ double res;
+
+ derivatives_calc_s3(u_vals, u + i, fd_factors, stride);
+
+ res = -rhs[i];
+ for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
+ res += u_vals[j] * diff_coeffs[j][i];
+ dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
+ }
+
+ *dst_max = MAX(*dst_max, res_max);
+}
+
static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
{
ResidualCalcInternal *priv = arg;
@@ -230,6 +365,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
switch (ctx->fd_stencil) {
case 1:
priv->residual_calc_line = residual_calc_line_s1_c;
+ //priv->residual_calc_line = residual_calc_line_1;
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
priv->residual_calc_line = mg2di_residual_calc_line_s1_avx2;
@@ -239,6 +375,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
break;
case 2:
priv->residual_calc_line = residual_calc_line_s2_c;
+ priv->residual_calc_line = residual_calc_line_2;
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
priv->residual_calc_line = mg2di_residual_calc_line_s2_avx2;
@@ -246,6 +383,14 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
}
#endif
break;
+ case 3:
+ priv->residual_calc_line = residual_calc_line_s3_c;
+ break;
+ case 4:
+ priv->residual_calc_line = residual_calc_line_4;
+ break;
+ default:
+ return -ENOSYS;
}
priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize;
diff --git a/residual_calc_template.c b/residual_calc_template.c
new file mode 100644
index 0000000..dcf7eb4
--- /dev/null
+++ b/residual_calc_template.c
@@ -0,0 +1,90 @@
+/*
+ * Residual calculation template
+ * Copyright 2019 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define JOIN2(a, b) a ## _ ## b
+#define FUNC2(name, stencil) JOIN2(name, stencil)
+#define FUNC(name) FUNC2(name, STENCIL)
+
+static void
+FUNC(derivatives_calc)(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+{
+ dst[MG2D_DIFF_COEFF_00] = u[0];
+
+ {
+ double val = 0.0;
+ for (int i = -STENCIL; i <= STENCIL; i++)
+ val += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[i];
+ dst[MG2D_DIFF_COEFF_10] = val * fd_factors[MG2D_DIFF_COEFF_10];
+ }
+ {
+ double val = 0.0;
+ for (int i = -STENCIL; i <= STENCIL; i++)
+ val += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[i * stride];
+ dst[MG2D_DIFF_COEFF_01] = val * fd_factors[MG2D_DIFF_COEFF_01];
+ }
+ {
+ double val = 0.0;
+ for (int i = -STENCIL; i <= STENCIL; i++)
+ val += fd_coeffs[STENCIL - 1][1][i + STENCIL] * u[i];
+ dst[MG2D_DIFF_COEFF_20] = val * fd_factors[MG2D_DIFF_COEFF_20];
+ }
+ {
+ double val = 0.0;
+ for (int i = -STENCIL; i <= STENCIL; i++)
+ val += fd_coeffs[STENCIL - 1][1][i + STENCIL] * u[i * stride];
+ dst[MG2D_DIFF_COEFF_02] = val * fd_factors[MG2D_DIFF_COEFF_02];
+ }
+ {
+ double val = 0.0;
+ for (int j = -STENCIL; j <= STENCIL; j++) {
+ double tmp = 0.0;
+ for (int i = -STENCIL; i <= STENCIL; i++)
+ tmp += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[j * stride + i];
+ val += fd_coeffs[STENCIL - 1][0][j + STENCIL] * tmp;
+ }
+ dst[MG2D_DIFF_COEFF_11] = val * fd_factors[MG2D_DIFF_COEFF_11];
+ }
+}
+static void
+FUNC(residual_calc_line)(size_t linesize, double *dst, double *dst_max,
+ ptrdiff_t stride, const double *u, const double *rhs,
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *fd_factors)
+{
+ double res_max = 0.0, res_abs;
+ for (size_t i = 0; i < linesize; i++) {
+ double u_vals[MG2D_DIFF_COEFF_NB];
+ double res;
+
+ FUNC(derivatives_calc)(u_vals, u + i, fd_factors, stride);
+
+ res = -rhs[i];
+ for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
+ res += u_vals[j] * diff_coeffs[j][i];
+ dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
+ }
+
+ *dst_max = MAX(*dst_max, res_max);
+}
+
+#undef FUNC
+#undef FUNC2
+#undef JOIN2
diff --git a/transfer.c b/transfer.c
index 311c6a9..d2809fa 100644
--- a/transfer.c
+++ b/transfer.c
@@ -118,10 +118,12 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim)
return -ENOMEM;
for (ptrdiff_t idx_dst = 0; idx_dst < ctx->dst.size[dim]; idx_dst++) {
- const ptrdiff_t idx_src = (ptrdiff_t)(scale * idx_dst) - (priv->stencil / 2 - 1);
- const double coord_dst = idx_dst * step_dst;
+ const double coord_dst = idx_dst * step_dst;
+ double *fact = priv->fact[dim] + priv->stencil * idx_dst;
- double *fact = priv->fact[dim] + priv->stencil * idx_dst;
+ ptrdiff_t idx_src = (ptrdiff_t)(scale * idx_dst) - (priv->stencil / 2 - 1);
+
+ //idx_src = MAX(MIN(idx_src, ctx->src.size[dim] - priv->stencil - 1), 0);
for (int i = 0; i < priv->stencil; i++)
coord_src[i] = (idx_src + i) * step_src;