From f52486a2382b5e1036b608d663ccece5d2de220d Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 9 Apr 2019 09:45:16 +0200 Subject: egs: add higher-order finite difference operators --- ell_grid_solve.c | 223 ++++++++++++++++++++++++++++++++++++++++++++- mg2d.c | 2 + residual_calc.asm | 231 ++++++++++++++++++++++++++++++++++++++++++++++- residual_calc.c | 196 ++++++++++++++++++++++++++++++++++++++++ residual_calc_template.c | 105 +++++++++++++++++++++ transfer.c | 3 +- 6 files changed, 754 insertions(+), 6 deletions(-) create mode 100644 residual_calc_template.c diff --git a/ell_grid_solve.c b/ell_grid_solve.c index dad8861..d796b26 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -44,6 +44,8 @@ static const double relax_factors[FD_STENCIL_MAX] = { [0] = 1.0 / 5, [1] = 1.0 / 7, + [2] = 1.0 / 7, + [3] = 1.0 / 7, }; typedef struct EGSRelaxInternal { @@ -140,6 +142,22 @@ static const double fd_denoms[][MG2D_DIFF_COEFF_NB] = { [MG2D_DIFF_COEFF_02] = 12.0, [MG2D_DIFF_COEFF_11] = 144.0, }, + { + [MG2D_DIFF_COEFF_00] = 1.0, + [MG2D_DIFF_COEFF_10] = 60.0, + [MG2D_DIFF_COEFF_01] = 60.0, + [MG2D_DIFF_COEFF_20] = 90.0, + [MG2D_DIFF_COEFF_02] = 90.0, + [MG2D_DIFF_COEFF_11] = 3600.0, + }, + { + [MG2D_DIFF_COEFF_00] = 1.0, + [MG2D_DIFF_COEFF_10] = 840.0, + [MG2D_DIFF_COEFF_01] = 840.0, + [MG2D_DIFF_COEFF_20] = 5040.0, + [MG2D_DIFF_COEFF_02] = 5040.0, + [MG2D_DIFF_COEFF_11] = SQR(840.0), + }, }; static void boundaries_sync(EGSContext *ctx, NDArray *a_dst) @@ -227,7 +245,9 @@ static void boundaries_apply_reflect(double *dst, const ptrdiff_t dst_stride[2], /* FIXME we currently use 1st-order forward differences in all cases, * since centered and/or higher-order FDs seem to be unstable * this may be suboptimal and should be investigated closer */ -static const double falloff_bnd_fd_factors[][FD_STENCIL_MAX * 2 + 1] = { +static const double falloff_bnd_fd_factors[][2] = { + { 1.0 / 2.0, -1.0 / 2.0, }, + { 1.0 / 2.0, -1.0 / 2.0, }, { 1.0 / 2.0, -1.0 / 2.0, }, { 1.0 / 2.0, -1.0 / 2.0, }, //{ -1.0, 8.0, 0.0, -8.0, 1.0 }, @@ -248,7 +268,7 @@ static void boundaries_apply_falloff(double *dst, const ptrdiff_t dst_stride[2], double val = row[-(ptrdiff_t)ctx->fd_stencil * dst_stride[1]] / r * (x / r); - for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++) + for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors[0]); i++) val += dst[(bnd_layer - i) * dst_stride[1] + bnd_idx * dst_stride[0]] * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / ctx->step[0]; *row = -val * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]); @@ -574,6 +594,201 @@ static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_str mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; } +static void fill_mat_s3(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, + NDArray **diff_coeffs, ptrdiff_t idx_src) +{ + mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; + + mat_row[ 3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[ 2 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[ 1 * mat_stride] += 45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-1 * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-2 * mat_stride] += 9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-3 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + + mat_row[ 3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[ 2 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[ 1 * row_stride * mat_stride] += 45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-2 * row_stride * mat_stride] += 9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-3 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + + mat_row[ 3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 2 * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 1 * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 0 * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-1 * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-2 * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + + mat_row[ 3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 2 * row_stride * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 1 * row_stride * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 0 * row_stride * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-2 * row_stride * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + + mat_row[( 3 + 3 * row_stride) * mat_stride] += (1.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 + 2 * row_stride) * mat_stride] += (1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 + 1 * row_stride) * mat_stride] += (1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 1 * row_stride) * mat_stride] += (1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 2 * row_stride) * mat_stride] += (1.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 3 * row_stride) * mat_stride] += (1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[( 2 + 3 * row_stride) * mat_stride] += (-9.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 + 2 * row_stride) * mat_stride] += (-9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 + 1 * row_stride) * mat_stride] += (-9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 1 * row_stride) * mat_stride] += (-9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 2 * row_stride) * mat_stride] += (-9.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 3 * row_stride) * mat_stride] += (-9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[( 1 + 3 * row_stride) * mat_stride] += (45.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 + 2 * row_stride) * mat_stride] += (45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 + 1 * row_stride) * mat_stride] += (45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 1 * row_stride) * mat_stride] += (45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 2 * row_stride) * mat_stride] += (45.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 3 * row_stride) * mat_stride] += (45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-1 + 3 * row_stride) * mat_stride] += (-45.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 2 * row_stride) * mat_stride] += (-45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 1 * row_stride) * mat_stride] += (-45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 1 * row_stride) * mat_stride] += (-45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 2 * row_stride) * mat_stride] += (-45.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 3 * row_stride) * mat_stride] += (-45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-2 + 3 * row_stride) * mat_stride] += (9.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 + 2 * row_stride) * mat_stride] += (9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 + 1 * row_stride) * mat_stride] += (9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 1 * row_stride) * mat_stride] += (9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 2 * row_stride) * mat_stride] += (9.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 3 * row_stride) * mat_stride] += (9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-3 + 3 * row_stride) * mat_stride] += (-1.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 + 2 * row_stride) * mat_stride] += (-1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 + 1 * row_stride) * mat_stride] += (-1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 1 * row_stride) * mat_stride] += (-1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 2 * row_stride) * mat_stride] += (-1.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 3 * row_stride) * mat_stride] += (-1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; +} + +static void fill_mat_s4(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, + NDArray **diff_coeffs, ptrdiff_t idx_src) +{ + mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; + + mat_row[ 4 * mat_stride] += -3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[ 3 * mat_stride] += 32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[ 2 * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[ 1 * mat_stride] += 672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-1 * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-2 * mat_stride] += 168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-3 * mat_stride] += -32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-4 * mat_stride] += 3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + + mat_row[ 4 * row_stride * mat_stride] += -3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[ 3 * row_stride * mat_stride] += 32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[ 2 * row_stride * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[ 1 * row_stride * mat_stride] += 672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-2 * row_stride * mat_stride] += 168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-3 * row_stride * mat_stride] += -32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-4 * row_stride * mat_stride] += 3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + + mat_row[ 4 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 3 * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 2 * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 1 * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 0 * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-1 * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-2 * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-3 * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-4 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + + mat_row[ 4 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 3 * row_stride * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 2 * row_stride * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 1 * row_stride * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 0 * row_stride * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-2 * row_stride * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-3 * row_stride * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-4 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + + mat_row[( 4 + 4 * row_stride) * mat_stride] += (-3.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 + 3 * row_stride) * mat_stride] += (-3.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 + 2 * row_stride) * mat_stride] += (-3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 + 1 * row_stride) * mat_stride] += (-3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 - 1 * row_stride) * mat_stride] += (-3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 - 2 * row_stride) * mat_stride] += (-3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 - 3 * row_stride) * mat_stride] += (-3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 4 - 4 * row_stride) * mat_stride] += (-3.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[( 3 + 4 * row_stride) * mat_stride] += (32.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 + 3 * row_stride) * mat_stride] += (32.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 + 2 * row_stride) * mat_stride] += (32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 + 1 * row_stride) * mat_stride] += (32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 1 * row_stride) * mat_stride] += (32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 2 * row_stride) * mat_stride] += (32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 3 * row_stride) * mat_stride] += (32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 3 - 4 * row_stride) * mat_stride] += (32.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[( 2 + 4 * row_stride) * mat_stride] += (-168.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 + 3 * row_stride) * mat_stride] += (-168.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 + 2 * row_stride) * mat_stride] += (-168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 + 1 * row_stride) * mat_stride] += (-168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 1 * row_stride) * mat_stride] += (-168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 2 * row_stride) * mat_stride] += (-168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 3 * row_stride) * mat_stride] += (-168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 4 * row_stride) * mat_stride] += (-168.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[( 1 + 4 * row_stride) * mat_stride] += (672.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 + 3 * row_stride) * mat_stride] += (672.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 + 2 * row_stride) * mat_stride] += (672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 + 1 * row_stride) * mat_stride] += (672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 1 * row_stride) * mat_stride] += (672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 2 * row_stride) * mat_stride] += (672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 3 * row_stride) * mat_stride] += (672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 4 * row_stride) * mat_stride] += (672.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-1 + 4 * row_stride) * mat_stride] += (-672.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 3 * row_stride) * mat_stride] += (-672.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 2 * row_stride) * mat_stride] += (-672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 1 * row_stride) * mat_stride] += (-672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 1 * row_stride) * mat_stride] += (-672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 2 * row_stride) * mat_stride] += (-672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 3 * row_stride) * mat_stride] += (-672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 4 * row_stride) * mat_stride] += (-672.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-2 + 4 * row_stride) * mat_stride] += (168.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 + 3 * row_stride) * mat_stride] += (168.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 + 2 * row_stride) * mat_stride] += (168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 + 1 * row_stride) * mat_stride] += (168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 1 * row_stride) * mat_stride] += (168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 2 * row_stride) * mat_stride] += (168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 3 * row_stride) * mat_stride] += (168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 4 * row_stride) * mat_stride] += (168.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-3 + 4 * row_stride) * mat_stride] += (-32.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 + 3 * row_stride) * mat_stride] += (-32.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 + 2 * row_stride) * mat_stride] += (-32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 + 1 * row_stride) * mat_stride] += (-32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 1 * row_stride) * mat_stride] += (-32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 2 * row_stride) * mat_stride] += (-32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 3 * row_stride) * mat_stride] += (-32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-3 - 4 * row_stride) * mat_stride] += (-32.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-4 + 4 * row_stride) * mat_stride] += (3.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 + 3 * row_stride) * mat_stride] += (3.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 + 2 * row_stride) * mat_stride] += (3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 + 1 * row_stride) * mat_stride] += (3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 - 1 * row_stride) * mat_stride] += (3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 - 2 * row_stride) * mat_stride] += (3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 - 3 * row_stride) * mat_stride] += (3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-4 - 4 * row_stride) * mat_stride] += (3.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + +} + static void mat_fill_row(EGSContext *ctx, double *scratch_line, ptrdiff_t idx1, ptrdiff_t idx0) { @@ -707,7 +922,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line, const double r = sqrt(SQR(x) + SQR(y)); double val = dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; - for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++) { + for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors[0]); i++) { dst[dir * (bnd_layer - i) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= val * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]; } @@ -1021,6 +1236,8 @@ int mg2di_egs_init(EGSContext *ctx, int flags) switch (ctx->fd_stencil) { case 1: priv->e.fill_mat = fill_mat_s1; break; case 2: priv->e.fill_mat = fill_mat_s2; break; + case 3: priv->e.fill_mat = fill_mat_s3; break; + case 4: priv->e.fill_mat = fill_mat_s4; break; default: mg2di_log(&ctx->logger, 0, "Invalid finite difference stencil: %zd\n", ctx->fd_stencil); diff --git a/mg2d.c b/mg2d.c index 103f514..fe98c73 100644 --- a/mg2d.c +++ b/mg2d.c @@ -860,6 +860,8 @@ static int mg_levels_init(MG2DContext *ctx) switch (ctx->fd_stencil) { case 1: op_interp = GRID_TRANSFER_LAGRANGE_3; break; case 2: op_interp = GRID_TRANSFER_LAGRANGE_5; break; + case 3: op_interp = GRID_TRANSFER_LAGRANGE_7; break; + case 4: op_interp = GRID_TRANSFER_LAGRANGE_7; break; default: return -ENOSYS; } diff --git a/residual_calc.asm b/residual_calc.asm index 5eea31c..471f864 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -32,8 +32,14 @@ SECTION .rodata -const8: times 8 dq 8.0 -const30: times 8 dq 30.0 +const5: times 8 dq 5.0 +const8: times 8 dq 8.0 +const9: times 8 dq 9.0 +const10: times 8 dq 10.0 +const13_5: times 8 dq 13.5 +const30: times 8 dq 30.0 +const45: times 8 dq 45.0 +const245: times 8 dq 245.0 SECTION .text @@ -282,3 +288,224 @@ RESIDUAL_CALC 2, 0 cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2, 1 + +%macro DX3 3 + movu %1, [%3 + ELEM_SIZE * 1] ; out = u[x+1] + subpd %1, [%3 - ELEM_SIZE * 1] ; -u[x-1] + + movu %2, [%3 + ELEM_SIZE * 2] ; tmp = u[x+2] + subpd %2, [%3 - ELEM_SIZE * 2] ; -u[x-2] + + vfmsub132pd %1, %2, [const5] ; out = 5 * out - tmp + + movu %2, [%3 + ELEM_SIZE * 3] ; tmp = u[x+3] + subpd %2, [%3 - ELEM_SIZE * 3] ; -u[x-3] + + vfmadd132pd %1, %2, [const9] ; out = 9 * out + tmp +%endmacro + +%macro RES_ADD_DIFF_X_3 0 + ; first derivative + DX3 m7, m8, uq + offsetq + vfmadd231pd m0, m7, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10 + + ; second derivative + movu m7, [uq + offsetq + ELEM_SIZE * 1] ; m7 = u[x+1] + addpd m7, [uq + offsetq - ELEM_SIZE * 1] ; +u[x-1] + + movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2] + addpd m8, [uq + offsetq - ELEM_SIZE * 2] ; +u[x-2] + + vfmsub132pd m7, m8, [const10] ; m7 = 10 * m7 - m8 + + movu m8, [uq + offsetq + ELEM_SIZE * 3] ; m8 = u[x+3] + addpd m8, [uq + offsetq - ELEM_SIZE * 3] ; -u[x-3] + + vfmadd132pd m7, m8, [const13_5] ; m7 = 13.5 * m7 + m8 + vfnmadd231pd m7, m6, [const245] ; m7 = m7 - 245 * u[0] + vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]; res += d_x u * diff_coeffs20 +%endmacro + +%macro RES_ADD_DIFF_Y_3 0 + ; first derivative + lea u_upq, [uq + strideq] + mov u_downq, uq + sub u_downq, strideq + movu m7, [u_upq + offsetq] ; m7 = u[y+1] + subpd m7, [u_downq + offsetq] ; -u[y-1] + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+2] + subpd m8, [u_downq + offsetq] ; -u[y-2] + + vfmsub132pd m7, m8, [const5] ; m7 = 5 * m7 - m8 + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+3] + subpd m8, [u_downq + offsetq] ; -u[y-3] + + vfmadd132pd m7, m8, [const9] ; m7 = 9 * m7 + m8 + vfmadd231pd m0, m7, [diff_coeffs01q + offsetq] ; res += d_x u * diff_coeffs01 + + ; second derivative + lea u_upq, [uq + strideq] + mov u_downq, uq + sub u_downq, strideq + movu m7, [u_upq + offsetq] ; m7 = u[y+1] + addpd m7, [u_downq + offsetq] ; +u[y-1] + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+2] + addpd m8, [u_downq + offsetq] ; +u[y-2] + + vfmsub132pd m7, m8, [const10] ; m7 = 10 * m7 - m8 + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[x+3] + addpd m8, [u_downq + offsetq] ; -u[x-3] + + vfmadd132pd m7, m8, [const13_5] ; m7 = 13.5 * m7 + m8 + vfnmadd231pd m7, m6, [const245] ; m7 = m7 - 245 * u[0] + vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]; res += d_y u * diff_coeffs02 +%endmacro + +%macro RES_ADD_DIFF_MIXED_3 0 + lea u_upq, [uq + 2 * strideq] + add u_upq, strideq + DX3 m6, m7, u_upq + offsetq ; m6 = dx u[y + 3] + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y + 2] + vfnmadd231pd m6, m7, [const9] ; m6 = m6 - 9 * m7 + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y + 1] + vfmadd231pd m6, m7, [const45] ; m6 = m6 + 45 * m7 + + sub u_upq, strideq + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 1] + vfnmadd231pd m6, m7, [const45] ; m6 = m6 - 45 * m7 + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 2] + vfmadd231pd m6, m7, [const9] ; m6 = m6 + 9 * m7 + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 3] + subpd m6, m7 + vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] +%endmacro + +%macro RESIDUAL_CALC_3 2 +%if %2 + vpermq m2, m1, 0 +%endif + vpermq m1, m0, 0 + + ; compute the mask for absolute value + pcmpeqq m13, m13 + psrlq m13, 1 + movu m12, [res_maxq] + + ; load pointers to the equation coefficients + %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer + mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] + mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01] + mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10] + mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11] + mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02] + mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] + + ; setup the data pointers and the loop counter + shl strideq, 3 + shl linesizeq, 3 + add dstq, linesizeq + add uq, linesizeq + add rhsq, linesizeq + add diff_coeffs00q, linesizeq + add diff_coeffs01q, linesizeq + add diff_coeffs10q, linesizeq + add diff_coeffs11q, linesizeq + add diff_coeffs02q, linesizeq + add diff_coeffs20q, linesizeq + neg linesizeq + ; from now on, the register that held linesize is used as the offset into data arrays + %define offsetq linesizeq +.loop: + xorpd m0, m0 + subpd m0, [rhsq + offsetq] ; res = -rhs + + ; plain value + movu m6, [uq + offsetq] ; m6 = u[x] + vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 +%if %2 + mulpd m3, m6, m2 +%endif + + RES_ADD_DIFF_X_3 + RES_ADD_DIFF_Y_3 + RES_ADD_DIFF_MIXED_3 + + andpd m6, m0, m13 ; m6 = abs(res) + mulpd m0, m1 +%if %2 + addpd m0, m3 +%endif + + ; store the result + add offsetq, mmsize + jg .store_partial + + ; store full block + movu [dstq + offsetq - mmsize], m0 + maxpd m12, m6 + js .loop + jmp .finish + +.store_partial: + sub offsetq, ELEM_SIZE + jz .store3 + sub offsetq, ELEM_SIZE + jz .store2 + +.store1: + ; offsetq is now mmsize-2 after the write position + movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + + vpermq m6, m6, 0 + maxpd m12, m6 + + jmp .finish +.store2: + ; offsetq is now mmsize-2 after the write position + movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + mova xm6, xm6 + maxpd m12, m6 + jmp .finish +.store3: + ; offsetq is now mmsize-1 after the write position + movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0 + vextractf128 xm0, m0, 1 + movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0 + + vpermq m6, m6, 10100100b + maxpd m12, m6 + +.finish: + movu [res_maxq], m12 + RET +%endmacro + +INIT_YMM fma3 +cglobal residual_calc_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down +RESIDUAL_CALC_3 3, 0 + +cglobal residual_add_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down +RESIDUAL_CALC_3 3, 1 diff --git a/residual_calc.c b/residual_calc.c index 6d58c10..d92986b 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -87,8 +87,62 @@ void mg2di_residual_add_line_s2_fma3(size_t linesize, double *dst, double *dst_m ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], double res_mult, double u_mult); +void mg2di_residual_calc_line_s3_fma3(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult); +void mg2di_residual_add_line_s3_fma3(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult); #endif +static const double fd_coeffs[][2][FD_STENCIL_MAX * 2 + 1] = { + { + { -1.0, 0.0, 1.0 }, + { 1.0, -2.0, 1.0 }, + }, + { + { 1.0, -8.0, 0.0, 8.0, -1.0 }, + { -1.0, 16.0, -30.0, 16.0, -1.0 }, + }, + { + { -1.0, 9.0, -45.0, 0.0, 45.0, -9.0, 1.0 }, + { 1.0, -13.5, 135.0, -245.0, 135.0, -13.5, 1.0 }, + }, + { + { 3.0, -32.0, 168.0, -672.0, 0.0, 672.0, -168.0, 32.0, -3.0 }, + { -9.0, 128.0, -1008.0, 8064.0, -14350.0, 8064.0, -1008.0, 128.0, -9.0 }, + }, +}; + +#define STENCIL 1 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + +#define STENCIL 2 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + +#define STENCIL 3 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + +#define STENCIL 4 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#define ADD 1 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + static void derivatives_calc_s1(double *dst, const double *u, ptrdiff_t stride) { @@ -149,6 +203,81 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride) -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2); } +static void +derivatives_calc_s3(double *dst, const double *u, ptrdiff_t stride) +{ + const double val = u[0]; + + const double valxp1 = u[ 1]; + const double valxp2 = u[ 2]; + const double valxp3 = u[ 3]; + const double valxm1 = u[-1]; + const double valxm2 = u[-2]; + const double valxm3 = u[-3]; + const double valyp1 = u[ 1 * stride]; + const double valyp2 = u[ 2 * stride]; + const double valyp3 = u[ 3 * stride]; + const double valym1 = u[-1 * stride]; + const double valym2 = u[-2 * stride]; + const double valym3 = u[-3 * stride]; + + const double valxp3yp1 = u[ 3 + 1 * stride]; + const double valxp3yp2 = u[ 3 + 2 * stride]; + const double valxp3yp3 = u[ 3 + 3 * stride]; + const double valxp3ym1 = u[ 3 - 1 * stride]; + const double valxp3ym2 = u[ 3 - 2 * stride]; + const double valxp3ym3 = u[ 3 - 3 * stride]; + + const double valxp2yp1 = u[ 2 + 1 * stride]; + const double valxp2yp2 = u[ 2 + 2 * stride]; + const double valxp2yp3 = u[ 2 + 3 * stride]; + const double valxp2ym1 = u[ 2 - 1 * stride]; + const double valxp2ym2 = u[ 2 - 2 * stride]; + const double valxp2ym3 = u[ 2 - 3 * stride]; + + const double valxp1yp1 = u[ 1 + 1 * stride]; + const double valxp1yp2 = u[ 1 + 2 * stride]; + const double valxp1yp3 = u[ 1 + 3 * stride]; + const double valxp1ym1 = u[ 1 - 1 * stride]; + const double valxp1ym2 = u[ 1 - 2 * stride]; + const double valxp1ym3 = u[ 1 - 3 * stride]; + + const double valxm1yp1 = u[-1 + 1 * stride]; + const double valxm1yp2 = u[-1 + 2 * stride]; + const double valxm1yp3 = u[-1 + 3 * stride]; + const double valxm1ym1 = u[-1 - 1 * stride]; + const double valxm1ym2 = u[-1 - 2 * stride]; + const double valxm1ym3 = u[-1 - 3 * stride]; + + const double valxm2yp1 = u[-2 + 1 * stride]; + const double valxm2yp2 = u[-2 + 2 * stride]; + const double valxm2yp3 = u[-2 + 3 * stride]; + const double valxm2ym1 = u[-2 - 1 * stride]; + const double valxm2ym2 = u[-2 - 2 * stride]; + const double valxm2ym3 = u[-2 - 3 * stride]; + + const double valxm3yp1 = u[-3 + 1 * stride]; + const double valxm3yp2 = u[-3 + 2 * stride]; + const double valxm3yp3 = u[-3 + 3 * stride]; + const double valxm3ym1 = u[-3 - 1 * stride]; + const double valxm3ym2 = u[-3 - 2 * stride]; + const double valxm3ym3 = u[-3 - 3 * stride]; + + dst[MG2D_DIFF_COEFF_00] = val; + dst[MG2D_DIFF_COEFF_10] = (1.0 * valxp3 - 9.0 * valxp2 + 45.0 * valxp1 - 45.0 * valxm1 + 9.0 * valxm2 - 1.0 * valxm3); + dst[MG2D_DIFF_COEFF_01] = (1.0 * valyp3 - 9.0 * valyp2 + 45.0 * valyp1 - 45.0 * valym1 + 9.0 * valym2 - 1.0 * valym3); + + dst[MG2D_DIFF_COEFF_20] = (1.0 * valxp3 - 13.5 * valxp2 + 135.0 * valxp1 - 245.0 * val + 135.0 * valxm1 - 13.5 * valxm2 + 1.0 * valxm3); + dst[MG2D_DIFF_COEFF_02] = (1.0 * valyp3 - 13.5 * valyp2 + 135.0 * valyp1 - 245.0 * val + 135.0 * valym1 - 13.5 * valym2 + 1.0 * valym3); + + dst[MG2D_DIFF_COEFF_11] = ( 1.0 * (1.0 * valxp3yp3 - 9.0 * valxp3yp2 + 45.0 * valxp3yp1 - 45.0 * valxp3ym1 + 9.0 * valxp3ym2 - 1.0 * valxp3ym3) + - 9.0 * (1.0 * valxp2yp3 - 9.0 * valxp2yp2 + 45.0 * valxp2yp1 - 45.0 * valxp2ym1 + 9.0 * valxp2ym2 - 1.0 * valxp2ym3) + + 45.0 * (1.0 * valxp1yp3 - 9.0 * valxp1yp2 + 45.0 * valxp1yp1 - 45.0 * valxp1ym1 + 9.0 * valxp1ym2 - 1.0 * valxp1ym3) + - 45.0 * (1.0 * valxm1yp3 - 9.0 * valxm1yp2 + 45.0 * valxm1yp1 - 45.0 * valxm1ym1 + 9.0 * valxm1ym2 - 1.0 * valxm1ym3) + + 9.0 * (1.0 * valxm2yp3 - 9.0 * valxm2yp2 + 45.0 * valxm2yp1 - 45.0 * valxm2ym1 + 9.0 * valxm2ym2 - 1.0 * valxm2ym3) + - 1.0 * (1.0 * valxm3yp3 - 9.0 * valxm3yp2 + 45.0 * valxm3yp1 - 45.0 * valxm3ym1 + 9.0 * valxm3ym2 - 1.0 * valxm3ym3)); +} + static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], @@ -245,6 +374,54 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max *dst_max = MAX(*dst_max, res_max); } +static void residual_calc_line_s3_c(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + derivatives_calc_s3(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = res_mult * res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + +static void residual_add_line_s3_c(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + derivatives_calc_s3(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = u_mult * u[i] + res_mult * res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { ResidualCalcInternal *priv = arg; @@ -340,6 +517,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) case 1: priv->residual_calc_line = residual_calc_line_s1_c; priv->residual_add_line = residual_add_line_s1_c; + //priv->residual_calc_line = residual_calc_line_1; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3; @@ -351,6 +529,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) case 2: priv->residual_calc_line = residual_calc_line_s2_c; priv->residual_add_line = residual_add_line_s2_c; + //priv->residual_calc_line = residual_calc_line_2; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3; @@ -359,6 +538,23 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) } #endif break; + case 3: + priv->residual_calc_line = residual_calc_line_s3_c; + priv->residual_add_line = residual_add_line_s3_c; +#if HAVE_EXTERNAL_ASM + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { + priv->residual_calc_line = mg2di_residual_calc_line_s3_fma3; + priv->residual_add_line = mg2di_residual_add_line_s3_fma3; + priv->calc_blocksize = 4; + } +#endif + break; + case 4: + priv->residual_calc_line = residual_line_calc_4; + priv->residual_add_line = residual_line_add_4; + break; + default: + return -ENOSYS; } priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize; diff --git a/residual_calc_template.c b/residual_calc_template.c new file mode 100644 index 0000000..779ac24 --- /dev/null +++ b/residual_calc_template.c @@ -0,0 +1,105 @@ +/* + * Residual calculation template + * Copyright 2019 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#if ADD +# define ACT add +#else +#define ACT calc +#endif + +#define JOIN3(a, b, c) a ## _ ## b ## _ ## c +#define FUNC3(name, act, stencil) JOIN3(name, act, stencil) +#define FUNC(name) FUNC3(name, ACT, STENCIL) + +static void +FUNC(derivatives_calc)(double *dst, const double *u, ptrdiff_t stride) +{ + dst[MG2D_DIFF_COEFF_00] = u[0]; + + { + double val = 0.0; + for (int i = -STENCIL; i <= STENCIL; i++) + val += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[i]; + dst[MG2D_DIFF_COEFF_10] = val; + } + { + double val = 0.0; + for (int i = -STENCIL; i <= STENCIL; i++) + val += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[i * stride]; + dst[MG2D_DIFF_COEFF_01] = val; + } + { + double val = 0.0; + for (int i = -STENCIL; i <= STENCIL; i++) + val += fd_coeffs[STENCIL - 1][1][i + STENCIL] * u[i]; + dst[MG2D_DIFF_COEFF_20] = val; + } + { + double val = 0.0; + for (int i = -STENCIL; i <= STENCIL; i++) + val += fd_coeffs[STENCIL - 1][1][i + STENCIL] * u[i * stride]; + dst[MG2D_DIFF_COEFF_02] = val; + } + { + double val = 0.0; + for (int j = -STENCIL; j <= STENCIL; j++) { + double tmp = 0.0; + for (int i = -STENCIL; i <= STENCIL; i++) + tmp += fd_coeffs[STENCIL - 1][0][i + STENCIL] * u[j * stride + i]; + val += fd_coeffs[STENCIL - 1][0][j + STENCIL] * tmp; + } + dst[MG2D_DIFF_COEFF_11] = val; + } +} +static void +FUNC(residual_line)(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult +#if ADD + , double u_mult +#endif + ) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + FUNC(derivatives_calc)(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = res_mult * res +#if ADD + + u_mult * u[i] +#endif + ; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + +#undef FUNC +#undef FUNC2 +#undef JOIN2 +#undef ACT diff --git a/transfer.c b/transfer.c index 7fa41ee..2dd4cf1 100644 --- a/transfer.c +++ b/transfer.c @@ -129,9 +129,10 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) for (ptrdiff_t idx_dst = 0; idx_dst < ctx->dst.size[dim]; idx_dst++) { const ptrdiff_t idx_src = (ptrdiff_t)(scale * (idx_dst + ctx->dst.start[dim])) - ctx->src.start[dim] - (priv->stencil / 2 - 1); const double coord_dst = (idx_dst + ctx->dst.start[dim]) * step_dst; - double *fact = priv->fact[dim] + priv->stencil * idx_dst; + //idx_src = MAX(MIN(idx_src, ctx->src.size[dim] - priv->stencil - 1), 0); + for (int i = 0; i < priv->stencil; i++) coord_src[i] = (idx_src + ctx->src.start[dim] + i) * step_src; -- cgit v1.2.3