From b2d93a84e3a9c84e8c591c9f55355b5834c03d4b Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 17 Apr 2019 18:28:15 +0200 Subject: egs: premultiply diff_coeffs with the denominator in init Do not do it at every residual calc, which also allows us to get rid of an extra parameter (and reduce the number of registers used in x86 SIMD). --- ell_grid_solve.c | 175 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 101 insertions(+), 74 deletions(-) (limited to 'ell_grid_solve.c') diff --git a/ell_grid_solve.c b/ell_grid_solve.c index a01f124..6646992 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -51,8 +51,7 @@ typedef struct EGSRelaxInternal { typedef struct EGSExactInternal { void (*fill_mat)(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, - NDArray **diff_coeffs, double *fd_factors, - ptrdiff_t idx_src); + NDArray **diff_coeffs, ptrdiff_t idx_src); size_t N; size_t N_ghosts; @@ -78,11 +77,13 @@ struct EGSInternal { NDArray *residual_base; /* all the diff coeffs concatenated */ + NDArray *diff_coeffs_public_base; + NDArray *diff_coeffs_base; + NDArray *diff_coeffs[MG2D_DIFF_COEFF_NB]; ptrdiff_t residual_calc_offset[2]; size_t residual_calc_size[2]; - double fd_factors[MG2D_DIFF_COEFF_NB]; int bnd_zero[4]; @@ -123,7 +124,7 @@ static void residual_calc(EGSContext *ctx) mg2di_timer_start(&ctx->timer_res_calc); for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = NDPTR2D(ctx->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]); + diff_coeffs[i] = NDPTR2D(priv->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]); mg2di_residual_calc(priv->rescalc, priv->residual_calc_size, &ctx->residual_max, NDPTR2D(ctx->residual, priv->residual_calc_offset[1], priv->residual_calc_offset[0]), @@ -132,7 +133,7 @@ static void residual_calc(EGSContext *ctx) ctx->u->stride[0], NDPTR2D(ctx->rhs, priv->residual_calc_offset[1], priv->residual_calc_offset[0]), ctx->rhs->stride[0], - diff_coeffs, ctx->diff_coeffs[0]->stride[0], priv->fd_factors); + diff_coeffs, priv->diff_coeffs[0]->stride[0]); mg2di_timer_stop(&ctx->timer_res_calc); } @@ -322,78 +323,77 @@ static int solve_relax_step(EGSContext *ctx) } static void fill_mat_s1(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, - NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src) + NDArray **diff_coeffs, ptrdiff_t idx_src) { mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; - mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; - mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; - mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; - mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; - mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; - mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; - mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; } static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, - NDArray **diff_coeffs, double *fd_factors, - ptrdiff_t idx_src) + NDArray **diff_coeffs, ptrdiff_t idx_src) { mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; - mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; - mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; - mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; - mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; - - mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; - mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; - mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; - mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; - - mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; - - mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; - - mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - - mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - - mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - - mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; - mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src]; + + mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src]; + + mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src]; + + mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src]; + + mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + + mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; + mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src]; } static void mat_fill_row(EGSContext *ctx, double *scratch_line, @@ -402,7 +402,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line, EGSInternal *priv = ctx->priv; EGSExactInternal *e = &priv->e; - const ptrdiff_t idx_src_dc = NDIDX2D(ctx->diff_coeffs[0], idx0, idx1); + const ptrdiff_t idx_src_dc = NDIDX2D(priv->diff_coeffs[0], idx0, idx1); const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0; int is_bnd[4], boundary; @@ -431,7 +431,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line, row_offset = idx1 * row_stride + idx0; - e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src_dc); + e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, priv->diff_coeffs, idx_src_dc); e->rhs_factor[mat_row_idx] = 1.0; @@ -657,6 +657,13 @@ int mg2di_egs_solve(EGSContext *ctx) return ret; } +static void init_diff_coeffs(NDArray *dst, const NDArray *src, double factor) +{ + for (ptrdiff_t idx0 = 0; idx0 < dst->shape[0]; idx0++) + for (ptrdiff_t idx1 = 0; idx1 < dst->shape[1]; idx1++) + *NDPTR2D(dst, idx1, idx0) = *NDPTR2D(src, idx1, idx0) * factor; +} + int mg2di_egs_init(EGSContext *ctx, int flags) { EGSInternal *priv = ctx->priv; @@ -689,13 +696,6 @@ int mg2di_egs_init(EGSContext *ctx, int flags) #endif } - priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]; - priv->fd_factors[MG2D_DIFF_COEFF_10] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0]); - priv->fd_factors[MG2D_DIFF_COEFF_01] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1]); - priv->fd_factors[MG2D_DIFF_COEFF_20] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0])); - priv->fd_factors[MG2D_DIFF_COEFF_02] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1])); - priv->fd_factors[MG2D_DIFF_COEFF_11] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]); - if (!ctx->tp) { ret = tp_init(&priv->tp_internal, 1); if (ret < 0) @@ -703,6 +703,15 @@ int mg2di_egs_init(EGSContext *ctx, int flags) ctx->tp = priv->tp_internal; } + if (!(flags & EGS_INIT_FLAG_SAME_DIFF_COEFFS)) { + init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_00], ctx->diff_coeffs[MG2D_DIFF_COEFF_00], 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]); + init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_10], ctx->diff_coeffs[MG2D_DIFF_COEFF_10], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0])); + init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_01], ctx->diff_coeffs[MG2D_DIFF_COEFF_01], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1])); + init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_20], ctx->diff_coeffs[MG2D_DIFF_COEFF_20], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0]))); + init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_02], ctx->diff_coeffs[MG2D_DIFF_COEFF_02], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1]))); + init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_11], ctx->diff_coeffs[MG2D_DIFF_COEFF_11], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1])); + } + if (ctx->solver_type == EGS_SOLVER_EXACT) { EGSExactInternal *e = &priv->e; unsigned int nb_threads = tp_get_nb_threads(ctx->tp); @@ -823,7 +832,7 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2]) if (ret < 0) return ret; - ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base, 2, + ret = mg2di_ndarray_alloc(&priv->diff_coeffs_public_base, 2, size_dc, NDARRAY_ALLOC_ZERO); if (ret < 0) return ret; @@ -831,7 +840,20 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2]) for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { const Slice slice_dc[2] = { SLICE(i * size_padded[0] + ghosts, (i + 1) * size_padded[0] -ghosts, 1), SLICE(ghosts, -ghosts, 1) }; - ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_base, slice_dc); + ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_public_base, slice_dc); + if (ret < 0) + return ret; + } + + ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base, 2, + size_dc, NDARRAY_ALLOC_ZERO); + if (ret < 0) + return ret; + + for (int i = 0; i < ARRAY_ELEMS(priv->diff_coeffs); i++) { + const Slice slice_dc[2] = { SLICE(i * size_padded[0] + ghosts, (i + 1) * size_padded[0] -ghosts, 1), + SLICE(ghosts, -ghosts, 1) }; + ret = mg2di_ndarray_slice(&priv->diff_coeffs[i], priv->diff_coeffs_base, slice_dc); if (ret < 0) return ret; } @@ -982,6 +1004,11 @@ void mg2di_egs_free(EGSContext **pctx) for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { mg2di_ndarray_free(&ctx->diff_coeffs[i]); } + mg2di_ndarray_free(&ctx->priv->diff_coeffs_public_base); + + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs); i++) { + mg2di_ndarray_free(&ctx->priv->diff_coeffs[i]); + } mg2di_ndarray_free(&ctx->priv->diff_coeffs_base); for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) -- cgit v1.2.3