summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2019-04-17 18:28:15 +0200
committerAnton Khirnov <anton@khirnov.net>2019-04-19 17:11:40 +0200
commitb2d93a84e3a9c84e8c591c9f55355b5834c03d4b (patch)
tree189213565869426d3d161e334908e63168c0bd99
parent5b94910fc4c6a47856290e9c23f9a905cf63c1eb (diff)
egs: premultiply diff_coeffs with the denominator in init
Do not do it at every residual calc, which also allows us to get rid of an extra parameter (and reduce the number of registers used in x86 SIMD).
-rw-r--r--ell_grid_solve.c175
-rw-r--r--residual_calc.asm21
-rw-r--r--residual_calc.c51
-rw-r--r--residual_calc.h3
4 files changed, 127 insertions, 123 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index a01f124..6646992 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -51,8 +51,7 @@ typedef struct EGSRelaxInternal {
typedef struct EGSExactInternal {
void (*fill_mat)(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
- NDArray **diff_coeffs, double *fd_factors,
- ptrdiff_t idx_src);
+ NDArray **diff_coeffs, ptrdiff_t idx_src);
size_t N;
size_t N_ghosts;
@@ -78,11 +77,13 @@ struct EGSInternal {
NDArray *residual_base;
/* all the diff coeffs concatenated */
+ NDArray *diff_coeffs_public_base;
+
NDArray *diff_coeffs_base;
+ NDArray *diff_coeffs[MG2D_DIFF_COEFF_NB];
ptrdiff_t residual_calc_offset[2];
size_t residual_calc_size[2];
- double fd_factors[MG2D_DIFF_COEFF_NB];
int bnd_zero[4];
@@ -123,7 +124,7 @@ static void residual_calc(EGSContext *ctx)
mg2di_timer_start(&ctx->timer_res_calc);
for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = NDPTR2D(ctx->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]);
+ diff_coeffs[i] = NDPTR2D(priv->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]);
mg2di_residual_calc(priv->rescalc, priv->residual_calc_size, &ctx->residual_max,
NDPTR2D(ctx->residual, priv->residual_calc_offset[1], priv->residual_calc_offset[0]),
@@ -132,7 +133,7 @@ static void residual_calc(EGSContext *ctx)
ctx->u->stride[0],
NDPTR2D(ctx->rhs, priv->residual_calc_offset[1], priv->residual_calc_offset[0]),
ctx->rhs->stride[0],
- diff_coeffs, ctx->diff_coeffs[0]->stride[0], priv->fd_factors);
+ diff_coeffs, priv->diff_coeffs[0]->stride[0]);
mg2di_timer_stop(&ctx->timer_res_calc);
}
@@ -322,78 +323,77 @@ static int solve_relax_step(EGSContext *ctx)
}
static void fill_mat_s1(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
- NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src)
+ NDArray **diff_coeffs, ptrdiff_t idx_src)
{
mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
- mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
- mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
- mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
- mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
- mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
}
static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
- NDArray **diff_coeffs, double *fd_factors,
- ptrdiff_t idx_src)
+ NDArray **diff_coeffs, ptrdiff_t idx_src)
{
mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
- mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
-
- mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
-
- mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
-
- mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
-
- mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
-
- mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
-
- mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
-
- mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+
+ mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+
+ mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+
+ mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+
+ mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+ mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+ mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+ mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
}
static void mat_fill_row(EGSContext *ctx, double *scratch_line,
@@ -402,7 +402,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line,
EGSInternal *priv = ctx->priv;
EGSExactInternal *e = &priv->e;
- const ptrdiff_t idx_src_dc = NDIDX2D(ctx->diff_coeffs[0], idx0, idx1);
+ const ptrdiff_t idx_src_dc = NDIDX2D(priv->diff_coeffs[0], idx0, idx1);
const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0;
int is_bnd[4], boundary;
@@ -431,7 +431,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line,
row_offset = idx1 * row_stride + idx0;
- e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src_dc);
+ e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, priv->diff_coeffs, idx_src_dc);
e->rhs_factor[mat_row_idx] = 1.0;
@@ -657,6 +657,13 @@ int mg2di_egs_solve(EGSContext *ctx)
return ret;
}
+static void init_diff_coeffs(NDArray *dst, const NDArray *src, double factor)
+{
+ for (ptrdiff_t idx0 = 0; idx0 < dst->shape[0]; idx0++)
+ for (ptrdiff_t idx1 = 0; idx1 < dst->shape[1]; idx1++)
+ *NDPTR2D(dst, idx1, idx0) = *NDPTR2D(src, idx1, idx0) * factor;
+}
+
int mg2di_egs_init(EGSContext *ctx, int flags)
{
EGSInternal *priv = ctx->priv;
@@ -689,13 +696,6 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
#endif
}
- priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00];
- priv->fd_factors[MG2D_DIFF_COEFF_10] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0]);
- priv->fd_factors[MG2D_DIFF_COEFF_01] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1]);
- priv->fd_factors[MG2D_DIFF_COEFF_20] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0]));
- priv->fd_factors[MG2D_DIFF_COEFF_02] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1]));
- priv->fd_factors[MG2D_DIFF_COEFF_11] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]);
-
if (!ctx->tp) {
ret = tp_init(&priv->tp_internal, 1);
if (ret < 0)
@@ -703,6 +703,15 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
ctx->tp = priv->tp_internal;
}
+ if (!(flags & EGS_INIT_FLAG_SAME_DIFF_COEFFS)) {
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_00], ctx->diff_coeffs[MG2D_DIFF_COEFF_00], 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]);
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_10], ctx->diff_coeffs[MG2D_DIFF_COEFF_10], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0]));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_01], ctx->diff_coeffs[MG2D_DIFF_COEFF_01], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1]));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_20], ctx->diff_coeffs[MG2D_DIFF_COEFF_20], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0])));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_02], ctx->diff_coeffs[MG2D_DIFF_COEFF_02], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1])));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_11], ctx->diff_coeffs[MG2D_DIFF_COEFF_11], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]));
+ }
+
if (ctx->solver_type == EGS_SOLVER_EXACT) {
EGSExactInternal *e = &priv->e;
unsigned int nb_threads = tp_get_nb_threads(ctx->tp);
@@ -823,7 +832,7 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
if (ret < 0)
return ret;
- ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base, 2,
+ ret = mg2di_ndarray_alloc(&priv->diff_coeffs_public_base, 2,
size_dc, NDARRAY_ALLOC_ZERO);
if (ret < 0)
return ret;
@@ -831,7 +840,20 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) {
const Slice slice_dc[2] = { SLICE(i * size_padded[0] + ghosts, (i + 1) * size_padded[0] -ghosts, 1),
SLICE(ghosts, -ghosts, 1) };
- ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_base, slice_dc);
+ ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_public_base, slice_dc);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base, 2,
+ size_dc, NDARRAY_ALLOC_ZERO);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < ARRAY_ELEMS(priv->diff_coeffs); i++) {
+ const Slice slice_dc[2] = { SLICE(i * size_padded[0] + ghosts, (i + 1) * size_padded[0] -ghosts, 1),
+ SLICE(ghosts, -ghosts, 1) };
+ ret = mg2di_ndarray_slice(&priv->diff_coeffs[i], priv->diff_coeffs_base, slice_dc);
if (ret < 0)
return ret;
}
@@ -982,6 +1004,11 @@ void mg2di_egs_free(EGSContext **pctx)
for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) {
mg2di_ndarray_free(&ctx->diff_coeffs[i]);
}
+ mg2di_ndarray_free(&ctx->priv->diff_coeffs_public_base);
+
+ for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs); i++) {
+ mg2di_ndarray_free(&ctx->priv->diff_coeffs[i]);
+ }
mg2di_ndarray_free(&ctx->priv->diff_coeffs_base);
for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++)
diff --git a/residual_calc.asm b/residual_calc.asm
index c51ba5e..3a5b800 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -39,7 +39,6 @@ SECTION .text
; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
-; m1-m5: splatted constant finite difference coefficients
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
@@ -90,7 +89,6 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- mulpd m11, m2
vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
; second derivative
@@ -102,7 +100,6 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- mulpd m11, m5
vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
%endmacro
@@ -139,7 +136,6 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- mulpd m6, m3
vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
%endmacro
@@ -147,15 +143,6 @@ SECTION .text
%macro RESIDUAL_CALC 1
%define stencil %1
- ; load and splat the finite difference factors
- movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m1, m0, 00000000b ; diff factor 01 -> m1
- vpermq m2, m0, 01010101b ; diff factor 10 -> m2
- vpermq m3, m0, 10101010b ; diff factor 11 -> m3
- vpermq m4, m0, 11111111b ; diff factor 02 -> m4
- movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m5, m0, 00000000b ; diff factor 20 -> m5
- %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
; compute the mask for absolute value
pcmpeqq m13, m13
@@ -266,11 +253,11 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1
INIT_YMM fma3
-cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2
diff --git a/residual_calc.c b/residual_calc.c
index 8f67823..bfbb9bf 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -44,8 +44,6 @@ typedef struct ResidualCalcTask {
const double * const *diff_coeffs;
ptrdiff_t diff_coeffs_stride;
-
- const double *fd_factors;
} ResidualCalcTask;
struct ResidualCalcInternal {
@@ -54,8 +52,7 @@ struct ResidualCalcInternal {
void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors);
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]);
size_t calc_blocksize;
ResidualCalcTask task;
@@ -64,29 +61,27 @@ struct ResidualCalcInternal {
#if HAVE_EXTERNAL_ASM
void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors);
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]);
void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors);
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]);
#endif
static void
-derivatives_calc_s1(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+derivatives_calc_s1(double *dst, const double *u, ptrdiff_t stride)
{
dst[MG2D_DIFF_COEFF_00] = u[0];
- dst[MG2D_DIFF_COEFF_10] = (u[1] - u[-1]) * fd_factors[MG2D_DIFF_COEFF_10];
- dst[MG2D_DIFF_COEFF_01] = (u[stride] - u[-stride]) * fd_factors[MG2D_DIFF_COEFF_01];
+ dst[MG2D_DIFF_COEFF_10] = (u[1] - u[-1]);
+ dst[MG2D_DIFF_COEFF_01] = (u[stride] - u[-stride]);
- dst[MG2D_DIFF_COEFF_20] = (u[1] - 2.0 * u[0] + u[-1]) * fd_factors[MG2D_DIFF_COEFF_20];
- dst[MG2D_DIFF_COEFF_02] = (u[stride] - 2.0 * u[0] + u[-stride]) * fd_factors[MG2D_DIFF_COEFF_02];
+ dst[MG2D_DIFF_COEFF_20] = (u[1] - 2.0 * u[0] + u[-1]);
+ dst[MG2D_DIFF_COEFF_02] = (u[stride] - 2.0 * u[0] + u[-stride]);
- dst[MG2D_DIFF_COEFF_11] = (u[1 + stride] - u[stride - 1] - u[-stride + 1] + u[-stride - 1]) * fd_factors[MG2D_DIFF_COEFF_11];
+ dst[MG2D_DIFF_COEFF_11] = (u[1 + stride] - u[stride - 1] - u[-stride + 1] + u[-stride - 1]);
}
static void
-derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride)
{
const double val = u[0];
@@ -120,29 +115,28 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd
const double valxm2ym2 = u[-2 - 2 * stride];
dst[MG2D_DIFF_COEFF_00] = val;
- dst[MG2D_DIFF_COEFF_10] = (-1.0 * valxp2 + 8.0 * valxp1 - 8.0 * valxm1 + 1.0 * valxm2) * fd_factors[MG2D_DIFF_COEFF_10];
- dst[MG2D_DIFF_COEFF_01] = (-1.0 * valyp2 + 8.0 * valyp1 - 8.0 * valym1 + 1.0 * valym2) * fd_factors[MG2D_DIFF_COEFF_01];
+ dst[MG2D_DIFF_COEFF_10] = (-1.0 * valxp2 + 8.0 * valxp1 - 8.0 * valxm1 + 1.0 * valxm2);
+ dst[MG2D_DIFF_COEFF_01] = (-1.0 * valyp2 + 8.0 * valyp1 - 8.0 * valym1 + 1.0 * valym2);
- dst[MG2D_DIFF_COEFF_20] = (-1.0 * valxp2 + 16.0 * valxp1 - 30.0 * val + 16.0 * valxm1 - 1.0 * valxm2) * fd_factors[MG2D_DIFF_COEFF_20];
- dst[MG2D_DIFF_COEFF_02] = (-1.0 * valyp2 + 16.0 * valyp1 - 30.0 * val + 16.0 * valym1 - 1.0 * valym2) * fd_factors[MG2D_DIFF_COEFF_02];
+ dst[MG2D_DIFF_COEFF_20] = (-1.0 * valxp2 + 16.0 * valxp1 - 30.0 * val + 16.0 * valxm1 - 1.0 * valxm2);
+ dst[MG2D_DIFF_COEFF_02] = (-1.0 * valyp2 + 16.0 * valyp1 - 30.0 * val + 16.0 * valym1 - 1.0 * valym2);
dst[MG2D_DIFF_COEFF_11] = ( 1.0 * valxp2yp2 - 8.0 * valxp2yp1 + 8.0 * valxp2ym1 - 1.0 * valxp2ym2
-8.0 * valxp1yp2 + 64.0 * valxp1yp1 - 64.0 * valxp1ym1 + 8.0 * valxp1ym2
+8.0 * valxm1yp2 - 64.0 * valxm1yp1 + 64.0 * valxm1ym1 - 8.0 * valxm1ym2
- -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[MG2D_DIFF_COEFF_11];
+ -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2);
}
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors)
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB])
{
double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[MG2D_DIFF_COEFF_NB];
double res;
- derivatives_calc_s1(u_vals, u + i, fd_factors, stride);
+ derivatives_calc_s1(u_vals, u + i, stride);
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
@@ -158,15 +152,14 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors)
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB])
{
double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[MG2D_DIFF_COEFF_NB];
double res;
- derivatives_calc_s2(u_vals, u + i, fd_factors, stride);
+ derivatives_calc_s2(u_vals, u + i, stride);
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
@@ -194,7 +187,7 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->fd_factors);
+ diff_coeffs);
return 0;
}
@@ -205,8 +198,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
- const double *fd_factors)
+ ptrdiff_t diff_coeffs_stride)
{
ResidualCalcInternal *priv = ctx->priv;
ResidualCalcTask *task = &priv->task;
@@ -223,7 +215,6 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
task->rhs_stride = rhs_stride;
task->diff_coeffs = diff_coeffs;
task->diff_coeffs_stride = diff_coeffs_stride;
- task->fd_factors = fd_factors;
tp_execute(ctx->tp, size[1], residual_calc_task, priv);
diff --git a/residual_calc.h b/residual_calc.h
index c466214..31c1909 100644
--- a/residual_calc.h
+++ b/residual_calc.h
@@ -74,7 +74,6 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
- const double *fd_factors);
+ ptrdiff_t diff_coeffs_stride);
#endif // MG2D_RESIDUAL_CALC_H