summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ell_grid_solve.c175
-rw-r--r--residual_calc.asm21
-rw-r--r--residual_calc.c51
-rw-r--r--residual_calc.h3
4 files changed, 127 insertions, 123 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index a01f124..6646992 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -51,8 +51,7 @@ typedef struct EGSRelaxInternal {
typedef struct EGSExactInternal {
void (*fill_mat)(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
- NDArray **diff_coeffs, double *fd_factors,
- ptrdiff_t idx_src);
+ NDArray **diff_coeffs, ptrdiff_t idx_src);
size_t N;
size_t N_ghosts;
@@ -78,11 +77,13 @@ struct EGSInternal {
NDArray *residual_base;
/* all the diff coeffs concatenated */
+ NDArray *diff_coeffs_public_base;
+
NDArray *diff_coeffs_base;
+ NDArray *diff_coeffs[MG2D_DIFF_COEFF_NB];
ptrdiff_t residual_calc_offset[2];
size_t residual_calc_size[2];
- double fd_factors[MG2D_DIFF_COEFF_NB];
int bnd_zero[4];
@@ -123,7 +124,7 @@ static void residual_calc(EGSContext *ctx)
mg2di_timer_start(&ctx->timer_res_calc);
for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = NDPTR2D(ctx->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]);
+ diff_coeffs[i] = NDPTR2D(priv->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]);
mg2di_residual_calc(priv->rescalc, priv->residual_calc_size, &ctx->residual_max,
NDPTR2D(ctx->residual, priv->residual_calc_offset[1], priv->residual_calc_offset[0]),
@@ -132,7 +133,7 @@ static void residual_calc(EGSContext *ctx)
ctx->u->stride[0],
NDPTR2D(ctx->rhs, priv->residual_calc_offset[1], priv->residual_calc_offset[0]),
ctx->rhs->stride[0],
- diff_coeffs, ctx->diff_coeffs[0]->stride[0], priv->fd_factors);
+ diff_coeffs, priv->diff_coeffs[0]->stride[0]);
mg2di_timer_stop(&ctx->timer_res_calc);
}
@@ -322,78 +323,77 @@ static int solve_relax_step(EGSContext *ctx)
}
static void fill_mat_s1(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
- NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src)
+ NDArray **diff_coeffs, ptrdiff_t idx_src)
{
mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
- mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
- mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
- mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
- mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
- mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
}
static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride,
- NDArray **diff_coeffs, double *fd_factors,
- ptrdiff_t idx_src)
+ NDArray **diff_coeffs, ptrdiff_t idx_src)
{
mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src];
- mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
- mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
-
- mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
- mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
-
- mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
- mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
-
- mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
- mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
-
- mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
-
- mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
-
- mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
-
- mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
- mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+ mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src];
+
+ mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+ mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src];
+
+ mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+ mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src];
+
+ mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+ mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src];
+
+ mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+ mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+ mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+
+ mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
+ mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src];
}
static void mat_fill_row(EGSContext *ctx, double *scratch_line,
@@ -402,7 +402,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line,
EGSInternal *priv = ctx->priv;
EGSExactInternal *e = &priv->e;
- const ptrdiff_t idx_src_dc = NDIDX2D(ctx->diff_coeffs[0], idx0, idx1);
+ const ptrdiff_t idx_src_dc = NDIDX2D(priv->diff_coeffs[0], idx0, idx1);
const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0;
int is_bnd[4], boundary;
@@ -431,7 +431,7 @@ static void mat_fill_row(EGSContext *ctx, double *scratch_line,
row_offset = idx1 * row_stride + idx0;
- e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src_dc);
+ e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, priv->diff_coeffs, idx_src_dc);
e->rhs_factor[mat_row_idx] = 1.0;
@@ -657,6 +657,13 @@ int mg2di_egs_solve(EGSContext *ctx)
return ret;
}
+static void init_diff_coeffs(NDArray *dst, const NDArray *src, double factor)
+{
+ for (ptrdiff_t idx0 = 0; idx0 < dst->shape[0]; idx0++)
+ for (ptrdiff_t idx1 = 0; idx1 < dst->shape[1]; idx1++)
+ *NDPTR2D(dst, idx1, idx0) = *NDPTR2D(src, idx1, idx0) * factor;
+}
+
int mg2di_egs_init(EGSContext *ctx, int flags)
{
EGSInternal *priv = ctx->priv;
@@ -689,13 +696,6 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
#endif
}
- priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00];
- priv->fd_factors[MG2D_DIFF_COEFF_10] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0]);
- priv->fd_factors[MG2D_DIFF_COEFF_01] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1]);
- priv->fd_factors[MG2D_DIFF_COEFF_20] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0]));
- priv->fd_factors[MG2D_DIFF_COEFF_02] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1]));
- priv->fd_factors[MG2D_DIFF_COEFF_11] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]);
-
if (!ctx->tp) {
ret = tp_init(&priv->tp_internal, 1);
if (ret < 0)
@@ -703,6 +703,15 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
ctx->tp = priv->tp_internal;
}
+ if (!(flags & EGS_INIT_FLAG_SAME_DIFF_COEFFS)) {
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_00], ctx->diff_coeffs[MG2D_DIFF_COEFF_00], 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]);
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_10], ctx->diff_coeffs[MG2D_DIFF_COEFF_10], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0]));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_01], ctx->diff_coeffs[MG2D_DIFF_COEFF_01], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1]));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_20], ctx->diff_coeffs[MG2D_DIFF_COEFF_20], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0])));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_02], ctx->diff_coeffs[MG2D_DIFF_COEFF_02], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1])));
+ init_diff_coeffs(priv->diff_coeffs[MG2D_DIFF_COEFF_11], ctx->diff_coeffs[MG2D_DIFF_COEFF_11], 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]));
+ }
+
if (ctx->solver_type == EGS_SOLVER_EXACT) {
EGSExactInternal *e = &priv->e;
unsigned int nb_threads = tp_get_nb_threads(ctx->tp);
@@ -823,7 +832,7 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
if (ret < 0)
return ret;
- ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base, 2,
+ ret = mg2di_ndarray_alloc(&priv->diff_coeffs_public_base, 2,
size_dc, NDARRAY_ALLOC_ZERO);
if (ret < 0)
return ret;
@@ -831,7 +840,20 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) {
const Slice slice_dc[2] = { SLICE(i * size_padded[0] + ghosts, (i + 1) * size_padded[0] -ghosts, 1),
SLICE(ghosts, -ghosts, 1) };
- ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_base, slice_dc);
+ ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_public_base, slice_dc);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base, 2,
+ size_dc, NDARRAY_ALLOC_ZERO);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < ARRAY_ELEMS(priv->diff_coeffs); i++) {
+ const Slice slice_dc[2] = { SLICE(i * size_padded[0] + ghosts, (i + 1) * size_padded[0] -ghosts, 1),
+ SLICE(ghosts, -ghosts, 1) };
+ ret = mg2di_ndarray_slice(&priv->diff_coeffs[i], priv->diff_coeffs_base, slice_dc);
if (ret < 0)
return ret;
}
@@ -982,6 +1004,11 @@ void mg2di_egs_free(EGSContext **pctx)
for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) {
mg2di_ndarray_free(&ctx->diff_coeffs[i]);
}
+ mg2di_ndarray_free(&ctx->priv->diff_coeffs_public_base);
+
+ for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs); i++) {
+ mg2di_ndarray_free(&ctx->priv->diff_coeffs[i]);
+ }
mg2di_ndarray_free(&ctx->priv->diff_coeffs_base);
for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++)
diff --git a/residual_calc.asm b/residual_calc.asm
index c51ba5e..3a5b800 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -39,7 +39,6 @@ SECTION .text
; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
-; m1-m5: splatted constant finite difference coefficients
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
@@ -90,7 +89,6 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- mulpd m11, m2
vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
; second derivative
@@ -102,7 +100,6 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- mulpd m11, m5
vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
%endmacro
@@ -139,7 +136,6 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- mulpd m6, m3
vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
%endmacro
@@ -147,15 +143,6 @@ SECTION .text
%macro RESIDUAL_CALC 1
%define stencil %1
- ; load and splat the finite difference factors
- movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m1, m0, 00000000b ; diff factor 01 -> m1
- vpermq m2, m0, 01010101b ; diff factor 10 -> m2
- vpermq m3, m0, 10101010b ; diff factor 11 -> m3
- vpermq m4, m0, 11111111b ; diff factor 02 -> m4
- movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m5, m0, 00000000b ; diff factor 20 -> m5
- %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
; compute the mask for absolute value
pcmpeqq m13, m13
@@ -266,11 +253,11 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1
INIT_YMM fma3
-cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2
diff --git a/residual_calc.c b/residual_calc.c
index 8f67823..bfbb9bf 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -44,8 +44,6 @@ typedef struct ResidualCalcTask {
const double * const *diff_coeffs;
ptrdiff_t diff_coeffs_stride;
-
- const double *fd_factors;
} ResidualCalcTask;
struct ResidualCalcInternal {
@@ -54,8 +52,7 @@ struct ResidualCalcInternal {
void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors);
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]);
size_t calc_blocksize;
ResidualCalcTask task;
@@ -64,29 +61,27 @@ struct ResidualCalcInternal {
#if HAVE_EXTERNAL_ASM
void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors);
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]);
void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors);
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]);
#endif
static void
-derivatives_calc_s1(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+derivatives_calc_s1(double *dst, const double *u, ptrdiff_t stride)
{
dst[MG2D_DIFF_COEFF_00] = u[0];
- dst[MG2D_DIFF_COEFF_10] = (u[1] - u[-1]) * fd_factors[MG2D_DIFF_COEFF_10];
- dst[MG2D_DIFF_COEFF_01] = (u[stride] - u[-stride]) * fd_factors[MG2D_DIFF_COEFF_01];
+ dst[MG2D_DIFF_COEFF_10] = (u[1] - u[-1]);
+ dst[MG2D_DIFF_COEFF_01] = (u[stride] - u[-stride]);
- dst[MG2D_DIFF_COEFF_20] = (u[1] - 2.0 * u[0] + u[-1]) * fd_factors[MG2D_DIFF_COEFF_20];
- dst[MG2D_DIFF_COEFF_02] = (u[stride] - 2.0 * u[0] + u[-stride]) * fd_factors[MG2D_DIFF_COEFF_02];
+ dst[MG2D_DIFF_COEFF_20] = (u[1] - 2.0 * u[0] + u[-1]);
+ dst[MG2D_DIFF_COEFF_02] = (u[stride] - 2.0 * u[0] + u[-stride]);
- dst[MG2D_DIFF_COEFF_11] = (u[1 + stride] - u[stride - 1] - u[-stride + 1] + u[-stride - 1]) * fd_factors[MG2D_DIFF_COEFF_11];
+ dst[MG2D_DIFF_COEFF_11] = (u[1 + stride] - u[stride - 1] - u[-stride + 1] + u[-stride - 1]);
}
static void
-derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride)
{
const double val = u[0];
@@ -120,29 +115,28 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd
const double valxm2ym2 = u[-2 - 2 * stride];
dst[MG2D_DIFF_COEFF_00] = val;
- dst[MG2D_DIFF_COEFF_10] = (-1.0 * valxp2 + 8.0 * valxp1 - 8.0 * valxm1 + 1.0 * valxm2) * fd_factors[MG2D_DIFF_COEFF_10];
- dst[MG2D_DIFF_COEFF_01] = (-1.0 * valyp2 + 8.0 * valyp1 - 8.0 * valym1 + 1.0 * valym2) * fd_factors[MG2D_DIFF_COEFF_01];
+ dst[MG2D_DIFF_COEFF_10] = (-1.0 * valxp2 + 8.0 * valxp1 - 8.0 * valxm1 + 1.0 * valxm2);
+ dst[MG2D_DIFF_COEFF_01] = (-1.0 * valyp2 + 8.0 * valyp1 - 8.0 * valym1 + 1.0 * valym2);
- dst[MG2D_DIFF_COEFF_20] = (-1.0 * valxp2 + 16.0 * valxp1 - 30.0 * val + 16.0 * valxm1 - 1.0 * valxm2) * fd_factors[MG2D_DIFF_COEFF_20];
- dst[MG2D_DIFF_COEFF_02] = (-1.0 * valyp2 + 16.0 * valyp1 - 30.0 * val + 16.0 * valym1 - 1.0 * valym2) * fd_factors[MG2D_DIFF_COEFF_02];
+ dst[MG2D_DIFF_COEFF_20] = (-1.0 * valxp2 + 16.0 * valxp1 - 30.0 * val + 16.0 * valxm1 - 1.0 * valxm2);
+ dst[MG2D_DIFF_COEFF_02] = (-1.0 * valyp2 + 16.0 * valyp1 - 30.0 * val + 16.0 * valym1 - 1.0 * valym2);
dst[MG2D_DIFF_COEFF_11] = ( 1.0 * valxp2yp2 - 8.0 * valxp2yp1 + 8.0 * valxp2ym1 - 1.0 * valxp2ym2
-8.0 * valxp1yp2 + 64.0 * valxp1yp1 - 64.0 * valxp1ym1 + 8.0 * valxp1ym2
+8.0 * valxm1yp2 - 64.0 * valxm1yp1 + 64.0 * valxm1ym1 - 8.0 * valxm1ym2
- -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[MG2D_DIFF_COEFF_11];
+ -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2);
}
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors)
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB])
{
double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[MG2D_DIFF_COEFF_NB];
double res;
- derivatives_calc_s1(u_vals, u + i, fd_factors, stride);
+ derivatives_calc_s1(u_vals, u + i, stride);
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
@@ -158,15 +152,14 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- const double *fd_factors)
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB])
{
double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[MG2D_DIFF_COEFF_NB];
double res;
- derivatives_calc_s2(u_vals, u + i, fd_factors, stride);
+ derivatives_calc_s2(u_vals, u + i, stride);
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
@@ -194,7 +187,7 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->fd_factors);
+ diff_coeffs);
return 0;
}
@@ -205,8 +198,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
- const double *fd_factors)
+ ptrdiff_t diff_coeffs_stride)
{
ResidualCalcInternal *priv = ctx->priv;
ResidualCalcTask *task = &priv->task;
@@ -223,7 +215,6 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
task->rhs_stride = rhs_stride;
task->diff_coeffs = diff_coeffs;
task->diff_coeffs_stride = diff_coeffs_stride;
- task->fd_factors = fd_factors;
tp_execute(ctx->tp, size[1], residual_calc_task, priv);
diff --git a/residual_calc.h b/residual_calc.h
index c466214..31c1909 100644
--- a/residual_calc.h
+++ b/residual_calc.h
@@ -74,7 +74,6 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
- const double *fd_factors);
+ ptrdiff_t diff_coeffs_stride);
#endif // MG2D_RESIDUAL_CALC_H