diff options
Diffstat (limited to 'ell_relax.c')
-rw-r--r-- | ell_relax.c | 54 |
1 files changed, 47 insertions, 7 deletions
diff --git a/ell_relax.c b/ell_relax.c index a616e2e..57c4112 100644 --- a/ell_relax.c +++ b/ell_relax.c @@ -43,10 +43,14 @@ struct EllRelaxInternal { double *boundary_val_base[4]; - void (*residual_calc_line)(size_t linesize, double *dst, + double *residual_max; + size_t residual_max_size; + + void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors); + size_t calc_blocksize; ptrdiff_t residual_calc_offset; size_t residual_calc_size[2]; double fd_factors[ELL_RELAX_DIFF_COEFF_NB]; @@ -97,11 +101,11 @@ static const double fd_denoms[][ELL_RELAX_DIFF_COEFF_NB] = { }; #if HAVE_EXTERNAL_ASM -void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, +void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors); -void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, +void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors); @@ -167,11 +171,12 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[ELL_RELAX_DIFF_COEFF_11]; } -static void residual_calc_line_s1_c(size_t linesize, double *dst, +static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors) { + double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[ELL_RELAX_DIFF_COEFF_NB]; double res; @@ -182,14 +187,20 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); } + + *dst_max = MAX(*dst_max, res_max); } -static void residual_calc_line_s2_c(size_t linesize, double *dst, +static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors) { + double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[ELL_RELAX_DIFF_COEFF_NB]; double res; @@ -200,7 +211,12 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); } + + *dst_max = MAX(*dst_max, res_max); } static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx) @@ -214,6 +230,7 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr diff_coeffs[i] = ctx->diff_coeffs[i] + offset; priv->residual_calc_line(priv->residual_calc_size[0], ctx->residual + offset, + priv->residual_max + thread_idx * priv->calc_blocksize, priv->stride, ctx->u + offset, ctx->rhs + offset, diff_coeffs, priv->fd_factors); } @@ -221,12 +238,19 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr static void residual_calc(EllRelaxContext *ctx) { EllRelaxInternal *priv = ctx->priv; + double res_max = 0.0; int64_t start; + memset(priv->residual_max, 0, sizeof(*priv->residual_max) * priv->residual_max_size); + start = gettime(); tp_execute(ctx->tp, priv->residual_calc_size[1], residual_calc_task, ctx); + for (size_t i = 0; i < priv->residual_max_size; i++) + res_max = MAX(res_max, priv->residual_max[i]); + ctx->residual_max = res_max; + ctx->time_res_calc += gettime() - start; ctx->count_res++; } @@ -357,21 +381,27 @@ int mg2di_ell_relax_step(EllRelaxContext *ctx) int mg2di_ell_relax_init(EllRelaxContext *ctx) { EllRelaxInternal *priv = ctx->priv; + double *tmp; int ret; + priv->calc_blocksize = 1; switch (ctx->fd_stencil) { case 1: priv->residual_calc_line = residual_calc_line_s1_c; #if HAVE_EXTERNAL_ASM - if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3; + priv->calc_blocksize = 4; + } #endif break; case 2: priv->residual_calc_line = residual_calc_line_s2_c; #if HAVE_EXTERNAL_ASM - if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3; + priv->calc_blocksize = 4; + } #endif break; default: @@ -426,6 +456,15 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx) } } + priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize; + tmp = realloc(priv->residual_max, + sizeof(*priv->residual_max) * priv->residual_max_size); + if (!tmp) { + priv->residual_max_size = 0; + return -ENOMEM; + } + priv->residual_max = tmp; + boundaries_apply(ctx); residual_calc(ctx); @@ -534,6 +573,7 @@ void mg2di_ell_relax_free(EllRelaxContext **pctx) free(ctx->priv->u_base); free(ctx->priv->rhs_base); free(ctx->priv->residual_base); + free(ctx->priv->residual_max); for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs_base); i++) free(ctx->priv->diff_coeffs_base[i]); for (int i = 0; i < ARRAY_ELEMS(ctx->priv->boundary_val_base); i++) |