From 86ad823b9ade211bfa9361b61571933aff1c9d24 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 23 Apr 2019 15:36:34 +0200 Subject: egs: merge residual calc and correct when possible Also, merge the reflect boundary condition into residual calc+add. Improves performance due to better locality. --- residual_calc.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 13 deletions(-) (limited to 'residual_calc.c') diff --git a/residual_calc.c b/residual_calc.c index bfbb9bf..3b83c63 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -44,6 +44,11 @@ typedef struct ResidualCalcTask { const double * const *diff_coeffs; ptrdiff_t diff_coeffs_stride; + + double u_mult; + double res_mult; + + int mirror_line; } ResidualCalcTask; struct ResidualCalcInternal { @@ -52,7 +57,12 @@ struct ResidualCalcInternal { void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]); + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult); + void (*residual_add_line)(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult); size_t calc_blocksize; ResidualCalcTask task; @@ -61,10 +71,20 @@ struct ResidualCalcInternal { #if HAVE_EXTERNAL_ASM void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]); + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult); +void mg2di_residual_add_line_s1_fma3(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult); void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]); + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult); +void mg2di_residual_add_line_s2_fma3(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult); #endif static void @@ -129,7 +149,8 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride) static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]) + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult) { double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { @@ -141,7 +162,31 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; - dst[i] = res; + dst[i] = res_mult * res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + +static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + derivatives_calc_s1(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); res_max = MAX(res_max, res_abs); @@ -152,7 +197,8 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB]) + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult) { double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { @@ -164,7 +210,31 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; - dst[i] = res; + dst[i] = res_mult * res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + +static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + derivatives_calc_s2(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); res_max = MAX(res_max, res_abs); @@ -179,15 +249,35 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre ResidualCalcTask *task = &priv->task; const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; + double *dst = task->dst + job_idx * task->dst_stride; for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride; - priv->residual_calc_line(task->line_size, task->dst + job_idx * task->dst_stride, - priv->residual_max + thread_idx * priv->calc_blocksize, - task->u_stride, task->u + job_idx * task->u_stride, - task->rhs + job_idx * task->rhs_stride, - diff_coeffs); + if (task->u_mult == 0.0) { + priv->residual_calc_line(task->line_size, dst, + priv->residual_max + thread_idx * priv->calc_blocksize, + task->u_stride, task->u + job_idx * task->u_stride, + task->rhs + job_idx * task->rhs_stride, + diff_coeffs, task->res_mult); + } else { + priv->residual_add_line(task->line_size, dst, + priv->residual_max + thread_idx * priv->calc_blocksize, + task->u_stride, task->u + job_idx * task->u_stride, + task->rhs + job_idx * task->rhs_stride, + diff_coeffs, task->res_mult, task->u_mult); + } + if (task->mirror_line & (1 << 0)) { + for (int i = 1; i <= FD_STENCIL_MAX; i++) + dst[-i] = dst[i]; + } + if (task->mirror_line & (1 << 1)) { + for (int i = 1; i <= FD_STENCIL_MAX; i++) + dst[task->line_size - 1 + i] = dst[task->line_size - 1 - i]; + } + if ((task->mirror_line & (1 << 2)) && job_idx > 0 && job_idx <= FD_STENCIL_MAX) { + memcpy(task->dst - job_idx * task->dst_stride, dst, sizeof(*dst) * task->line_size); + } return 0; } @@ -198,7 +288,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], const double *u, ptrdiff_t u_stride, const double *rhs, ptrdiff_t rhs_stride, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], - ptrdiff_t diff_coeffs_stride) + ptrdiff_t diff_coeffs_stride, + double u_mult, double res_mult, int mirror_line) { ResidualCalcInternal *priv = ctx->priv; ResidualCalcTask *task = &priv->task; @@ -215,6 +306,9 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], task->rhs_stride = rhs_stride; task->diff_coeffs = diff_coeffs; task->diff_coeffs_stride = diff_coeffs_stride; + task->u_mult = u_mult; + task->res_mult = res_mult; + task->mirror_line = mirror_line; tp_execute(ctx->tp, size[1], residual_calc_task, priv); @@ -234,18 +328,22 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) switch (ctx->fd_stencil) { case 1: priv->residual_calc_line = residual_calc_line_s1_c; + priv->residual_add_line = residual_add_line_s1_c; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3; + priv->residual_add_line = mg2di_residual_add_line_s1_fma3; priv->calc_blocksize = 4; } #endif break; case 2: priv->residual_calc_line = residual_calc_line_s2_c; + priv->residual_add_line = residual_add_line_s2_c; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3; + priv->residual_add_line = mg2di_residual_add_line_s2_fma3; priv->calc_blocksize = 4; } #endif -- cgit v1.2.3