From e30cfde7614be7062249954eab6c3f56eeabbb51 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 15 Apr 2024 21:44:14 +0200 Subject: residual_calc: accept all diff coefficients in a single array Plus an offset parameter that signals the distance between different coefficients. This allows to avoid passing so many pointers around, which reduces register pressure and simplifies writing SIMD. Seems also to be a little faster. --- ell_grid_solve.c | 8 ++---- residual_calc.asm | 84 +++++++++++++++++++++++++++---------------------------- residual_calc.c | 53 +++++++++++++++++------------------ residual_calc.h | 12 ++++++-- 4 files changed, 80 insertions(+), 77 deletions(-) diff --git a/ell_grid_solve.c b/ell_grid_solve.c index ecd9d8f..1cf2798 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -160,17 +160,15 @@ static void boundaries_sync(EGSContext *ctx, NDArray *a_dst) static void residual_calc(EGSContext *ctx, int export_res) { EGSInternal *priv = ctx->priv; - const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; ptrdiff_t *offset = priv->residual_calc_offset[priv->steps_since_sync]; size_t *size = priv->residual_calc_size[priv->steps_since_sync]; + const double *diff_coeffs0 = NDA_PTR2D(priv->diff_coeffs[0], offset[0], offset[1]); + const double *diff_coeffs1 = NDA_PTR2D(priv->diff_coeffs[1], offset[0], offset[1]); NDArray *dst = export_res ? ctx->residual : priv->u_next; int reflect_flags = 0; mg2di_timer_start(&ctx->timer_res_calc); - for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = NDA_PTR2D(priv->diff_coeffs[i], offset[0], offset[1]); - for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++) if (ctx->boundaries[bnd_idx]->type == MG2D_BC_TYPE_REFLECT && priv->dg->components[priv->local_component].bnd_is_outer[bnd_idx]) @@ -180,7 +178,7 @@ static void residual_calc(EGSContext *ctx, int export_res) NDA_PTR2D(dst, offset[0], offset[1]), dst->stride[0], NDA_PTR2D(ctx->u, offset[0], offset[1]), ctx->u->stride[0], NDA_PTR2D(ctx->rhs, offset[0], offset[1]), ctx->rhs->stride[0], - diff_coeffs, priv->diff_coeffs[0]->stride[0], + diff_coeffs0, priv->diff_coeffs[0]->stride[0], diff_coeffs1 - diff_coeffs0, export_res ? 0.0 : 1.0, export_res ? 1.0 : priv->r.relax_factor, reflect_flags, FD_STENCIL_MAX); diff --git a/residual_calc.asm b/residual_calc.asm index 9cd530d..42eb50b 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -22,14 +22,6 @@ ; double precision %define ELEM_SIZE 8 -; offsets to FD coefficients for given derivative -%define OFF_DIFF_COEFF_00 0 * gprsize -%define OFF_DIFF_COEFF_01 1 * gprsize -%define OFF_DIFF_COEFF_10 2 * gprsize -%define OFF_DIFF_COEFF_11 3 * gprsize -%define OFF_DIFF_COEFF_02 4 * gprsize -%define OFF_DIFF_COEFF_20 5 * gprsize - SECTION .rodata const8: times 8 dq 8.0 @@ -65,15 +57,15 @@ SECTION .text %define up2q uq + ELEM_SIZE * 2 %define um1q uq - ELEM_SIZE %define um2q uq - ELEM_SIZE * 2 - %define coeffs1q diff_coeffs10q - %define coeffs2q diff_coeffs20q + %define coeffs1q diff_coeffsq + diff_coeff_offset_10 + %define coeffs2q diff_coeffsq + diff_coeff_offset_20 %else %define up1q u_upq %define up2q u_up2q %define um1q u_downq %define um2q u_down2q - %define coeffs1q diff_coeffs01q - %define coeffs2q diff_coeffs02q + %define coeffs1q diff_coeffsq + diff_coeff_offset_01 + %define coeffs2q diff_coeffsq + diff_coeff_offset_02 %endif ; load the function values @@ -91,7 +83,7 @@ SECTION .text subpd m11, m8 ; m11 -= u[x+2] addpd m11, m10 ; m11 += u[x-2] %endif - vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10 + vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10 ; second derivative addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1] @@ -102,7 +94,7 @@ SECTION .text subpd m11, m10 ; m11 -= u[x-2] %endif subpd m11, m6 ; m11 -= fd0 u[x] - vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20 + vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20 %endmacro ; calculate and add residual contributions from the second mixed derivative @@ -138,13 +130,30 @@ SECTION .text vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7 %endif - vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11 + vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11 %endmacro ; %1: stencil ; %2: 0 - calc; 1 - add %macro RESIDUAL_CALC 2 - %define stencil %1 + +%define stencil %1 + +%if %2 +%define opname add +%else +%define opname calc +%endif + +; typedef void ResidualLineCalc/Add( +; size_t linesize, double *dst, double *dst_max, +; ptrdiff_t u_stride, const double *u, const double *rhs, +; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, +; double res_mult, [double u_mult (add only)]) +cglobal residual_line_ %+ opname %+ _s %+ stencil, \ + 8, 13, 14 + stencil * 2, \ + linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \ + u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5 %if %2 vpermq m2, m1, 0 @@ -156,27 +165,13 @@ SECTION .text psrlq m13, 1 movu m12, [res_maxq] - ; load pointers to the equation coefficients - %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer - mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] - mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01] - mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10] - mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11] - mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02] - mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] - ; setup the data pointers and the loop counter shl u_strideq, 3 + shl diff_coeffs_offsetq, 3 shl linesizeq, 3 add dstq, linesizeq add uq, linesizeq add rhsq, linesizeq - add diff_coeffs00q, linesizeq - add diff_coeffs01q, linesizeq - add diff_coeffs10q, linesizeq - add diff_coeffs11q, linesizeq - add diff_coeffs02q, linesizeq - add diff_coeffs20q, linesizeq neg linesizeq ; from now on, the register that held linesize is used as the offset into data arrays %define offsetq linesizeq @@ -195,13 +190,26 @@ SECTION .text movu m14, [const8] %endif + ; offsets to FD coefficients for given derivative + %define diff_coeff_offset_01 1 * diff_coeffs_offsetq + %define diff_coeff_offset_10 2 * diff_coeffs_offsetq + + lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq] + %define diff_coeff_offset_11 diff_coeffs_off3q + + %define diff_coeff_offset_02 4 * diff_coeffs_offsetq + + lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq] + %define diff_coeff_offset_20 diff_coeffs_off5q + + .loop: xorpd m0, m0 subpd m0, [rhsq + offsetq] ; res = -rhs ; plain value movu m6, [uq + offsetq] ; m6 = u[x] - vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 + vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00 %if %2 mulpd m3, m6, m2 %endif @@ -223,6 +231,7 @@ SECTION .text %endif ; store the result + add diff_coeffsq, mmsize add offsetq, mmsize jg .store_partial @@ -267,18 +276,7 @@ SECTION .text %endmacro INIT_YMM fma3 -cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up RESIDUAL_CALC 1, 0 -cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up RESIDUAL_CALC 1, 1 - -INIT_YMM fma3 -cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2, 0 - -cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2, 1 diff --git a/residual_calc.c b/residual_calc.c index c06c966..948655e 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -33,11 +33,11 @@ typedef void ResidualLineCalc(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult); typedef void ResidualLineAdd (size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult); typedef struct ResidualCalcTask { @@ -52,8 +52,9 @@ typedef struct ResidualCalcTask { const double *rhs; ptrdiff_t rhs_stride; - const double * const *diff_coeffs; + const double *diff_coeffs; ptrdiff_t diff_coeffs_stride; + ptrdiff_t diff_coeffs_offset; double u_mult; double res_mult; @@ -74,10 +75,10 @@ struct ResidualCalcInternal { }; #if HAVE_NASM -ResidualLineCalc mg2di_residual_calc_line_s1_fma3; -ResidualLineCalc mg2di_residual_calc_line_s2_fma3; -ResidualLineAdd mg2di_residual_add_line_s1_fma3; -ResidualLineAdd mg2di_residual_add_line_s2_fma3; +ResidualLineCalc mg2di_residual_line_calc_s1_fma3; +ResidualLineCalc mg2di_residual_line_calc_s2_fma3; +ResidualLineAdd mg2di_residual_line_add_s1_fma3; +ResidualLineAdd mg2di_residual_line_add_s2_fma3; #endif static void @@ -142,7 +143,7 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride) static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult) { double res_max = 0.0, res_abs; @@ -154,7 +155,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = res_mult * res; res_abs = fabs(res); @@ -166,7 +167,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult) { double res_max = 0.0, res_abs; @@ -178,7 +179,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); @@ -190,7 +191,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult) { double res_max = 0.0, res_abs; @@ -202,7 +203,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = res_mult * res; res_abs = fabs(res); @@ -214,7 +215,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult) { double res_max = 0.0, res_abs; @@ -226,7 +227,7 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); @@ -241,24 +242,21 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre ResidualCalcInternal *priv = arg; ResidualCalcTask *task = &priv->task; - const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; + const double *diff_coeffs = task->diff_coeffs + job_idx * task->diff_coeffs_stride; double *dst = task->dst + job_idx * task->dst_stride; - for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride; - if (task->u_mult == 0.0) { priv->residual_line_calc(task->size[0], dst, priv->residual_max + thread_idx * priv->calc_blocksize, task->u_stride, task->u + job_idx * task->u_stride, task->rhs + job_idx * task->rhs_stride, - diff_coeffs, task->res_mult); + diff_coeffs, task->diff_coeffs_offset, task->res_mult); } else { priv->residual_line_add(task->size[0], dst, priv->residual_max + thread_idx * priv->calc_blocksize, task->u_stride, task->u + job_idx * task->u_stride, task->rhs + job_idx * task->rhs_stride, - diff_coeffs, task->res_mult, task->u_mult); + diff_coeffs, task->diff_coeffs_offset, task->res_mult, task->u_mult); } if (task->reflect & (1 << MG2D_BOUNDARY_0L)) { @@ -286,8 +284,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], double *dst, ptrdiff_t dst_stride, const double *u, ptrdiff_t u_stride, const double *rhs, ptrdiff_t rhs_stride, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], - ptrdiff_t diff_coeffs_stride, + const double *diff_coeffs, ptrdiff_t diff_coeffs_stride, + ptrdiff_t diff_coeffs_offset, double u_mult, double res_mult, int reflect, size_t reflect_dist) { @@ -307,6 +305,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], task->rhs_stride = rhs_stride; task->diff_coeffs = diff_coeffs; task->diff_coeffs_stride = diff_coeffs_stride; + task->diff_coeffs_offset = diff_coeffs_offset; task->u_mult = u_mult; task->res_mult = res_mult; task->reflect = reflect; @@ -333,8 +332,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = residual_add_line_s1_c; #if HAVE_NASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->residual_line_calc = mg2di_residual_calc_line_s1_fma3; - priv->residual_line_add = mg2di_residual_add_line_s1_fma3; + priv->residual_line_calc = mg2di_residual_line_calc_s1_fma3; + priv->residual_line_add = mg2di_residual_line_add_s1_fma3; priv->calc_blocksize = 4; } #endif @@ -344,8 +343,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = residual_add_line_s2_c; #if HAVE_NASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->residual_line_calc = mg2di_residual_calc_line_s2_fma3; - priv->residual_line_add = mg2di_residual_add_line_s2_fma3; + priv->residual_line_calc = mg2di_residual_line_calc_s2_fma3; + priv->residual_line_add = mg2di_residual_line_add_s2_fma3; priv->calc_blocksize = 4; } #endif diff --git a/residual_calc.h b/residual_calc.h index 1498bc9..01ad602 100644 --- a/residual_calc.h +++ b/residual_calc.h @@ -68,6 +68,14 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx); /** * Calculate the residual. * + * @param diff_coeffs Array containing all PDE coefficients. + * @param diff_coeffs_stride Distance between adjacent lines for a given PDE + * coefficient. + * @param diff_coeffs_offset Distance between blocks containing different PDE + * coefficients. I.e. diff_coeffs[i * diff_coeffs_offset + j * diff_coeffs_stride + k] + * is the value of the i-th PDE coefficient (MG2D_DIFF_COEFF_*) at the + * (j, k)th gridpoint. + * * @param reflect Flags indicating if dst should be extended by reflection. * If the 'MG2D_BOUNDARY_x'th bit is set in reflect, then dst should be * reflected reflect_dist points beyond its bounds in the corresponding @@ -79,8 +87,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], double *dst, ptrdiff_t dst_stride, const double *u, ptrdiff_t u_stride, const double *rhs, ptrdiff_t rhs_stride, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], - ptrdiff_t diff_coeffs_stride, + const double *diff_coeffs, ptrdiff_t diff_coeffs_stride, + ptrdiff_t diff_coeffs_offset, double u_mult, double res_mult, int reflect, size_t reflect_dist); -- cgit v1.2.3