diff options
-rw-r--r-- | ell_grid_solve.c | 8 | ||||
-rw-r--r-- | residual_calc.asm | 84 | ||||
-rw-r--r-- | residual_calc.c | 53 | ||||
-rw-r--r-- | residual_calc.h | 12 |
4 files changed, 80 insertions, 77 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c index ecd9d8f..1cf2798 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -160,17 +160,15 @@ static void boundaries_sync(EGSContext *ctx, NDArray *a_dst) static void residual_calc(EGSContext *ctx, int export_res) { EGSInternal *priv = ctx->priv; - const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; ptrdiff_t *offset = priv->residual_calc_offset[priv->steps_since_sync]; size_t *size = priv->residual_calc_size[priv->steps_since_sync]; + const double *diff_coeffs0 = NDA_PTR2D(priv->diff_coeffs[0], offset[0], offset[1]); + const double *diff_coeffs1 = NDA_PTR2D(priv->diff_coeffs[1], offset[0], offset[1]); NDArray *dst = export_res ? ctx->residual : priv->u_next; int reflect_flags = 0; mg2di_timer_start(&ctx->timer_res_calc); - for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = NDA_PTR2D(priv->diff_coeffs[i], offset[0], offset[1]); - for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++) if (ctx->boundaries[bnd_idx]->type == MG2D_BC_TYPE_REFLECT && priv->dg->components[priv->local_component].bnd_is_outer[bnd_idx]) @@ -180,7 +178,7 @@ static void residual_calc(EGSContext *ctx, int export_res) NDA_PTR2D(dst, offset[0], offset[1]), dst->stride[0], NDA_PTR2D(ctx->u, offset[0], offset[1]), ctx->u->stride[0], NDA_PTR2D(ctx->rhs, offset[0], offset[1]), ctx->rhs->stride[0], - diff_coeffs, priv->diff_coeffs[0]->stride[0], + diff_coeffs0, priv->diff_coeffs[0]->stride[0], diff_coeffs1 - diff_coeffs0, export_res ? 0.0 : 1.0, export_res ? 1.0 : priv->r.relax_factor, reflect_flags, FD_STENCIL_MAX); diff --git a/residual_calc.asm b/residual_calc.asm index 9cd530d..42eb50b 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -22,14 +22,6 @@ ; double precision %define ELEM_SIZE 8 -; offsets to FD coefficients for given derivative -%define OFF_DIFF_COEFF_00 0 * gprsize -%define OFF_DIFF_COEFF_01 1 * gprsize -%define OFF_DIFF_COEFF_10 2 * gprsize -%define OFF_DIFF_COEFF_11 3 * gprsize -%define OFF_DIFF_COEFF_02 4 * gprsize -%define OFF_DIFF_COEFF_20 5 * gprsize - SECTION .rodata const8: times 8 dq 8.0 @@ -65,15 +57,15 @@ SECTION .text %define up2q uq + ELEM_SIZE * 2 %define um1q uq - ELEM_SIZE %define um2q uq - ELEM_SIZE * 2 - %define coeffs1q diff_coeffs10q - %define coeffs2q diff_coeffs20q + %define coeffs1q diff_coeffsq + diff_coeff_offset_10 + %define coeffs2q diff_coeffsq + diff_coeff_offset_20 %else %define up1q u_upq %define up2q u_up2q %define um1q u_downq %define um2q u_down2q - %define coeffs1q diff_coeffs01q - %define coeffs2q diff_coeffs02q + %define coeffs1q diff_coeffsq + diff_coeff_offset_01 + %define coeffs2q diff_coeffsq + diff_coeff_offset_02 %endif ; load the function values @@ -91,7 +83,7 @@ SECTION .text subpd m11, m8 ; m11 -= u[x+2] addpd m11, m10 ; m11 += u[x-2] %endif - vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10 + vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10 ; second derivative addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1] @@ -102,7 +94,7 @@ SECTION .text subpd m11, m10 ; m11 -= u[x-2] %endif subpd m11, m6 ; m11 -= fd0 u[x] - vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20 + vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20 %endmacro ; calculate and add residual contributions from the second mixed derivative @@ -138,13 +130,30 @@ SECTION .text vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7 %endif - vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11 + vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11 %endmacro ; %1: stencil ; %2: 0 - calc; 1 - add %macro RESIDUAL_CALC 2 - %define stencil %1 + +%define stencil %1 + +%if %2 +%define opname add +%else +%define opname calc +%endif + +; typedef void ResidualLineCalc/Add( +; size_t linesize, double *dst, double *dst_max, +; ptrdiff_t u_stride, const double *u, const double *rhs, +; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, +; double res_mult, [double u_mult (add only)]) +cglobal residual_line_ %+ opname %+ _s %+ stencil, \ + 8, 13, 14 + stencil * 2, \ + linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \ + u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5 %if %2 vpermq m2, m1, 0 @@ -156,27 +165,13 @@ SECTION .text psrlq m13, 1 movu m12, [res_maxq] - ; load pointers to the equation coefficients - %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer - mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] - mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01] - mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10] - mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11] - mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02] - mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] - ; setup the data pointers and the loop counter shl u_strideq, 3 + shl diff_coeffs_offsetq, 3 shl linesizeq, 3 add dstq, linesizeq add uq, linesizeq add rhsq, linesizeq - add diff_coeffs00q, linesizeq - add diff_coeffs01q, linesizeq - add diff_coeffs10q, linesizeq - add diff_coeffs11q, linesizeq - add diff_coeffs02q, linesizeq - add diff_coeffs20q, linesizeq neg linesizeq ; from now on, the register that held linesize is used as the offset into data arrays %define offsetq linesizeq @@ -195,13 +190,26 @@ SECTION .text movu m14, [const8] %endif + ; offsets to FD coefficients for given derivative + %define diff_coeff_offset_01 1 * diff_coeffs_offsetq + %define diff_coeff_offset_10 2 * diff_coeffs_offsetq + + lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq] + %define diff_coeff_offset_11 diff_coeffs_off3q + + %define diff_coeff_offset_02 4 * diff_coeffs_offsetq + + lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq] + %define diff_coeff_offset_20 diff_coeffs_off5q + + .loop: xorpd m0, m0 subpd m0, [rhsq + offsetq] ; res = -rhs ; plain value movu m6, [uq + offsetq] ; m6 = u[x] - vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 + vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00 %if %2 mulpd m3, m6, m2 %endif @@ -223,6 +231,7 @@ SECTION .text %endif ; store the result + add diff_coeffsq, mmsize add offsetq, mmsize jg .store_partial @@ -267,18 +276,7 @@ SECTION .text %endmacro INIT_YMM fma3 -cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up RESIDUAL_CALC 1, 0 -cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up RESIDUAL_CALC 1, 1 - -INIT_YMM fma3 -cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2, 0 - -cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2, 1 diff --git a/residual_calc.c b/residual_calc.c index c06c966..948655e 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -33,11 +33,11 @@ typedef void ResidualLineCalc(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult); typedef void ResidualLineAdd (size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult); typedef struct ResidualCalcTask { @@ -52,8 +52,9 @@ typedef struct ResidualCalcTask { const double *rhs; ptrdiff_t rhs_stride; - const double * const *diff_coeffs; + const double *diff_coeffs; ptrdiff_t diff_coeffs_stride; + ptrdiff_t diff_coeffs_offset; double u_mult; double res_mult; @@ -74,10 +75,10 @@ struct ResidualCalcInternal { }; #if HAVE_NASM -ResidualLineCalc mg2di_residual_calc_line_s1_fma3; -ResidualLineCalc mg2di_residual_calc_line_s2_fma3; -ResidualLineAdd mg2di_residual_add_line_s1_fma3; -ResidualLineAdd mg2di_residual_add_line_s2_fma3; +ResidualLineCalc mg2di_residual_line_calc_s1_fma3; +ResidualLineCalc mg2di_residual_line_calc_s2_fma3; +ResidualLineAdd mg2di_residual_line_add_s1_fma3; +ResidualLineAdd mg2di_residual_line_add_s2_fma3; #endif static void @@ -142,7 +143,7 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride) static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult) { double res_max = 0.0, res_abs; @@ -154,7 +155,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = res_mult * res; res_abs = fabs(res); @@ -166,7 +167,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult) { double res_max = 0.0, res_abs; @@ -178,7 +179,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); @@ -190,7 +191,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult) { double res_max = 0.0, res_abs; @@ -202,7 +203,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = res_mult * res; res_abs = fabs(res); @@ -214,7 +215,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult) { double res_max = 0.0, res_abs; @@ -226,7 +227,7 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); @@ -241,24 +242,21 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre ResidualCalcInternal *priv = arg; ResidualCalcTask *task = &priv->task; - const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; + const double *diff_coeffs = task->diff_coeffs + job_idx * task->diff_coeffs_stride; double *dst = task->dst + job_idx * task->dst_stride; - for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride; - if (task->u_mult == 0.0) { priv->residual_line_calc(task->size[0], dst, priv->residual_max + thread_idx * priv->calc_blocksize, task->u_stride, task->u + job_idx * task->u_stride, task->rhs + job_idx * task->rhs_stride, - diff_coeffs, task->res_mult); + diff_coeffs, task->diff_coeffs_offset, task->res_mult); } else { priv->residual_line_add(task->size[0], dst, priv->residual_max + thread_idx * priv->calc_blocksize, task->u_stride, task->u + job_idx * task->u_stride, task->rhs + job_idx * task->rhs_stride, - diff_coeffs, task->res_mult, task->u_mult); + diff_coeffs, task->diff_coeffs_offset, task->res_mult, task->u_mult); } if (task->reflect & (1 << MG2D_BOUNDARY_0L)) { @@ -286,8 +284,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], double *dst, ptrdiff_t dst_stride, const double *u, ptrdiff_t u_stride, const double *rhs, ptrdiff_t rhs_stride, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], - ptrdiff_t diff_coeffs_stride, + const double *diff_coeffs, ptrdiff_t diff_coeffs_stride, + ptrdiff_t diff_coeffs_offset, double u_mult, double res_mult, int reflect, size_t reflect_dist) { @@ -307,6 +305,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], task->rhs_stride = rhs_stride; task->diff_coeffs = diff_coeffs; task->diff_coeffs_stride = diff_coeffs_stride; + task->diff_coeffs_offset = diff_coeffs_offset; task->u_mult = u_mult; task->res_mult = res_mult; task->reflect = reflect; @@ -333,8 +332,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = residual_add_line_s1_c; #if HAVE_NASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->residual_line_calc = mg2di_residual_calc_line_s1_fma3; - priv->residual_line_add = mg2di_residual_add_line_s1_fma3; + priv->residual_line_calc = mg2di_residual_line_calc_s1_fma3; + priv->residual_line_add = mg2di_residual_line_add_s1_fma3; priv->calc_blocksize = 4; } #endif @@ -344,8 +343,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = residual_add_line_s2_c; #if HAVE_NASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->residual_line_calc = mg2di_residual_calc_line_s2_fma3; - priv->residual_line_add = mg2di_residual_add_line_s2_fma3; + priv->residual_line_calc = mg2di_residual_line_calc_s2_fma3; + priv->residual_line_add = mg2di_residual_line_add_s2_fma3; priv->calc_blocksize = 4; } #endif diff --git a/residual_calc.h b/residual_calc.h index 1498bc9..01ad602 100644 --- a/residual_calc.h +++ b/residual_calc.h @@ -68,6 +68,14 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx); /** * Calculate the residual. * + * @param diff_coeffs Array containing all PDE coefficients. + * @param diff_coeffs_stride Distance between adjacent lines for a given PDE + * coefficient. + * @param diff_coeffs_offset Distance between blocks containing different PDE + * coefficients. I.e. diff_coeffs[i * diff_coeffs_offset + j * diff_coeffs_stride + k] + * is the value of the i-th PDE coefficient (MG2D_DIFF_COEFF_*) at the + * (j, k)th gridpoint. + * * @param reflect Flags indicating if dst should be extended by reflection. * If the 'MG2D_BOUNDARY_x'th bit is set in reflect, then dst should be * reflected reflect_dist points beyond its bounds in the corresponding @@ -79,8 +87,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], double *dst, ptrdiff_t dst_stride, const double *u, ptrdiff_t u_stride, const double *rhs, ptrdiff_t rhs_stride, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], - ptrdiff_t diff_coeffs_stride, + const double *diff_coeffs, ptrdiff_t diff_coeffs_stride, + ptrdiff_t diff_coeffs_offset, double u_mult, double res_mult, int reflect, size_t reflect_dist); |