diff options
-rw-r--r-- | ell_relax.c | 54 | ||||
-rw-r--r-- | ell_relax.h | 5 | ||||
-rw-r--r-- | mg2d.c | 18 | ||||
-rw-r--r-- | residual_calc.asm | 52 |
4 files changed, 91 insertions, 38 deletions
diff --git a/ell_relax.c b/ell_relax.c index a616e2e..57c4112 100644 --- a/ell_relax.c +++ b/ell_relax.c @@ -43,10 +43,14 @@ struct EllRelaxInternal { double *boundary_val_base[4]; - void (*residual_calc_line)(size_t linesize, double *dst, + double *residual_max; + size_t residual_max_size; + + void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors); + size_t calc_blocksize; ptrdiff_t residual_calc_offset; size_t residual_calc_size[2]; double fd_factors[ELL_RELAX_DIFF_COEFF_NB]; @@ -97,11 +101,11 @@ static const double fd_denoms[][ELL_RELAX_DIFF_COEFF_NB] = { }; #if HAVE_EXTERNAL_ASM -void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, +void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors); -void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, +void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors); @@ -167,11 +171,12 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[ELL_RELAX_DIFF_COEFF_11]; } -static void residual_calc_line_s1_c(size_t linesize, double *dst, +static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors) { + double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[ELL_RELAX_DIFF_COEFF_NB]; double res; @@ -182,14 +187,20 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); } + + *dst_max = MAX(*dst_max, res_max); } -static void residual_calc_line_s2_c(size_t linesize, double *dst, +static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB], const double *fd_factors) { + double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[ELL_RELAX_DIFF_COEFF_NB]; double res; @@ -200,7 +211,12 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); } + + *dst_max = MAX(*dst_max, res_max); } static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx) @@ -214,6 +230,7 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr diff_coeffs[i] = ctx->diff_coeffs[i] + offset; priv->residual_calc_line(priv->residual_calc_size[0], ctx->residual + offset, + priv->residual_max + thread_idx * priv->calc_blocksize, priv->stride, ctx->u + offset, ctx->rhs + offset, diff_coeffs, priv->fd_factors); } @@ -221,12 +238,19 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr static void residual_calc(EllRelaxContext *ctx) { EllRelaxInternal *priv = ctx->priv; + double res_max = 0.0; int64_t start; + memset(priv->residual_max, 0, sizeof(*priv->residual_max) * priv->residual_max_size); + start = gettime(); tp_execute(ctx->tp, priv->residual_calc_size[1], residual_calc_task, ctx); + for (size_t i = 0; i < priv->residual_max_size; i++) + res_max = MAX(res_max, priv->residual_max[i]); + ctx->residual_max = res_max; + ctx->time_res_calc += gettime() - start; ctx->count_res++; } @@ -357,21 +381,27 @@ int mg2di_ell_relax_step(EllRelaxContext *ctx) int mg2di_ell_relax_init(EllRelaxContext *ctx) { EllRelaxInternal *priv = ctx->priv; + double *tmp; int ret; + priv->calc_blocksize = 1; switch (ctx->fd_stencil) { case 1: priv->residual_calc_line = residual_calc_line_s1_c; #if HAVE_EXTERNAL_ASM - if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3; + priv->calc_blocksize = 4; + } #endif break; case 2: priv->residual_calc_line = residual_calc_line_s2_c; #if HAVE_EXTERNAL_ASM - if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3; + priv->calc_blocksize = 4; + } #endif break; default: @@ -426,6 +456,15 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx) } } + priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize; + tmp = realloc(priv->residual_max, + sizeof(*priv->residual_max) * priv->residual_max_size); + if (!tmp) { + priv->residual_max_size = 0; + return -ENOMEM; + } + priv->residual_max = tmp; + boundaries_apply(ctx); residual_calc(ctx); @@ -534,6 +573,7 @@ void mg2di_ell_relax_free(EllRelaxContext **pctx) free(ctx->priv->u_base); free(ctx->priv->rhs_base); free(ctx->priv->residual_base); + free(ctx->priv->residual_max); for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs_base); i++) free(ctx->priv->diff_coeffs_base[i]); for (int i = 0; i < ARRAY_ELEMS(ctx->priv->boundary_val_base); i++) diff --git a/ell_relax.h b/ell_relax.h index 9bee372..5f57a3b 100644 --- a/ell_relax.h +++ b/ell_relax.h @@ -235,6 +235,11 @@ typedef struct EllRelaxContext { ptrdiff_t residual_stride; /** + * Maximum of the absolute value of residual. + */ + double residual_max; + + /** * Coefficients C_{*} that define the differential equation. * * Allocated by the solver in mg2di_ell_relax_alloc(), owned by the solver. @@ -66,18 +66,6 @@ struct MG2DInternal { double *boundaries_base[4]; }; -static double findmax(double *arr, size_t size[2], ptrdiff_t stride) -{ - double ret = 0.0; - for (size_t y = 0; y < size[1]; y++) - for (size_t x = 0; x < size[0]; x++) { - double val = fabs(arr[y * stride + x]); - if (val > ret) - ret = val; - } - return ret; -} - static void log_callback(MG2DLogger *log, int level, const char *fmt, va_list vl) { MG2DContext *ctx = log->opaque; @@ -473,14 +461,12 @@ int mg2d_solve(MG2DContext *ctx) if (ret < 0) return ret; - res_prev = findmax(s_root->residual, s_root->domain_size, - s_root->residual_stride); + res_prev = s_root->residual_max; for (int i = 0; i < ctx->maxiter; i++) { mg_solve_subgrid(ctx, root); - res_cur = findmax(s_root->residual, s_root->domain_size, - s_root->residual_stride); + res_cur = s_root->residual_max; if (res_cur < ctx->tol) { mg2di_log(&priv->logger, MG2D_LOG_INFO, "converged on iteration %d, residual %g\n", diff --git a/residual_calc.asm b/residual_calc.asm index 47dda9b..95eb226 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -41,6 +41,8 @@ SECTION .text ; m0: accumulator for the residual ; m1-m5: splatted constant finite difference coefficients ; m6-m11: working registers +; m12: max(fabs(residual)) +; m13: mask for computing absolute values ; (s2 only) m14-m15: splatted constants 8.0, 30.0 ; calculate and add residual contributions from first and second derivatives @@ -140,7 +142,22 @@ SECTION .text ; %1: stencil %macro RESIDUAL_CALC 1 %define stencil %1 - %define u_downq fd_factorsq ; reuse the fd_factors registers after it is no longer needed + + ; load and splat the finite difference factors + movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] + vpermq m1, m0, 00000000b ; diff factor 01 -> m1 + vpermq m2, m0, 01010101b ; diff factor 10 -> m2 + vpermq m3, m0, 10101010b ; diff factor 11 -> m3 + vpermq m4, m0, 11111111b ; diff factor 02 -> m4 + movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] + vpermq m5, m0, 00000000b ; diff factor 20 -> m5 + %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed + + ; compute the mask for absolute value + pcmpeqq m13, m13 + psrlq m13, 1 + movu m12, [res_maxq] + ; load pointers to the equation coefficients %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] @@ -166,23 +183,15 @@ SECTION .text ; from now on, the register that held linesize is used as the offset into data arrays %define offsetq linesizeq - ; load and splat the finite difference factors - movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] - vpermq m1, m0, 00000000b ; diff factor 01 -> m1 - vpermq m2, m0, 01010101b ; diff factor 10 -> m2 - vpermq m3, m0, 10101010b ; diff factor 11 -> m3 - vpermq m4, m0, 11111111b ; diff factor 02 -> m4 - movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] - vpermq m5, m0, 00000000b ; diff factor 20 -> m5 - ; setup pointers to the line above and below lea u_upq, [uq + strideq] mov u_downq, uq sub u_downq, strideq %if stencil == 2 lea u_up2q, [uq + 2 * strideq] - mov u_down2q, u_downq - sub u_down2q, strideq + neg strideq + add strideq, u_downq + %define u_down2q strideq ; reuse the stride register for the u[y-2] line movu m15, [const30] movu m14, [const8] @@ -206,12 +215,15 @@ SECTION .text RES_ADD_DIFF_SINGLEDIR stencil, 1 RES_ADD_DIFF_MIXED stencil + andpd m6, m0, m13 ; m6 = abs(res) + ; store the result add offsetq, mmsize jg .store_partial ; store full block movu [dstq + offsetq - mmsize], m0 + maxpd m12, m6 js .loop jmp .finish @@ -224,10 +236,16 @@ SECTION .text .store1: ; offsetq is now mmsize-2 after the write position movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + + vpermq m6, m6, 0 + maxpd m12, m6 + jmp .finish .store2: ; offsetq is now mmsize-2 after the write position movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + mova xm6, xm6 + maxpd m12, m6 jmp .finish .store3: ; offsetq is now mmsize-1 after the write position @@ -235,16 +253,20 @@ SECTION .text vextractf128 xm0, m0, 1 movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0 + vpermq m6, m6, 10100100b + maxpd m12, m6 + .finish: + movu [res_maxq], m12 RET %endmacro INIT_YMM fma3 -cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ +cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up RESIDUAL_CALC 1 INIT_YMM fma3 -cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2 +cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2 RESIDUAL_CALC 2 |