aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ell_relax.c54
-rw-r--r--ell_relax.h5
-rw-r--r--mg2d.c18
-rw-r--r--residual_calc.asm52
4 files changed, 91 insertions, 38 deletions
diff --git a/ell_relax.c b/ell_relax.c
index a616e2e..57c4112 100644
--- a/ell_relax.c
+++ b/ell_relax.c
@@ -43,10 +43,14 @@ struct EllRelaxInternal {
double *boundary_val_base[4];
- void (*residual_calc_line)(size_t linesize, double *dst,
+ double *residual_max;
+ size_t residual_max_size;
+
+ void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors);
+ size_t calc_blocksize;
ptrdiff_t residual_calc_offset;
size_t residual_calc_size[2];
double fd_factors[ELL_RELAX_DIFF_COEFF_NB];
@@ -97,11 +101,11 @@ static const double fd_denoms[][ELL_RELAX_DIFF_COEFF_NB] = {
};
#if HAVE_EXTERNAL_ASM
-void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst,
+void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors);
-void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst,
+void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors);
@@ -167,11 +171,12 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd
-1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[ELL_RELAX_DIFF_COEFF_11];
}
-static void residual_calc_line_s1_c(size_t linesize, double *dst,
+static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors)
{
+ double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[ELL_RELAX_DIFF_COEFF_NB];
double res;
@@ -182,14 +187,20 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst,
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
res += u_vals[j] * diff_coeffs[j][i];
dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
}
+
+ *dst_max = MAX(*dst_max, res_max);
}
-static void residual_calc_line_s2_c(size_t linesize, double *dst,
+static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors)
{
+ double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[ELL_RELAX_DIFF_COEFF_NB];
double res;
@@ -200,7 +211,12 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst,
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
res += u_vals[j] * diff_coeffs[j][i];
dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
}
+
+ *dst_max = MAX(*dst_max, res_max);
}
static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
@@ -214,6 +230,7 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr
diff_coeffs[i] = ctx->diff_coeffs[i] + offset;
priv->residual_calc_line(priv->residual_calc_size[0], ctx->residual + offset,
+ priv->residual_max + thread_idx * priv->calc_blocksize,
priv->stride, ctx->u + offset, ctx->rhs + offset,
diff_coeffs, priv->fd_factors);
}
@@ -221,12 +238,19 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr
static void residual_calc(EllRelaxContext *ctx)
{
EllRelaxInternal *priv = ctx->priv;
+ double res_max = 0.0;
int64_t start;
+ memset(priv->residual_max, 0, sizeof(*priv->residual_max) * priv->residual_max_size);
+
start = gettime();
tp_execute(ctx->tp, priv->residual_calc_size[1], residual_calc_task, ctx);
+ for (size_t i = 0; i < priv->residual_max_size; i++)
+ res_max = MAX(res_max, priv->residual_max[i]);
+ ctx->residual_max = res_max;
+
ctx->time_res_calc += gettime() - start;
ctx->count_res++;
}
@@ -357,21 +381,27 @@ int mg2di_ell_relax_step(EllRelaxContext *ctx)
int mg2di_ell_relax_init(EllRelaxContext *ctx)
{
EllRelaxInternal *priv = ctx->priv;
+ double *tmp;
int ret;
+ priv->calc_blocksize = 1;
switch (ctx->fd_stencil) {
case 1:
priv->residual_calc_line = residual_calc_line_s1_c;
#if HAVE_EXTERNAL_ASM
- if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3;
+ priv->calc_blocksize = 4;
+ }
#endif
break;
case 2:
priv->residual_calc_line = residual_calc_line_s2_c;
#if HAVE_EXTERNAL_ASM
- if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3;
+ priv->calc_blocksize = 4;
+ }
#endif
break;
default:
@@ -426,6 +456,15 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx)
}
}
+ priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize;
+ tmp = realloc(priv->residual_max,
+ sizeof(*priv->residual_max) * priv->residual_max_size);
+ if (!tmp) {
+ priv->residual_max_size = 0;
+ return -ENOMEM;
+ }
+ priv->residual_max = tmp;
+
boundaries_apply(ctx);
residual_calc(ctx);
@@ -534,6 +573,7 @@ void mg2di_ell_relax_free(EllRelaxContext **pctx)
free(ctx->priv->u_base);
free(ctx->priv->rhs_base);
free(ctx->priv->residual_base);
+ free(ctx->priv->residual_max);
for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs_base); i++)
free(ctx->priv->diff_coeffs_base[i]);
for (int i = 0; i < ARRAY_ELEMS(ctx->priv->boundary_val_base); i++)
diff --git a/ell_relax.h b/ell_relax.h
index 9bee372..5f57a3b 100644
--- a/ell_relax.h
+++ b/ell_relax.h
@@ -235,6 +235,11 @@ typedef struct EllRelaxContext {
ptrdiff_t residual_stride;
/**
+ * Maximum of the absolute value of residual.
+ */
+ double residual_max;
+
+ /**
* Coefficients C_{*} that define the differential equation.
*
* Allocated by the solver in mg2di_ell_relax_alloc(), owned by the solver.
diff --git a/mg2d.c b/mg2d.c
index f64e0ae..55b8ed9 100644
--- a/mg2d.c
+++ b/mg2d.c
@@ -66,18 +66,6 @@ struct MG2DInternal {
double *boundaries_base[4];
};
-static double findmax(double *arr, size_t size[2], ptrdiff_t stride)
-{
- double ret = 0.0;
- for (size_t y = 0; y < size[1]; y++)
- for (size_t x = 0; x < size[0]; x++) {
- double val = fabs(arr[y * stride + x]);
- if (val > ret)
- ret = val;
- }
- return ret;
-}
-
static void log_callback(MG2DLogger *log, int level, const char *fmt, va_list vl)
{
MG2DContext *ctx = log->opaque;
@@ -473,14 +461,12 @@ int mg2d_solve(MG2DContext *ctx)
if (ret < 0)
return ret;
- res_prev = findmax(s_root->residual, s_root->domain_size,
- s_root->residual_stride);
+ res_prev = s_root->residual_max;
for (int i = 0; i < ctx->maxiter; i++) {
mg_solve_subgrid(ctx, root);
- res_cur = findmax(s_root->residual, s_root->domain_size,
- s_root->residual_stride);
+ res_cur = s_root->residual_max;
if (res_cur < ctx->tol) {
mg2di_log(&priv->logger, MG2D_LOG_INFO, "converged on iteration %d, residual %g\n",
diff --git a/residual_calc.asm b/residual_calc.asm
index 47dda9b..95eb226 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -41,6 +41,8 @@ SECTION .text
; m0: accumulator for the residual
; m1-m5: splatted constant finite difference coefficients
; m6-m11: working registers
+; m12: max(fabs(residual))
+; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0
; calculate and add residual contributions from first and second derivatives
@@ -140,7 +142,22 @@ SECTION .text
; %1: stencil
%macro RESIDUAL_CALC 1
%define stencil %1
- %define u_downq fd_factorsq ; reuse the fd_factors registers after it is no longer needed
+
+ ; load and splat the finite difference factors
+ movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+ vpermq m1, m0, 00000000b ; diff factor 01 -> m1
+ vpermq m2, m0, 01010101b ; diff factor 10 -> m2
+ vpermq m3, m0, 10101010b ; diff factor 11 -> m3
+ vpermq m4, m0, 11111111b ; diff factor 02 -> m4
+ movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+ vpermq m5, m0, 00000000b ; diff factor 20 -> m5
+ %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
+
+ ; compute the mask for absolute value
+ pcmpeqq m13, m13
+ psrlq m13, 1
+ movu m12, [res_maxq]
+
; load pointers to the equation coefficients
%define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
@@ -166,23 +183,15 @@ SECTION .text
; from now on, the register that held linesize is used as the offset into data arrays
%define offsetq linesizeq
- ; load and splat the finite difference factors
- movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m1, m0, 00000000b ; diff factor 01 -> m1
- vpermq m2, m0, 01010101b ; diff factor 10 -> m2
- vpermq m3, m0, 10101010b ; diff factor 11 -> m3
- vpermq m4, m0, 11111111b ; diff factor 02 -> m4
- movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m5, m0, 00000000b ; diff factor 20 -> m5
-
; setup pointers to the line above and below
lea u_upq, [uq + strideq]
mov u_downq, uq
sub u_downq, strideq
%if stencil == 2
lea u_up2q, [uq + 2 * strideq]
- mov u_down2q, u_downq
- sub u_down2q, strideq
+ neg strideq
+ add strideq, u_downq
+ %define u_down2q strideq ; reuse the stride register for the u[y-2] line
movu m15, [const30]
movu m14, [const8]
@@ -206,12 +215,15 @@ SECTION .text
RES_ADD_DIFF_SINGLEDIR stencil, 1
RES_ADD_DIFF_MIXED stencil
+ andpd m6, m0, m13 ; m6 = abs(res)
+
; store the result
add offsetq, mmsize
jg .store_partial
; store full block
movu [dstq + offsetq - mmsize], m0
+ maxpd m12, m6
js .loop
jmp .finish
@@ -224,10 +236,16 @@ SECTION .text
.store1:
; offsetq is now mmsize-2 after the write position
movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+
+ vpermq m6, m6, 0
+ maxpd m12, m6
+
jmp .finish
.store2:
; offsetq is now mmsize-2 after the write position
movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+ mova xm6, xm6
+ maxpd m12, m6
jmp .finish
.store3:
; offsetq is now mmsize-1 after the write position
@@ -235,16 +253,20 @@ SECTION .text
vextractf128 xm0, m0, 1
movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0
+ vpermq m6, m6, 10100100b
+ maxpd m12, m6
+
.finish:
+ movu [res_maxq], m12
RET
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
RESIDUAL_CALC 1
INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2
+cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2
RESIDUAL_CALC 2