summaryrefslogtreecommitdiff
path: root/ell_relax.c
diff options
context:
space:
mode:
Diffstat (limited to 'ell_relax.c')
-rw-r--r--ell_relax.c54
1 files changed, 47 insertions, 7 deletions
diff --git a/ell_relax.c b/ell_relax.c
index a616e2e..57c4112 100644
--- a/ell_relax.c
+++ b/ell_relax.c
@@ -43,10 +43,14 @@ struct EllRelaxInternal {
double *boundary_val_base[4];
- void (*residual_calc_line)(size_t linesize, double *dst,
+ double *residual_max;
+ size_t residual_max_size;
+
+ void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors);
+ size_t calc_blocksize;
ptrdiff_t residual_calc_offset;
size_t residual_calc_size[2];
double fd_factors[ELL_RELAX_DIFF_COEFF_NB];
@@ -97,11 +101,11 @@ static const double fd_denoms[][ELL_RELAX_DIFF_COEFF_NB] = {
};
#if HAVE_EXTERNAL_ASM
-void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst,
+void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors);
-void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst,
+void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors);
@@ -167,11 +171,12 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd
-1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[ELL_RELAX_DIFF_COEFF_11];
}
-static void residual_calc_line_s1_c(size_t linesize, double *dst,
+static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors)
{
+ double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[ELL_RELAX_DIFF_COEFF_NB];
double res;
@@ -182,14 +187,20 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst,
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
res += u_vals[j] * diff_coeffs[j][i];
dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
}
+
+ *dst_max = MAX(*dst_max, res_max);
}
-static void residual_calc_line_s2_c(size_t linesize, double *dst,
+static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[ELL_RELAX_DIFF_COEFF_NB],
const double *fd_factors)
{
+ double res_max = 0.0, res_abs;
for (size_t i = 0; i < linesize; i++) {
double u_vals[ELL_RELAX_DIFF_COEFF_NB];
double res;
@@ -200,7 +211,12 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst,
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
res += u_vals[j] * diff_coeffs[j][i];
dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
}
+
+ *dst_max = MAX(*dst_max, res_max);
}
static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
@@ -214,6 +230,7 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr
diff_coeffs[i] = ctx->diff_coeffs[i] + offset;
priv->residual_calc_line(priv->residual_calc_size[0], ctx->residual + offset,
+ priv->residual_max + thread_idx * priv->calc_blocksize,
priv->stride, ctx->u + offset, ctx->rhs + offset,
diff_coeffs, priv->fd_factors);
}
@@ -221,12 +238,19 @@ static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thr
static void residual_calc(EllRelaxContext *ctx)
{
EllRelaxInternal *priv = ctx->priv;
+ double res_max = 0.0;
int64_t start;
+ memset(priv->residual_max, 0, sizeof(*priv->residual_max) * priv->residual_max_size);
+
start = gettime();
tp_execute(ctx->tp, priv->residual_calc_size[1], residual_calc_task, ctx);
+ for (size_t i = 0; i < priv->residual_max_size; i++)
+ res_max = MAX(res_max, priv->residual_max[i]);
+ ctx->residual_max = res_max;
+
ctx->time_res_calc += gettime() - start;
ctx->count_res++;
}
@@ -357,21 +381,27 @@ int mg2di_ell_relax_step(EllRelaxContext *ctx)
int mg2di_ell_relax_init(EllRelaxContext *ctx)
{
EllRelaxInternal *priv = ctx->priv;
+ double *tmp;
int ret;
+ priv->calc_blocksize = 1;
switch (ctx->fd_stencil) {
case 1:
priv->residual_calc_line = residual_calc_line_s1_c;
#if HAVE_EXTERNAL_ASM
- if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3;
+ priv->calc_blocksize = 4;
+ }
#endif
break;
case 2:
priv->residual_calc_line = residual_calc_line_s2_c;
#if HAVE_EXTERNAL_ASM
- if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3;
+ priv->calc_blocksize = 4;
+ }
#endif
break;
default:
@@ -426,6 +456,15 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx)
}
}
+ priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize;
+ tmp = realloc(priv->residual_max,
+ sizeof(*priv->residual_max) * priv->residual_max_size);
+ if (!tmp) {
+ priv->residual_max_size = 0;
+ return -ENOMEM;
+ }
+ priv->residual_max = tmp;
+
boundaries_apply(ctx);
residual_calc(ctx);
@@ -534,6 +573,7 @@ void mg2di_ell_relax_free(EllRelaxContext **pctx)
free(ctx->priv->u_base);
free(ctx->priv->rhs_base);
free(ctx->priv->residual_base);
+ free(ctx->priv->residual_max);
for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs_base); i++)
free(ctx->priv->diff_coeffs_base[i]);
for (int i = 0; i < ARRAY_ELEMS(ctx->priv->boundary_val_base); i++)