aboutsummaryrefslogtreecommitdiff
path: root/ell_grid_solve.c
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-04-02 10:25:08 +0200
committer Anton Khirnov <anton@khirnov.net> 2019-04-02 11:01:58 +0200
commit 57d4eec367a6e96323588f74acf5c48974383a45 (patch)
tree ad86c14d71f89e2281b912692b4a93acc6d03530 /ell_grid_solve.c
parent bcc67122331d63e38768e7c6c9633be4fc7bd09b (diff)
egs: optimize the correction step
Diffstat (limited to 'ell_grid_solve.c')
-rw-r--r-- ell_grid_solve.c 23
1 file changed, 23 insertions, 0 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index 9aee0e8..dfc48c6 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -29,6 +29,7 @@
#include "bicgstab.h"
#include "common.h"
+#include "cpu.h"
#include "ell_grid_solve.h"
#include "log.h"
#include "mg2d_boundary.h"
@@ -43,6 +44,7 @@ static const double relax_factors[FD_STENCIL_MAX] = {
};
typedef struct EGSRelaxInternal {
+ void (*line_add)(ptrdiff_t, double *, const double *, double); /* per-line kernel invoked as line_add(linesize, dst, src, c); mg2di_egs_init() points it at line_madd_c (dst[i] += c * src[i]) or, when HAVE_EXTERNAL_ASM and the FMA3 cpu flag allow, at mg2di_line_madd_fma3 */
double relax_factor;
} EGSRelaxInternal;
@@ -281,16 +283,31 @@ static void boundaries_apply(EGSContext *ctx, int init)
ctx->count_boundaries++;
}
+#if HAVE_EXTERNAL_ASM
+void mg2di_line_madd_fma3(ptrdiff_t linesize, double *dst, const double *src, double c); /* declaration only, no definition in this file view; installed as line_add when MG2DI_CPU_FLAG_FMA3 is set. NOTE(review): presumably the same dst += c * src contract as line_madd_c; any alignment or length-multiple requirements are not visible here -- confirm */
+#endif /* HAVE_EXTERNAL_ASM */
+
+static void line_madd_c(ptrdiff_t linesize, double *dst, const double *src, double c) /* plain-C line kernel; the default value of the line_add hook */
+{
+ for (ptrdiff_t i = 0; i < linesize; i++) /* walks the first linesize elements; does nothing when linesize <= 0 */
+ dst[i] += c * src[i]; /* in-place scaled accumulate: dst += c * src */
+}
+
static int residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
{
+#if 1 /* hard-wired on: one job adds relax_factor * residual to u for the single line selected by job_idx, via the line_add hook */
EGSContext *ctx = arg;
EGSInternal *priv = ctx->priv;
+ priv->r.line_add(ctx->domain_size[0], ctx->u->data + ctx->u->stride[0] * job_idx, /* linesize = domain_size[0]; dst = start of line job_idx in u */
+ ctx->residual->data + ctx->residual->stride[0] * job_idx, priv->r.relax_factor); /* src = line job_idx in residual, offset with residual's own stride; c = relax_factor */
+#else /* inactive pre-patch scalar loop. NOTE(review): ctx and priv are declared inside the #if 1 branch, so this branch would not compile if re-enabled as is; it also indexes residual with u's stride, so it matches the line_add path only if both strides are equal -- confirm */
for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
ptrdiff_t idx = job_idx * ctx->u->stride[0] + idx0;
ctx->u->data[idx] += priv->r.relax_factor * ctx->residual->data[idx];
}
+#endif /* 1 */
return 0;
}
@@ -683,6 +700,12 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
if (r->relax_multiplier > 0.0)
priv->r.relax_factor *= r->relax_multiplier;
+
+ priv->r.line_add = line_madd_c;
+#if HAVE_EXTERNAL_ASM
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+ priv->r.line_add = mg2di_line_madd_fma3;
+#endif
}
priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00];