From 57d4eec367a6e96323588f74acf5c48974383a45 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 2 Apr 2019 10:25:08 +0200 Subject: egs: optimize the correction step --- ell_grid_solve.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'ell_grid_solve.c') diff --git a/ell_grid_solve.c b/ell_grid_solve.c index 9aee0e8..dfc48c6 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -29,6 +29,7 @@ #include "bicgstab.h" #include "common.h" +#include "cpu.h" #include "ell_grid_solve.h" #include "log.h" #include "mg2d_boundary.h" @@ -43,6 +44,7 @@ static const double relax_factors[FD_STENCIL_MAX] = { }; typedef struct EGSRelaxInternal { + void (*line_add)(ptrdiff_t, double *, const double *, double); double relax_factor; } EGSRelaxInternal; @@ -281,16 +283,31 @@ static void boundaries_apply(EGSContext *ctx, int init) ctx->count_boundaries++; } +#if HAVE_EXTERNAL_ASM +void mg2di_line_madd_fma3(ptrdiff_t linesize, double *dst, const double *src, double c); +#endif + +static void line_madd_c(ptrdiff_t linesize, double *dst, const double *src, double c) +{ + for (ptrdiff_t i = 0; i < linesize; i++) + dst[i] += c * src[i]; +} + static int residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { +#if 1 EGSContext *ctx = arg; EGSInternal *priv = ctx->priv; + priv->r.line_add(ctx->domain_size[0], ctx->u->data + ctx->u->stride[0] * job_idx, + ctx->residual->data + ctx->residual->stride[0] * job_idx, priv->r.relax_factor); +#else for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { ptrdiff_t idx = job_idx * ctx->u->stride[0] + idx0; ctx->u->data[idx] += priv->r.relax_factor * ctx->residual->data[idx]; } +#endif return 0; } @@ -683,6 +700,12 @@ int mg2di_egs_init(EGSContext *ctx, int flags) if (r->relax_multiplier > 0.0) priv->r.relax_factor *= r->relax_multiplier; + + priv->r.line_add = line_madd_c; +#if HAVE_EXTERNAL_ASM + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) + priv->r.line_add = mg2di_line_madd_fma3; +#endif } priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]; -- cgit v1.2.3