From 57d4eec367a6e96323588f74acf5c48974383a45 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 2 Apr 2019 10:25:08 +0200 Subject: egs: optimize the correction step --- ell_grid_solve.c | 23 +++++++++++++++++++ meson.build | 1 + ndarray.asm | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 ndarray.asm diff --git a/ell_grid_solve.c b/ell_grid_solve.c index 9aee0e8..dfc48c6 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -29,6 +29,7 @@ #include "bicgstab.h" #include "common.h" +#include "cpu.h" #include "ell_grid_solve.h" #include "log.h" #include "mg2d_boundary.h" @@ -43,6 +44,7 @@ static const double relax_factors[FD_STENCIL_MAX] = { }; typedef struct EGSRelaxInternal { + void (*line_add)(ptrdiff_t, double *, const double *, double); double relax_factor; } EGSRelaxInternal; @@ -281,16 +283,31 @@ static void boundaries_apply(EGSContext *ctx, int init) ctx->count_boundaries++; } +#if HAVE_EXTERNAL_ASM +void mg2di_line_madd_fma3(ptrdiff_t linesize, double *dst, const double *src, double c); +#endif + +static void line_madd_c(ptrdiff_t linesize, double *dst, const double *src, double c) +{ + for (ptrdiff_t i = 0; i < linesize; i++) + dst[i] += c * src[i]; +} + static int residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { +#if 1 EGSContext *ctx = arg; EGSInternal *priv = ctx->priv; + priv->r.line_add(ctx->domain_size[0], ctx->u->data + ctx->u->stride[0] * job_idx, + ctx->residual->data + ctx->residual->stride[0] * job_idx, priv->r.relax_factor); +#else for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { ptrdiff_t idx = job_idx * ctx->u->stride[0] + idx0; ctx->u->data[idx] += priv->r.relax_factor * ctx->residual->data[idx]; } +#endif return 0; } @@ -683,6 +700,12 @@ int mg2di_egs_init(EGSContext *ctx, int flags) if (r->relax_multiplier > 0.0) priv->r.relax_factor *= r->relax_multiplier; + + priv->r.line_add = line_madd_c; +#if HAVE_EXTERNAL_ASM + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) + priv->r.line_add = mg2di_line_madd_fma3; +#endif } priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]; diff --git a/meson.build b/meson.build index ce5e57b..6d3b356 100644 --- a/meson.build +++ b/meson.build @@ -82,6 +82,7 @@ if nasm.found() nasm_sources = files( 'cpuid.asm', + 'ndarray.asm', 'residual_calc.asm', 'transfer_interp.asm', ) diff --git a/ndarray.asm b/ndarray.asm new file mode 100644 index 0000000..94a4ec9 --- /dev/null +++ b/ndarray.asm @@ -0,0 +1,70 @@ +; +; SIMD for basic linear algebra +; Copyright 2018 Anton Khirnov +; +; This program is free software: you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation, either version 3 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program. If not, see . +;/ + + +%include "config.asm" +%include "x86inc.asm" + +SECTION .text + +; double precision +%define ELEM_SIZE 8 + +INIT_YMM fma3 +cglobal line_madd, 3, 3, 2, linesize, dst, src + shl linesizeq, 3 + add dstq, linesizeq + add srcq, linesizeq + neg linesizeq + + vpermq m0, m0, 0 + +.loop: + movu m1, [dstq + linesizeq] + vfmadd231pd m1, m0, [srcq + linesizeq] + + add linesizeq, mmsize + jg .store_partial + + movu [dstq + linesizeq - mmsize], m1 + js .loop + jmp .finish + +.store_partial: + sub linesizeq, ELEM_SIZE + jz .store3 + sub linesizeq, ELEM_SIZE + jz .store2 + +.store1: + ; linesizeq is now mmsize-2 after the write position + movq [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1 + jmp .finish +.store2: + ; linesizeq is now mmsize-2 after the write position + movu [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1 + jmp .finish +.store3: + ; linesizeq is now mmsize-1 after the write position + movu [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1 + vextractf128 xm1, m1, 1 + movq [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1 + +.finish: + + RET -- cgit v1.2.3