From 57d4eec367a6e96323588f74acf5c48974383a45 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Tue, 2 Apr 2019 10:25:08 +0200
Subject: egs: optimize the correction step

---
 ell_grid_solve.c | 23 +++++++++++++++++++
 meson.build      |  1 +
 ndarray.asm      | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 ndarray.asm

diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index 9aee0e8..dfc48c6 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -29,6 +29,7 @@
 
 #include "bicgstab.h"
 #include "common.h"
+#include "cpu.h"
 #include "ell_grid_solve.h"
 #include "log.h"
 #include "mg2d_boundary.h"
@@ -43,6 +44,7 @@ static const double relax_factors[FD_STENCIL_MAX] = {
 };
 
 typedef struct EGSRelaxInternal {
+    void (*line_add)(ptrdiff_t, double *, const double *, double);
     double relax_factor;
 } EGSRelaxInternal;
 
@@ -281,16 +283,31 @@ static void boundaries_apply(EGSContext *ctx, int init)
     ctx->count_boundaries++;
 }
 
+#if HAVE_EXTERNAL_ASM
+void mg2di_line_madd_fma3(ptrdiff_t linesize, double *dst, const double *src, double c);
+#endif
+
+static void line_madd_c(ptrdiff_t linesize, double *dst, const double *src, double c)
+{
+    for (ptrdiff_t i = 0; i < linesize; i++)
+        dst[i] += c * src[i];
+}
+
 static int residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
 {
+#if 1
     EGSContext   *ctx = arg;
     EGSInternal *priv = ctx->priv;
 
+    priv->r.line_add(ctx->domain_size[0], ctx->u->data + ctx->u->stride[0] * job_idx,
+                     ctx->residual->data + ctx->residual->stride[0] * job_idx, priv->r.relax_factor);
+#else
     for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
         ptrdiff_t idx = job_idx * ctx->u->stride[0] + idx0;
 
         ctx->u->data[idx] += priv->r.relax_factor * ctx->residual->data[idx];
     }
+#endif
 
     return 0;
 }
@@ -683,6 +700,12 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
 
         if (r->relax_multiplier > 0.0)
             priv->r.relax_factor *= r->relax_multiplier;
+
+        priv->r.line_add = line_madd_c;
+#if HAVE_EXTERNAL_ASM
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+            priv->r.line_add = mg2di_line_madd_fma3;
+#endif
     }
 
     priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 /  fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00];
diff --git a/meson.build b/meson.build
index ce5e57b..6d3b356 100644
--- a/meson.build
+++ b/meson.build
@@ -82,6 +82,7 @@ if nasm.found()
 
     nasm_sources = files(
         'cpuid.asm',
+        'ndarray.asm',
         'residual_calc.asm',
         'transfer_interp.asm',
     )
diff --git a/ndarray.asm b/ndarray.asm
new file mode 100644
index 0000000..94a4ec9
--- /dev/null
+++ b/ndarray.asm
@@ -0,0 +1,70 @@
+;
+; SIMD for basic linear algebra
+; Copyright 2018 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+;/
+
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .text
+
+; double precision
+%define ELEM_SIZE 8
+
+INIT_YMM fma3
+cglobal line_madd, 3, 3, 2, linesize, dst, src
+    shl linesizeq, 3
+    add dstq, linesizeq
+    add srcq, linesizeq
+    neg linesizeq
+
+    vpermq m0, m0, 0
+
+.loop:
+    movu m1, [dstq + linesizeq]
+    vfmadd231pd m1, m0, [srcq + linesizeq]
+
+    add linesizeq, mmsize
+    jg .store_partial
+
+    movu [dstq + linesizeq - mmsize], m1
+    js .loop
+    jmp .finish
+
+.store_partial:
+    sub linesizeq, ELEM_SIZE
+    jz .store3
+    sub linesizeq, ELEM_SIZE
+    jz .store2
+
+.store1:
+    ; linesizeq is now mmsize-2 after the write position
+    movq [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
+    jmp .finish
+.store2:
+    ; linesizeq is now mmsize-2 after the write position
+    movu [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
+    jmp .finish
+.store3:
+    ; linesizeq is now mmsize-1 after the write position
+    movu [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1
+    vextractf128 xm1, m1, 1
+    movq [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1
+
+.finish:
+
+    RET
-- 
cgit v1.2.3