summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2019-04-02 10:25:08 +0200
committerAnton Khirnov <anton@khirnov.net>2019-04-02 11:01:58 +0200
commit57d4eec367a6e96323588f74acf5c48974383a45 (patch)
treead86c14d71f89e2281b912692b4a93acc6d03530
parentbcc67122331d63e38768e7c6c9633be4fc7bd09b (diff)
egs: optimize the correction step
-rw-r--r--ell_grid_solve.c23
-rw-r--r--meson.build1
-rw-r--r--ndarray.asm70
3 files changed, 94 insertions, 0 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index 9aee0e8..dfc48c6 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -29,6 +29,7 @@
#include "bicgstab.h"
#include "common.h"
+#include "cpu.h"
#include "ell_grid_solve.h"
#include "log.h"
#include "mg2d_boundary.h"
@@ -43,6 +44,7 @@ static const double relax_factors[FD_STENCIL_MAX] = {
};
typedef struct EGSRelaxInternal {
+ void (*line_add)(ptrdiff_t, double *, const double *, double);
double relax_factor;
} EGSRelaxInternal;
@@ -281,16 +283,31 @@ static void boundaries_apply(EGSContext *ctx, int init)
ctx->count_boundaries++;
}
+#if HAVE_EXTERNAL_ASM
+void mg2di_line_madd_fma3(ptrdiff_t linesize, double *dst, const double *src, double c);
+#endif
+
+static void line_madd_c(ptrdiff_t linesize, double *dst, const double *src, double c)
+{
+ for (ptrdiff_t i = 0; i < linesize; i++)
+ dst[i] += c * src[i];
+}
+
static int residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
{
+#if 1
EGSContext *ctx = arg;
EGSInternal *priv = ctx->priv;
+ priv->r.line_add(ctx->domain_size[0], ctx->u->data + ctx->u->stride[0] * job_idx,
+ ctx->residual->data + ctx->residual->stride[0] * job_idx, priv->r.relax_factor);
+#else
for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
ptrdiff_t idx = job_idx * ctx->u->stride[0] + idx0;
ctx->u->data[idx] += priv->r.relax_factor * ctx->residual->data[idx];
}
+#endif
return 0;
}
@@ -683,6 +700,12 @@ int mg2di_egs_init(EGSContext *ctx, int flags)
if (r->relax_multiplier > 0.0)
priv->r.relax_factor *= r->relax_multiplier;
+
+ priv->r.line_add = line_madd_c;
+#if HAVE_EXTERNAL_ASM
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3)
+ priv->r.line_add = mg2di_line_madd_fma3;
+#endif
}
priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00];
diff --git a/meson.build b/meson.build
index ce5e57b..6d3b356 100644
--- a/meson.build
+++ b/meson.build
@@ -82,6 +82,7 @@ if nasm.found()
nasm_sources = files(
'cpuid.asm',
+ 'ndarray.asm',
'residual_calc.asm',
'transfer_interp.asm',
)
diff --git a/ndarray.asm b/ndarray.asm
new file mode 100644
index 0000000..94a4ec9
--- /dev/null
+++ b/ndarray.asm
@@ -0,0 +1,70 @@
+;
+; SIMD for basic linear algebra
+; Copyright 2018 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;/
+
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .text
+
+; double precision
+%define ELEM_SIZE 8
+
+INIT_YMM fma3
+cglobal line_madd, 3, 3, 2, linesize, dst, src
+ shl linesizeq, 3
+ add dstq, linesizeq
+ add srcq, linesizeq
+ neg linesizeq
+
+ vpermq m0, m0, 0
+
+.loop:
+ movu m1, [dstq + linesizeq]
+ vfmadd231pd m1, m0, [srcq + linesizeq]
+
+ add linesizeq, mmsize
+ jg .store_partial
+
+ movu [dstq + linesizeq - mmsize], m1
+ js .loop
+ jmp .finish
+
+.store_partial:
+ sub linesizeq, ELEM_SIZE
+ jz .store3
+ sub linesizeq, ELEM_SIZE
+ jz .store2
+
+.store1:
+ ; linesizeq is now mmsize-2 after the write position
+ movq [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
+ jmp .finish
+.store2:
+ ; linesizeq is now mmsize-2 after the write position
+ movu [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
+ jmp .finish
+.store3:
+ ; linesizeq is now mmsize-1 after the write position
+ movu [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1
+ vextractf128 xm1, m1, 1
+ movq [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1
+
+.finish:
+
+ RET