summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2019-04-09 22:06:03 +0200
committer: Anton Khirnov <anton@khirnov.net> 2019-04-09 22:06:03 +0200
commit: 552bcf4c906522c3ef7695654052f61e12260049 (patch)
tree: 294b5c636a6c29f050d0a7112cc47508c3a1e14c
parent: 1e83fd63c30d433ee53769d4e4768feabf822ae2 (diff)
transfer: add AVX2 version of LAGRANGE_5 interpolation.
-rw-r--r-- transfer.c 9
-rw-r--r-- transfer_interp.asm 79
2 files changed, 88 insertions, 0 deletions
diff --git a/transfer.c b/transfer.c
index 311c6a9..db3cecb 100644
--- a/transfer.c
+++ b/transfer.c
@@ -61,6 +61,10 @@ void mg2di_transfer_interp_line_cont_4_fma3(double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
const ptrdiff_t *idx_x,
const double *fact_x, const double *fact_y);
+void mg2di_transfer_interp_line_cont_6_fma3(double *dst, ptrdiff_t dst_len, /* 6-point counterpart of the _cont_4_fma3 routine declared just above */
+ const double *src, ptrdiff_t src_stride, /* presumably implemented by "cglobal transfer_interp_line_cont_6" in transfer_interp.asm - confirm the symbol prefix */
+ const ptrdiff_t *idx_x, /* one entry per output element, same argument order as the 4-point variant */
+ const double *fact_x, const double *fact_y); /* installed as priv->transfer_cont when MG2DI_CPU_FLAG_FMA3 is set */
#endif
#define STENCIL 2
@@ -170,6 +174,11 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
priv->transfer_cont = interp_transfer_line_cont_6;
priv->transfer_generic = interp_transfer_line_generic_6;
priv->stencil = 6;
+#if HAVE_EXTERNAL_ASM
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
+ priv->transfer_cont = mg2di_transfer_interp_line_cont_6_fma3;
+ }
+#endif
break;
case GRID_TRANSFER_LAGRANGE_7:
priv->transfer_cont = interp_transfer_line_cont_8;
diff --git a/transfer_interp.asm b/transfer_interp.asm
index a6ae60f..1b1fe7d 100644
--- a/transfer_interp.asm
+++ b/transfer_interp.asm
@@ -72,3 +72,82 @@ cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx
js .loop
RET
+; 6x6-point interpolation of one output line: dst[i] = sum over r,c in 0..5 of fact_y[r] * fact_x[6*i + c] * src[idx_x[i] + r*src_stride + c]
+INIT_YMM fma3
+cglobal transfer_interp_line_cont_6, 7, 9, 12, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
+                                                idx_x_val, offset6 ; 12 vector regs declared: m0-m11 are all used below
+    shl src_strideq, 3                      ; source row stride: elements -> bytes
+    shl dst_lenq, 3                         ; line length: elements -> bytes
+    ; point dst, idx_x and fact_x at their ends, so that a single negative offset can count up to zero
+    add dstq, dst_lenq
+    add idx_xq, dst_lenq
+    lea fact_xq, [fact_xq + 4 * dst_lenq]   ; fact_x holds six doubles per output element,
+    lea fact_xq, [fact_xq + 2 * dst_lenq]   ; so it is advanced by 4 + 2 = 6 times the line size
+    neg dst_lenq
+    ; from now on, the register that held the line size is used as the offset into data arrays
+    %define offsetq dst_lenq
+    lea offset6q, [offsetq + 2 * offsetq]   ; 3 * offset ...
+    add offset6q, offset6q                  ; ... doubled: 6 * offset, the byte offset into fact_x
+    ; broadcast the six fact_y weights into m0..m5; NOTE(review): ymm vpermq is an AVX2 instruction while this routine is tagged fma3 - confirm the dispatch
+    movu m0, [fact_yq]
+    vpermq m1, m0, 01010101b                ; fact y + 1 -> m1
+    vpermq m2, m0, 10101010b                ; fact y + 2 -> m2
+    vpermq m3, m0, 11111111b                ; fact y + 3 -> m3
+    vpermq m0, m0, 00000000b                ; fact y + 0 -> m0
+    movu xm4, [fact_yq + 8 * 4]             ; fact y + 4 and + 5 in the low half
+    vpermq m5, m4, 01010101b                ; fact y + 5 -> m5
+    vpermq m4, m4, 0                        ; fact y + 4 -> m4
+    ; NOTE(review): the loop body runs once before its exit test, so dst_len is presumably always >= 1 - confirm in callers
+.loop:
+    mov idx_x_valq, [idx_xq + offsetq]      ; element offset into src of the first of the six columns
+    shl idx_x_valq, 3                       ; -> byte offset
+    ; fact_x weights of this output element: first four in m6, last two in xm7
+    movu m6, [fact_xq + offset6q]
+    movu xm7, [fact_xq + offset6q + mmsize]
+    ; source row 0: weight six columns by fact_x, then scale by fact_y[0]; m8/m9 become the accumulators
+    mulpd m8, m6, [srcq + idx_x_valq]
+    mulpd xm9, xm7, [srcq + idx_x_valq + mmsize]
+    mulpd m8, m0
+    mulpd m9, m0
+    ; source row 1: step one row down and accumulate with fact_y[1]
+    add idx_x_valq, src_strideq
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m1
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m1
+    ; source row 2, weighted by fact_y[2]
+    add idx_x_valq, src_strideq
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m2
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m2
+    ; source row 3, weighted by fact_y[3]
+    add idx_x_valq, src_strideq
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m3
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m3
+    ; source row 4, weighted by fact_y[4]
+    add idx_x_valq, src_strideq
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m4
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m4
+    ; source row 5, weighted by fact_y[5]
+    add idx_x_valq, src_strideq
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m5
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m5
+    ; horizontal sum of the six column accumulators: four lanes of m8 (a0..a3) plus the low two lanes of m9 (b0, b1)
+    haddpd m8, m9                           ; m8 = [a0+a1, b0+b1, a2+a3, b2+b3]
+    vpermq m9, m8, 10b                      ; bring a2+a3 down to lane 0 of m9
+    haddpd xm8, xm8                         ; lane 0 = (a0+a1) + (b0+b1)
+    addpd m8, m9                            ; lane 0 = sum of all six columns
+    ; store the low double as dst[i], then advance both offsets by one output element
+    movq [dstq + offsetq], xm8
+    add offsetq, 8
+    add offset6q, 8 * 6
+    js .loop                                ; offset6 still negative -> more elements to process
+
+    RET