From 552bcf4c906522c3ef7695654052f61e12260049 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Tue, 9 Apr 2019 22:06:03 +0200
Subject: transfer: add AVX2 version of LAGRANGE_5 interpolation.

---
 transfer.c          |  9 ++++++
 transfer_interp.asm | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)

diff --git a/transfer.c b/transfer.c
index 311c6a9..db3cecb 100644
--- a/transfer.c
+++ b/transfer.c
@@ -61,6 +61,10 @@ void mg2di_transfer_interp_line_cont_4_fma3(double *dst, ptrdiff_t dst_len,
                                             const double *src, ptrdiff_t src_stride,
                                             const ptrdiff_t *idx_x,
                                             const double *fact_x, const double *fact_y);
+void mg2di_transfer_interp_line_cont_6_fma3(double *dst, ptrdiff_t dst_len,
+                                            const double *src, ptrdiff_t src_stride,
+                                            const ptrdiff_t *idx_x,
+                                            const double *fact_x, const double *fact_y);
 #endif
 
 #define STENCIL 2
@@ -170,6 +174,11 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
         priv->transfer_cont    = interp_transfer_line_cont_6;
         priv->transfer_generic = interp_transfer_line_generic_6;
         priv->stencil          = 6;
+#if HAVE_EXTERNAL_ASM
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
+            priv->transfer_cont = mg2di_transfer_interp_line_cont_6_fma3;
+        }
+#endif
         break;
     case GRID_TRANSFER_LAGRANGE_7:
         priv->transfer_cont    = interp_transfer_line_cont_8;
diff --git a/transfer_interp.asm b/transfer_interp.asm
index a6ae60f..1b1fe7d 100644
--- a/transfer_interp.asm
+++ b/transfer_interp.asm
@@ -72,3 +72,92 @@ cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx
     js .loop
 
     RET
+
+; Interpolate one contiguous destination line with a 6x6 stencil.
+; For every destination element i:
+;   dst[i] = sum_{j=0..5} fact_y[j] * sum_{k=0..5} fact_x[6 * i + k] * src[idx_x[i] + j * src_stride + k]
+; dst_len and src_stride are passed in elements and converted to bytes on entry.
+; The loop body executes before the count is tested, so it runs once even for dst_len == 0.
+; NOTE(review): vpermq on ymm registers is an AVX2 instruction, while this
+; function is declared and dispatched under the fma3 flag -- confirm that the
+; FMA3 cpuflag implies AVX2 support on every CPU this is selected for.
+INIT_YMM fma3
+; m0-m11 are all used, so 12 vector registers must be declared; x86inc uses
+; this count to decide which callee-saved xmm registers to preserve on WIN64.
+cglobal transfer_interp_line_cont_6, 7, 9, 12, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
+                                               idx_x_val, offset6
+    shl src_strideq, 3                          ; row stride: elements -> bytes (8-byte doubles)
+    shl dst_lenq, 3                             ; line length: elements -> bytes
+
+    add dstq, dst_lenq                          ; point dst past its end; indexed with a negative offset
+    add idx_xq, dst_lenq                        ; same for idx_x (8-byte entries)
+    lea fact_xq, [fact_xq + 4 * dst_lenq]       ; fact_x has 6 entries per output element:
+    lea fact_xq, [fact_xq + 2 * dst_lenq]       ;   advance by 4 + 2 = 6 times the line size
+    neg dst_lenq
+    ; from now on, the register that held the line size is used as the offset into data arrays
+    %define offsetq dst_lenq
+    lea offset6q, [offsetq + 2 * offsetq]       ; offset6 = 3 * offset
+    add offset6q, offset6q                      ; offset6 = 6 * offset, used to index fact_x
+
+    movu m0, [fact_yq]
+    vpermq m1, m0, 01010101b                    ; fact y + 1 -> m1
+    vpermq m2, m0, 10101010b                    ; fact y + 2 -> m2
+    vpermq m3, m0, 11111111b                    ; fact y + 3 -> m3
+    vpermq m0, m0, 00000000b                    ; fact y + 0 -> m0
+    movu xm4, [fact_yq + 8 * 4]                 ; load fact y + 4 and fact y + 5
+    vpermq m5, m4, 01010101b                    ; fact y + 5 -> m5
+    vpermq m4, m4, 0                            ; fact y + 4 -> m4
+
+.loop:
+    mov idx_x_valq, [idx_xq + offsetq]          ; source column index for this output element
+    shl idx_x_valq, 3                           ; index -> byte offset
+
+    movu m6, [fact_xq + offset6q]               ; fact x + 0..3 for this output element
+    movu xm7, [fact_xq + offset6q + mmsize]     ; fact x + 4..5 for this output element
+
+    mulpd m8, m6, [srcq + idx_x_valq]           ; row 0, columns 0..3
+    mulpd xm9, xm7, [srcq + idx_x_valq + mmsize] ; row 0, columns 4..5
+    mulpd m8, m0                                ; weight row 0 by fact y + 0
+    mulpd m9, m0
+
+    add idx_x_valq, src_strideq                 ; row 1
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m1
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m1
+
+    add idx_x_valq, src_strideq                 ; row 2
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m2
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m2
+
+    add idx_x_valq, src_strideq                 ; row 3
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m3
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m3
+
+    add idx_x_valq, src_strideq                 ; row 4
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m4
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m4
+
+    add idx_x_valq, src_strideq                 ; row 5
+    mulpd m10, m6, [srcq + idx_x_valq]
+    vfmadd231pd m8, m10, m5
+    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
+    vfmadd231pd m9, m11, m5
+
+    haddpd m8, m9                               ; pairwise sums within each 128-bit lane
+    vpermq m9, m8, 10b                          ; bring the upper-lane partial sum (qword 2) to qword 0
+    haddpd xm8, xm8                             ; finish summing the lower lane
+    addpd m8, m9                                ; low qword now holds the sum of all six columns
+
+    movq [dstq + offsetq], xm8
+    add offsetq, 8
+    add offset6q, 8 * 6
+    js .loop                                    ; flags come from offset6, which has the same sign as offset
+
+    RET
-- 
cgit v1.2.3