From 52581fdefd8257249768deb837548e9c71eac57e Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 8 Apr 2019 10:42:15 +0200 Subject: x86: add a misc utility header --- residual_calc.asm | 1 + transfer.c | 12 ++++++------ transfer_interp.asm | 25 +++++++++++++------------ util.asm | 40 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 18 deletions(-) create mode 100644 util.asm diff --git a/residual_calc.asm b/residual_calc.asm index 6970fd5..0a85e1d 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -18,6 +18,7 @@ %include "config.asm" %include "x86inc.asm" +%include "util.asm" ; double precision %define ELEM_SIZE 8 diff --git a/transfer.c b/transfer.c index 945470e..bfd0027 100644 --- a/transfer.c +++ b/transfer.c @@ -62,11 +62,11 @@ typedef struct GridTransferLagrange { } GridTransferLagrange; #if HAVE_NASM -void mg2di_transfer_interp2d_line_cont_4_fma3(double *dst, ptrdiff_t dst_len, +void mg2di_transfer_interp2d_line_cont_4_avx2(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y); -void mg2di_transfer_interp2d_line_cont_6_fma3(double *dst, ptrdiff_t dst_len, +void mg2di_transfer_interp2d_line_cont_6_avx2(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y); @@ -177,8 +177,8 @@ static int transfer_lagrange_init(GridTransferContext *ctx) priv->stencil = 4; #if HAVE_NASM - if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_4_fma3; + if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { + priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_4_avx2; } #endif break; @@ -189,8 +189,8 @@ static int transfer_lagrange_init(GridTransferContext *ctx) priv->transfer2d_generic = interp2d_transfer_line_generic_6; priv->stencil = 6; #if HAVE_NASM - if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->transfer2d_cont = 
mg2di_transfer_interp2d_line_cont_6_fma3; + if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { + priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_6_avx2; } #endif break; diff --git a/transfer_interp.asm b/transfer_interp.asm index e711b8f..d02e188 100644 --- a/transfer_interp.asm +++ b/transfer_interp.asm @@ -18,10 +18,11 @@ %include "config.asm" %include "x86inc.asm" +%include "util.asm" SECTION .text -INIT_YMM fma3 +INIT_YMM avx2 cglobal transfer_interp2d_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\ idx_x_val shl src_strideq, 3 @@ -35,10 +36,10 @@ cglobal transfer_interp2d_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, i %define offsetq dst_lenq movu m0, [fact_yq] - vpermq m1, m0, 01010101b ; fact y + 1 -> m1 - vpermq m2, m0, 10101010b ; fact y + 2 -> m2 - vpermq m3, m0, 11111111b ; fact y + 3 -> m3 - vpermq m0, m0, 00000000b ; fact y + 0 -> m0 + SPLATPD m1, m0, 1 ; fact y + 1 -> m1 + SPLATPD m2, m0, 2 ; fact y + 2 -> m2 + SPLATPD m3, m0, 3 ; fact y + 3 -> m3 + SPLATPD m0, m0, 0 ; fact y + 0 -> m0 .loop: mov idx_x_valq, [idx_xq + offsetq] @@ -73,7 +74,7 @@ cglobal transfer_interp2d_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, i RET -INIT_YMM fma3 +INIT_YMM avx2 cglobal transfer_interp2d_line_cont_6, 7, 9, 11, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\ idx_x_val, offset6 shl src_strideq, 3 @@ -90,13 +91,13 @@ cglobal transfer_interp2d_line_cont_6, 7, 9, 11, dst, dst_len, src, src_stride, add offset6q, offset6q movu m0, [fact_yq] - vpermq m1, m0, 01010101b ; fact y + 1 -> m1 - vpermq m2, m0, 10101010b ; fact y + 2 -> m2 - vpermq m3, m0, 11111111b ; fact y + 3 -> m3 - vpermq m0, m0, 00000000b ; fact y + 0 -> m0 + SPLATPD m1, m0, 1 ; fact y + 1 -> m1 + SPLATPD m2, m0, 2 ; fact y + 2 -> m2 + SPLATPD m3, m0, 3 ; fact y + 3 -> m3 + SPLATPD m0, m0, 0 ; fact y + 0 -> m0 movu xm4, [fact_yq + 8 * 4] - vpermq m5, m4, 01010101b - vpermq m4, m4, 0 + SPLATPD m5, m4, 1 + SPLATPD m4, m4, 0 .loop: mov idx_x_valq, [idx_xq + 
offsetq]
diff --git a/util.asm b/util.asm
new file mode 100644
index 0000000..846ea3b
--- /dev/null
+++ b/util.asm
@@ -0,0 +1,40 @@
+;
+; Various ASM utility macros
+; Copyright 2019 Anton Khirnov
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+%define _IMM8SPLAT2B(x) (((x) << 6) | ((x) << 4) | ((x) << 2) | (x))
+%define _IMM4SPLAT1B(x) (((x) << 3) | ((x) << 2) | ((x) << 1) | (x))
+
+; splat packed double - copy the specified double into all positions in the
+; destination (only 32-byte registers with AVX or AVX2 are handled)
+
+; %1 destination register
+; %2 source register (may be same as dst)
+; %3 index of the element in the source to splat (0-3)
+%macro SPLATPD 3
+    %if %3 > 3
+        %error Invalid selector %3
+    %endif
+
+    %if mmsize == 32 && cpuflag(avx2)
+        vpermpd %1, %2, _IMM8SPLAT2B(%3) ; single cross-lane permute
+    %elif mmsize == 32 && cpuflag(avx)
+        shufpd %1, %2, %2, _IMM4SPLAT1B(%3 & 1) ; splat within each 128-bit lane (index bit 0)
+        vperm2f128 %1, %1, %1, ((%3 & 2) >> 1) * 0x11 ; copy the lane chosen by index bit 1 to both halves
+    %else
+        %error %? not supported with cpuname
+    %endif
+%endmacro
--
cgit v1.2.3