aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-04-08 10:42:15 +0200
committer Anton Khirnov <anton@khirnov.net> 2024-04-16 14:48:47 +0200
commit 52581fdefd8257249768deb837548e9c71eac57e (patch)
tree 61375e8a7b53bbea8b49f707e2c5fa5b09742529
parent a2df7298cd7e2dba47cb7274b8b7d983265e7279 (diff)
x86: add a misc utility header
-rw-r--r-- residual_calc.asm 1
-rw-r--r-- transfer.c 12
-rw-r--r-- transfer_interp.asm 25
-rw-r--r-- util.asm 40
4 files changed, 60 insertions, 18 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 6970fd5..0a85e1d 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -18,6 +18,7 @@
%include "config.asm"
%include "x86inc.asm"
+%include "util.asm"
; double precision
%define ELEM_SIZE 8
diff --git a/transfer.c b/transfer.c
index 945470e..bfd0027 100644
--- a/transfer.c
+++ b/transfer.c
@@ -62,11 +62,11 @@ typedef struct GridTransferLagrange {
} GridTransferLagrange;
#if HAVE_NASM
-void mg2di_transfer_interp2d_line_cont_4_fma3(double *dst, ptrdiff_t dst_len,
+void mg2di_transfer_interp2d_line_cont_4_avx2(double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
const ptrdiff_t *idx_x,
const double *fact_x, const double *fact_y);
-void mg2di_transfer_interp2d_line_cont_6_fma3(double *dst, ptrdiff_t dst_len,
+void mg2di_transfer_interp2d_line_cont_6_avx2(double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
const ptrdiff_t *idx_x,
const double *fact_x, const double *fact_y);
@@ -177,8 +177,8 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
priv->stencil = 4;
#if HAVE_NASM
- if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_4_fma3;
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
+ priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_4_avx2;
}
#endif
break;
@@ -189,8 +189,8 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
priv->transfer2d_generic = interp2d_transfer_line_generic_6;
priv->stencil = 6;
#if HAVE_NASM
- if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_6_fma3;
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
+ priv->transfer2d_cont = mg2di_transfer_interp2d_line_cont_6_avx2;
}
#endif
break;
diff --git a/transfer_interp.asm b/transfer_interp.asm
index e711b8f..d02e188 100644
--- a/transfer_interp.asm
+++ b/transfer_interp.asm
@@ -18,10 +18,11 @@
%include "config.asm"
%include "x86inc.asm"
+%include "util.asm"
SECTION .text
-INIT_YMM fma3
+INIT_YMM avx2
cglobal transfer_interp2d_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
idx_x_val
shl src_strideq, 3
@@ -35,10 +36,10 @@ cglobal transfer_interp2d_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, i
%define offsetq dst_lenq
movu m0, [fact_yq]
- vpermq m1, m0, 01010101b ; fact y + 1 -> m1
- vpermq m2, m0, 10101010b ; fact y + 2 -> m2
- vpermq m3, m0, 11111111b ; fact y + 3 -> m3
- vpermq m0, m0, 00000000b ; fact y + 0 -> m0
+ SPLATPD m1, m0, 1 ; fact y + 1 -> m1
+ SPLATPD m2, m0, 2 ; fact y + 2 -> m2
+ SPLATPD m3, m0, 3 ; fact y + 3 -> m3
+ SPLATPD m0, m0, 0 ; fact y + 0 -> m0
.loop:
mov idx_x_valq, [idx_xq + offsetq]
@@ -73,7 +74,7 @@ cglobal transfer_interp2d_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, i
RET
-INIT_YMM fma3
+INIT_YMM avx2
cglobal transfer_interp2d_line_cont_6, 7, 9, 11, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
idx_x_val, offset6
shl src_strideq, 3
@@ -90,13 +91,13 @@ cglobal transfer_interp2d_line_cont_6, 7, 9, 11, dst, dst_len, src, src_stride,
add offset6q, offset6q
movu m0, [fact_yq]
- vpermq m1, m0, 01010101b ; fact y + 1 -> m1
- vpermq m2, m0, 10101010b ; fact y + 2 -> m2
- vpermq m3, m0, 11111111b ; fact y + 3 -> m3
- vpermq m0, m0, 00000000b ; fact y + 0 -> m0
+ SPLATPD m1, m0, 1 ; fact y + 1 -> m1
+ SPLATPD m2, m0, 2 ; fact y + 2 -> m2
+ SPLATPD m3, m0, 3 ; fact y + 3 -> m3
+ SPLATPD m0, m0, 0 ; fact y + 0 -> m0
movu xm4, [fact_yq + 8 * 4]
- vpermq m5, m4, 01010101b
- vpermq m4, m4, 0
+ SPLATPD m5, m4, 1
+ SPLATPD m4, m4, 0
.loop:
mov idx_x_valq, [idx_xq + offsetq]
diff --git a/util.asm b/util.asm
new file mode 100644
index 0000000..846ea3b
--- /dev/null
+++ b/util.asm
@@ -0,0 +1,40 @@
+;
+; Various ASM utility macros
+; Copyright 2019 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+; Replicate x into each of the four 2-bit fields of an 8-bit immediate
+; (x is expected to fit in 2 bits).
+%define _IMM8SPLAT2B(x) ((x) | ((x) << 2) | ((x) << 4) | ((x) << 6))
+; Replicate x into each of the low four bits of an immediate
+; (x is expected to be 0 or 1).
+%define _IMM4SPLAT1B(x) ((x) | ((x) << 1) | ((x) << 2) | ((x) << 3))
+
+; SPLATPD - splat packed double: copy the specified double-precision element
+; of the source register into all positions of the destination register.
+;
+; %1 destination register
+; %2 source register (may be same as dst)
+; %3 index of the element in the source to splat; must be an assembly-time
+;    constant in the range 0-3
+;
+; Only 256-bit registers (mmsize == 32) with AVX or AVX2 selected are handled;
+; any other configuration is rejected with an assembly-time error.
+%macro SPLATPD 3
+    %if (%3) < 0 || (%3) > 3
+        %error Invalid selector %3
+    %endif
+
+    %if mmsize == 32 && cpuflag(avx2)
+        ; one cross-lane permute; every destination element uses selector %3
+        vpermpd     %1, %2, _IMM8SPLAT2B(%3)
+    %elif mmsize == 32 && cpuflag(avx)
+        ; step 1: within each 128-bit lane, duplicate element (%3 & 1)
+        shufpd      %1, %2, %2, _IMM4SPLAT1B((%3) & 1)
+        ; step 2: copy the lane chosen by bit 1 of %3 into both lanes
+        ; (imm 0x00 = low lane twice, 0x11 = high lane twice); the instruction
+        ; takes two source operands, both of which are %1 here
+        vperm2f128  %1, %1, %1, (((%3) & 2) >> 1) * 0x11
+    %else
+        %error %? not supported with cpuname
+    %endif
+%endmacro