From 264d8ce5e39676582f2e6a65cf517924846070b9 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 8 Apr 2019 10:42:15 +0200
Subject: x86: add a misc utility header

---
 residual_calc.asm   | 11 ++++++-----
 transfer.c          |  6 +++---
 transfer_interp.asm | 11 ++++++-----
 util.asm            | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 13 deletions(-)
 create mode 100644 util.asm

diff --git a/residual_calc.asm b/residual_calc.asm
index e5b0268..77d6dc8 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -17,6 +17,7 @@
 ;/
 
 %include "config.asm"
+%include "util.asm"
 %include "x86inc.asm"
 
 ; double precision
@@ -149,12 +150,12 @@ SECTION .text
 
     ; load and splat the finite difference factors
     movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
-    vpermq  m1, m0, 00000000b                           ; diff factor 01 -> m1
-    vpermq  m2, m0, 01010101b                           ; diff factor 10 -> m2
-    vpermq  m3, m0, 10101010b                           ; diff factor 11 -> m3
-    vpermq  m4, m0, 11111111b                           ; diff factor 02 -> m4
+    SPLATPD m1, m0, 0                           ; diff factor 01 -> m1
+    SPLATPD m2, m0, 1                           ; diff factor 10 -> m2
+    SPLATPD m3, m0, 2                           ; diff factor 11 -> m3
+    SPLATPD m4, m0, 3                           ; diff factor 02 -> m4
     movq   xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
-    vpermq  m5, m0, 00000000b                           ; diff factor 20 -> m5
+    SPLATPD m5, m0, 0                           ; diff factor 20 -> m5
     %define u_downq fd_factorsq    ; reuse the fd_factors register after it is no longer needed
 
     ; compute the mask for absolute value
diff --git a/transfer.c b/transfer.c
index 232d6e1..98051de 100644
--- a/transfer.c
+++ b/transfer.c
@@ -57,7 +57,7 @@ typedef struct GridTransferLagrange {
 } GridTransferLagrange;
 
 #if HAVE_EXTERNAL_ASM
-void mg2di_transfer_interp_line_cont_4_fma3(double *dst, ptrdiff_t dst_len,
+void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len,
                                             const double *src, ptrdiff_t src_stride,
                                             const ptrdiff_t *idx_x,
                                             const double *fact_x, const double *fact_y);
@@ -141,8 +141,8 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
         priv->stencil = 4;
 
 #if HAVE_EXTERNAL_ASM
-        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
-            priv->transfer_cont    = mg2di_transfer_interp_line_cont_4_fma3;
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
+            priv->transfer_cont    = mg2di_transfer_interp_line_cont_4_avx2;
         }
 #endif
         break;
diff --git a/transfer_interp.asm b/transfer_interp.asm
index a6ae60f..b7c9af5 100644
--- a/transfer_interp.asm
+++ b/transfer_interp.asm
@@ -17,11 +17,12 @@
 ;/
 
 %include "config.asm"
+%include "util.asm"
 %include "x86inc.asm"
 
 SECTION .text
 
-INIT_YMM fma3
+INIT_YMM avx2
 cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                               idx_x_val
     shl src_strideq, 3
@@ -35,10 +36,10 @@ cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx
     %define offsetq dst_lenq
 
     movu m0, [fact_yq]
-    vpermq  m1, m0, 01010101b                           ; fact y + 1 -> m1
-    vpermq  m2, m0, 10101010b                           ; fact y + 2 -> m2
-    vpermq  m3, m0, 11111111b                           ; fact y + 3 -> m3
-    vpermq  m0, m0, 00000000b                           ; fact y + 0 -> m0
+    SPLATPD m1, m0, 1                           ; fact y + 1 -> m1
+    SPLATPD m2, m0, 2                           ; fact y + 2 -> m2
+    SPLATPD m3, m0, 3                           ; fact y + 3 -> m3
+    SPLATPD m0, m0, 0                           ; fact y + 0 -> m0
 
 .loop:
     mov idx_x_valq, [idx_xq + offsetq]
diff --git a/util.asm b/util.asm
new file mode 100644
index 0000000..6dce023
--- /dev/null
+++ b/util.asm
@@ -0,0 +1,38 @@
+;
+; Various ASM utility macros
+; Copyright 2019 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+%define IMM8SPLAT2B(sel) ((sel) | ((sel) << 2) | ((sel) << 4) | ((sel) << 6)) ; repeat a 2-bit value in all four 2-bit fields of an imm8
+%define IMM4SPLAT1B(sel) ((sel) | ((sel) << 1) | ((sel) << 2) | ((sel) << 3)) ; repeat a 1-bit value in each of the low four bits
+
+; SPLATPD - broadcast one double from the source into every element of the destination
+; %1 destination register
+; %2 source register (may be the same as %1)
+; %3 index of the source element to broadcast, 0-3 (0 selects the lowest element)
+%macro SPLATPD 3
+    %if %3 < 0 || %3 > 3
+        %error Invalid selector %3
+    %endif
+
+    %if mmsize == 32 && cpuflag(avx2)
+        vpermpd %1, %2, IMM8SPLAT2B(%3)                 ; single cross-lane permute does the whole job
+    %elif mmsize == 32 && cpuflag(avx)
+        shufpd %1, %2, %2, IMM4SPLAT1B(%3 & 1)          ; splat the low/high double inside each 128-bit lane
+        vperm2f128 %1, %1, %1, ((%3 & 2) >> 1) * 0x11   ; then copy the selected 128-bit lane into both lanes
+    %else
+        %error %? not supported with cpuname
+    %endif
+%endmacro
-- 
cgit v1.2.3