From 264d8ce5e39676582f2e6a65cf517924846070b9 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 8 Apr 2019 10:42:15 +0200 Subject: x86: add a misc utility header --- residual_calc.asm | 11 ++++++----- transfer.c | 6 +++--- transfer_interp.asm | 11 ++++++----- util.asm | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) create mode 100644 util.asm diff --git a/residual_calc.asm b/residual_calc.asm index e5b0268..77d6dc8 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -17,6 +17,7 @@ ;/ %include "config.asm" +%include "util.asm" %include "x86inc.asm" ; double precision @@ -149,12 +150,12 @@ SECTION .text ; load and splat the finite difference factors movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] - vpermq m1, m0, 00000000b ; diff factor 01 -> m1 - vpermq m2, m0, 01010101b ; diff factor 10 -> m2 - vpermq m3, m0, 10101010b ; diff factor 11 -> m3 - vpermq m4, m0, 11111111b ; diff factor 02 -> m4 + SPLATPD m1, m0, 0 ; diff factor 01 -> m1 + SPLATPD m2, m0, 1 ; diff factor 10 -> m2 + SPLATPD m3, m0, 2 ; diff factor 11 -> m3 + SPLATPD m4, m0, 3 ; diff factor 02 -> m4 movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] - vpermq m5, m0, 00000000b ; diff factor 20 -> m5 + SPLATPD m5, m0, 0 ; diff factor 20 -> m5 %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed ; compute the mask for absolute value diff --git a/transfer.c b/transfer.c index 232d6e1..98051de 100644 --- a/transfer.c +++ b/transfer.c @@ -57,7 +57,7 @@ typedef struct GridTransferLagrange { } GridTransferLagrange; #if HAVE_EXTERNAL_ASM -void mg2di_transfer_interp_line_cont_4_fma3(double *dst, ptrdiff_t dst_len, +void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y); @@ -141,8 +141,8 @@ static int transfer_lagrange_init(GridTransferContext *ctx) priv->stencil = 4; #if HAVE_EXTERNAL_ASM - if 
(ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->transfer_cont = mg2di_transfer_interp_line_cont_4_fma3; + if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { + priv->transfer_cont = mg2di_transfer_interp_line_cont_4_avx2; } #endif break; diff --git a/transfer_interp.asm b/transfer_interp.asm index a6ae60f..b7c9af5 100644 --- a/transfer_interp.asm +++ b/transfer_interp.asm @@ -17,11 +17,12 @@ ;/ %include "config.asm" +%include "util.asm" %include "x86inc.asm" SECTION .text -INIT_YMM fma3 +INIT_YMM avx2 cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\ idx_x_val shl src_strideq, 3 @@ -35,10 +36,10 @@ cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx %define offsetq dst_lenq movu m0, [fact_yq] - vpermq m1, m0, 01010101b ; fact y + 1 -> m1 - vpermq m2, m0, 10101010b ; fact y + 2 -> m2 - vpermq m3, m0, 11111111b ; fact y + 3 -> m3 - vpermq m0, m0, 00000000b ; fact y + 0 -> m0 + SPLATPD m1, m0, 1 ; fact y + 1 -> m1 + SPLATPD m2, m0, 2 ; fact y + 2 -> m2 + SPLATPD m3, m0, 3 ; fact y + 3 -> m3 + SPLATPD m0, m0, 0 ; fact y + 0 -> m0 .loop: mov idx_x_valq, [idx_xq + offsetq] diff --git a/util.asm b/util.asm new file mode 100644 index 0000000..6dce023 --- /dev/null +++ b/util.asm @@ -0,0 +1,38 @@ +; +; Various ASM utility macros +; Copyright 2019 Anton Khirnov +; +; This program is free software: you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation, either version 3 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+%define IMM8SPLAT2B(x) (((x) << 6) | ((x) << 4) | ((x) << 2) | (x)) ; 8-bit immediate with the 2-bit value x repeated in all four 2-bit fields
+%define IMM4SPLAT1B(x) (((x) << 3) | ((x) << 2) | ((x) << 1) | (x)) ; 4-bit immediate with the 1-bit value x repeated in all four bits
+
+; splat - copy the specified double into all positions in the destination
+; %1 destination register
+; %2 source register (may be same as dst)
+; %3 index of the element in the source to splat, must be in the range 0-3
+%macro SPLATPD 3
+    %if (%3) < 0 || (%3) > 3
+        %error Invalid selector %3
+    %endif
+
+    %if mmsize == 32 && cpuflag(avx2)
+        vpermpd    %1, %2, IMM8SPLAT2B(%3)
+    %elif mmsize == 32 && cpuflag(avx)
+        shufpd     %1, %2, %2, IMM4SPLAT1B((%3) & 1)         ; low selector bit: splat that element inside each 128-bit lane
+        vperm2f128 %1, %1, %1, (((%3) & 2) >> 1) * 0x11      ; high selector bit: copy that lane into both lanes (both sources must be given explicitly)
+    %else
+        %error %? not supported with cpuname
+    %endif
+%endmacro
--
cgit v1.2.3