about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-04-09 09:35:58 +0200
committer Anton Khirnov <anton@khirnov.net> 2024-04-16 17:32:15 +0200
commit bda21267fd85388d1543976cc86f71097fc2bf5c (patch)
tree 9587cffb5f4e1bd42f1ad54d9a42b839e573b5e6
parent bf320a3d0f5c526f0d96551681c37e20b3f44efb (diff)
residual_calc.asm: add AVX512 version [avx512]
-rw-r--r-- residual_calc.asm 39
-rw-r--r-- residual_calc.c 15
-rw-r--r-- util.asm 9
3 files changed, 54 insertions, 9 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 0a85e1d..bce2cf9 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -121,14 +121,14 @@ SECTION .text
addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ; + u[y-2, x+1]
subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ; - u[y-2, x-1]
- vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+ vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7
movu m7, [u_up2q + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
subpd m7, [u_up2q - 2 * ELEM_SIZE + offsetq] ; - u[y+2, x-2]
subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ; - u[y-2, x+2]
addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ; + u[y-2, x-2]
- vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+ vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
@@ -152,17 +152,17 @@ SECTION .text
; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
; double res_mult, [double u_mult (add only)])
cglobal residual_line_ %+ opname %+ _s %+ stencil, \
- 8, 13, 14 + stencil * 2, \
+ 8, 13 + 2 * (mmsize == 64), 14 + stencil * 2, \
linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
- u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5, mask, tmp
%if %2
- vpermq m2, m1, 0
+ vbroadcastsd m2, xmm1
%endif
- vpermq m1, m0, 0
+ vbroadcastsd m1, xmm0
; compute the mask for absolute value
- pcmpeqq m13, m13
+ ONES m13
psrlq m13, 1
movu m12, [res_maxq]
@@ -205,7 +205,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
.loop:
- xorpd m0, m0
+ pxor m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
@@ -225,7 +225,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
RES_ADD_DIFF_SINGLEDIR stencil, 1
RES_ADD_DIFF_MIXED stencil
- andpd m6, m0, m13 ; m6 = abs(res)
+ pand m6, m0, m13 ; m6 = abs(res)
mulpd m0, m1
%if %2
addpd m0, m3
@@ -243,6 +243,19 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
jmp .finish
.store_partial:
+%if mmsize == 64
+ lea tmpq, [offsetq - mmsize]
+ neg tmpd
+ shr tmpd, 3 ; tmp = <number of elements left>
+
+ mov maskd, 1
+ shlx maskd, maskd, tmpd
+ dec maskd ; mask = (1 << tmp) - 1
+
+ kmovw k1, maskd
+ vmovdqu64 [dstq + offsetq - mmsize] {k1}, m0
+ maxpd m12 {k1}, m12, m6
+%else
sub offsetq, ELEM_SIZE
jz .store3
sub offsetq, ELEM_SIZE
@@ -271,6 +284,8 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
vpermq m6, m6, 10100100b
maxpd m12, m6
+%endif
+
.finish:
movu [res_maxq], m12
RET
@@ -281,3 +296,9 @@ RESIDUAL_CALC 1, 0
RESIDUAL_CALC 1, 1
RESIDUAL_CALC 2, 0
RESIDUAL_CALC 2, 1
+
+INIT_ZMM avx512 ; switch to the avx512 flavour for the instantiations below (NOTE(review): presumably makes mmsize == 64, enabling the `%if mmsize == 64` paths - confirm)
+RESIDUAL_CALC 1, 0 ; NOTE(review): presumably stencil 1, second argument 0 = plain "calc" variant - confirm against the macro header
+RESIDUAL_CALC 1, 1 ; NOTE(review): presumably stencil 1, second argument 1 = "add" variant (takes the extra u_mult) - confirm
+RESIDUAL_CALC 2, 0 ; NOTE(review): presumably stencil 2, "calc" variant - confirm
+RESIDUAL_CALC 2, 1 ; NOTE(review): presumably stencil 2, "add" variant - confirm
diff --git a/residual_calc.c b/residual_calc.c
index 6d43e4d..fe7ec12 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -79,6 +79,11 @@ ResidualLineCalc mg2di_residual_line_calc_s1_avx2;
ResidualLineCalc mg2di_residual_line_calc_s2_avx2;
ResidualLineAdd mg2di_residual_line_add_s1_avx2;
ResidualLineAdd mg2di_residual_line_add_s2_avx2;
+
+ResidualLineCalc mg2di_residual_line_calc_s1_avx512;
+ResidualLineCalc mg2di_residual_line_calc_s2_avx512;
+ResidualLineAdd mg2di_residual_line_add_s1_avx512;
+ResidualLineAdd mg2di_residual_line_add_s2_avx512;
#endif
static void
@@ -336,6 +341,11 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = mg2di_residual_line_add_s1_avx2;
priv->calc_blocksize = 4;
}
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX512) {
+ priv->residual_line_calc = mg2di_residual_line_calc_s1_avx512;
+ priv->residual_line_add = mg2di_residual_line_add_s1_avx512;
+ priv->calc_blocksize = 8;
+ }
#endif
break;
case 2:
@@ -347,6 +357,11 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = mg2di_residual_line_add_s2_avx2;
priv->calc_blocksize = 4;
}
+ if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX512) {
+ priv->residual_line_calc = mg2di_residual_line_calc_s2_avx512;
+ priv->residual_line_add = mg2di_residual_line_add_s2_avx512;
+ priv->calc_blocksize = 8;
+ }
#endif
break;
}
diff --git a/util.asm b/util.asm
index 864080f..89cdcbd 100644
--- a/util.asm
+++ b/util.asm
@@ -40,3 +40,12 @@
%error %? not supported with cpuname
%endif
%endmacro
+
+; ONES reg: set every bit of the given mm (vector) register to 1; the argument is both source and destination
+%macro ONES 1
+ %if mmsize == 64 ; 64-byte registers take a separate path (NOTE(review): presumably because the EVEX form of pcmpeqq writes a mask register, not a vector - confirm)
+ vpternlogq %1, %1, %1, 0xff ; truth table 0xff yields 1 for every input combination, so the result is all-ones whatever %1 held
+ %else ; narrower registers
+ pcmpeqq %1, %1 ; a register always compares equal to itself, so every qword lane becomes all-ones
+ %endif
+%endmacro