From bda21267fd85388d1543976cc86f71097fc2bf5c Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 9 Apr 2019 09:35:58 +0200 Subject: residual_calc.asm: add AVX512 version --- residual_calc.asm | 39 ++++++++++++++++++++++++++++++--------- residual_calc.c | 15 +++++++++++++++ util.asm | 9 +++++++++ 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/residual_calc.asm b/residual_calc.asm index 0a85e1d..bce2cf9 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -121,14 +121,14 @@ SECTION .text addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ; + u[y-2, x+1] subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ; - u[y-2, x-1] - vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7 + vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7 movu m7, [u_up2q + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2] subpd m7, [u_up2q - 2 * ELEM_SIZE + offsetq] ; - u[y+2, x-2] subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ; - u[y-2, x+2] addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ; + u[y-2, x-2] - vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7 + vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7 %endif vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11 @@ -152,17 +152,17 @@ SECTION .text ; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, ; double res_mult, [double u_mult (add only)]) cglobal residual_line_ %+ opname %+ _s %+ stencil, \ - 8, 13, 14 + stencil * 2, \ + 8, 13 + 2 * (mmsize == 64), 14 + stencil * 2, \ linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \ - u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5 + u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5, mask, tmp %if %2 - vpermq m2, m1, 0 + vbroadcastsd m2, xmm1 %endif - vpermq m1, m0, 0 + vbroadcastsd m1, xmm0 ; compute the mask for absolute value - pcmpeqq m13, m13 + ONES m13 psrlq m13, 1 movu m12, [res_maxq] @@ -205,7 +205,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil, .loop: - xorpd m0, m0 + pxor m0, m0 subpd m0, [rhsq + offsetq] ; res = -rhs ; plain 
value @@ -225,7 +225,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil, RES_ADD_DIFF_SINGLEDIR stencil, 1 RES_ADD_DIFF_MIXED stencil - andpd m6, m0, m13 ; m6 = abs(res) + pand m6, m0, m13 ; m6 = abs(res) mulpd m0, m1 %if %2 addpd m0, m3 @@ -243,6 +243,19 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil, jmp .finish .store_partial: +%if mmsize == 64 + lea tmpq, [offsetq - mmsize] + neg tmpd + shr tmpd, 3 ; tmp = (mmsize - offset) / 8, the number of elements to store + + mov maskd, 1 + shlx maskd, maskd, tmpd + dec maskd ; mask = (1 << tmp) - 1 + + kmovw k1, maskd + vmovdqu64 [dstq + offsetq - mmsize] {k1}, m0 + maxpd m12 {k1}, m12, m6 +%else sub offsetq, ELEM_SIZE jz .store3 sub offsetq, ELEM_SIZE @@ -271,6 +284,8 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil, vpermq m6, m6, 10100100b maxpd m12, m6 +%endif + .finish: movu [res_maxq], m12 RET @@ -281,3 +296,9 @@ RESIDUAL_CALC 1, 0 RESIDUAL_CALC 1, 1 RESIDUAL_CALC 2, 0 RESIDUAL_CALC 2, 1 + +INIT_ZMM avx512 +RESIDUAL_CALC 1, 0 +RESIDUAL_CALC 1, 1 +RESIDUAL_CALC 2, 0 +RESIDUAL_CALC 2, 1 diff --git a/residual_calc.c b/residual_calc.c index 6d43e4d..fe7ec12 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -79,6 +79,11 @@ ResidualLineCalc mg2di_residual_line_calc_s1_avx2; ResidualLineCalc mg2di_residual_line_calc_s2_avx2; ResidualLineAdd mg2di_residual_line_add_s1_avx2; ResidualLineAdd mg2di_residual_line_add_s2_avx2; + +ResidualLineCalc mg2di_residual_line_calc_s1_avx512; +ResidualLineCalc mg2di_residual_line_calc_s2_avx512; +ResidualLineAdd mg2di_residual_line_add_s1_avx512; +ResidualLineAdd mg2di_residual_line_add_s2_avx512; #endif static void @@ -336,6 +341,11 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = mg2di_residual_line_add_s1_avx2; priv->calc_blocksize = 4; } + if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX512) { + priv->residual_line_calc = mg2di_residual_line_calc_s1_avx512; + priv->residual_line_add = mg2di_residual_line_add_s1_avx512; + priv->calc_blocksize = 8; + } #endif break; case 2: @@ -347,6 +357,11 @@ int 
mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = mg2di_residual_line_add_s2_avx2; priv->calc_blocksize = 4; } + if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX512) { + priv->residual_line_calc = mg2di_residual_line_calc_s2_avx512; + priv->residual_line_add = mg2di_residual_line_add_s2_avx512; + priv->calc_blocksize = 8; + } #endif break; } diff --git a/util.asm b/util.asm index 864080f..89cdcbd 100644 --- a/util.asm +++ b/util.asm @@ -40,3 +40,12 @@ %error %? not supported with cpuname %endif %endmacro + +; make a mm register all-ones +%macro ONES 1 + %if mmsize == 64 + vpternlogq %1, %1, %1, 0xff + %else + pcmpeqq %1, %1 + %endif +%endmacro -- cgit v1.2.3