diff options
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- | residual_calc.asm | 39 |
1 files changed, 30 insertions, 9 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 0a85e1d..bce2cf9 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -121,14 +121,14 @@ SECTION .text
     addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ; + u[y-2, x+1]
     subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ; - u[y-2, x-1]

-    vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+    vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7

     movu m7, [u_up2q + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
     subpd m7, [u_up2q - 2 * ELEM_SIZE + offsetq] ; - u[y+2, x-2]
     subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ; - u[y-2, x+2]
     addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ; + u[y-2, x-2]

-    vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+    vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7
 %endif

     vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
@@ -152,17 +152,17 @@ SECTION .text
 ; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
 ; double res_mult, [double u_mult (add only)])
 cglobal residual_line_ %+ opname %+ _s %+ stencil, \
-    8, 13, 14 + stencil * 2, \
+    8, 13 + 2 * (mmsize == 64), 14 + stencil * 2, \
     linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
-    u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
+    u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5, mask, tmp

 %if %2
-    vpermq m2, m1, 0
+    vbroadcastsd m2, xmm1
 %endif
-    vpermq m1, m0, 0
+    vbroadcastsd m1, xmm0

     ; compute the mask for absolute value
-    pcmpeqq m13, m13
+    ONES m13
     psrlq m13, 1

     movu m12, [res_maxq]
@@ -205,7 +205,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,


 .loop:
-    xorpd m0, m0
+    pxor m0, m0
     subpd m0, [rhsq + offsetq] ; res = -rhs

     ; plain value
@@ -225,7 +225,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
     RES_ADD_DIFF_SINGLEDIR stencil, 1
     RES_ADD_DIFF_MIXED stencil

-    andpd m6, m0, m13 ; m6 = abs(res)
+    pand m6, m0, m13 ; m6 = abs(res)
     mulpd m0, m1
 %if %2
     addpd m0, m3
@@ -243,6 +243,19 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
     jmp .finish

 .store_partial:
+%if mmsize == 64
+    lea tmpq, [offsetq - mmsize]
+    neg tmpd
+    shr tmpd, 3 ; tmp = <number of elements left>
+
+    mov maskd, 1
+    shlx maskd, maskd, tmpd
+    dec maskd ; mask = (1 << tmp) - 1
+
+    kmovw k1, maskd
+    vmovdqu64 [dstq + offsetq - mmsize] {k1}, m0
+    maxpd m12 {k1}, m12, m6
+%else
     sub offsetq, ELEM_SIZE
     jz .store3
     sub offsetq, ELEM_SIZE
@@ -271,6 +284,8 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
     vpermq m6, m6, 10100100b
     maxpd m12, m6

+%endif
+
 .finish:
     movu [res_maxq], m12
     RET
@@ -281,3 +296,9 @@ RESIDUAL_CALC 1, 0
 RESIDUAL_CALC 1, 1
 RESIDUAL_CALC 2, 0
 RESIDUAL_CALC 2, 1
+
+INIT_ZMM avx512
+RESIDUAL_CALC 1, 0
+RESIDUAL_CALC 1, 1
+RESIDUAL_CALC 2, 0
+RESIDUAL_CALC 2, 1