about | summary | refs | log | tree | commit | diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 39
1 files changed, 30 insertions, 9 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 0a85e1d..bce2cf9 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -121,14 +121,14 @@ SECTION .text
addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ; + u[y-2, x+1]
subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ; - u[y-2, x-1]
- vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+ vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7
movu m7, [u_up2q + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
subpd m7, [u_up2q - 2 * ELEM_SIZE + offsetq] ; - u[y+2, x-2]
subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ; - u[y-2, x+2]
addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ; + u[y-2, x-2]
- vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+ vfmadd213pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
@@ -152,17 +152,17 @@ SECTION .text
; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
; double res_mult, [double u_mult (add only)])
cglobal residual_line_ %+ opname %+ _s %+ stencil, \
- 8, 13, 14 + stencil * 2, \
+ 8, 13 + 2 * (mmsize == 64), 14 + stencil * 2, \
linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
- u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5, mask, tmp
%if %2
- vpermq m2, m1, 0
+ vbroadcastsd m2, xmm1
%endif
- vpermq m1, m0, 0
+ vbroadcastsd m1, xmm0
; compute the mask for absolute value
- pcmpeqq m13, m13
+ ONES m13
psrlq m13, 1
movu m12, [res_maxq]
@@ -205,7 +205,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
.loop:
- xorpd m0, m0
+ pxor m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
@@ -225,7 +225,7 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
RES_ADD_DIFF_SINGLEDIR stencil, 1
RES_ADD_DIFF_MIXED stencil
- andpd m6, m0, m13 ; m6 = abs(res)
+ pand m6, m0, m13 ; m6 = abs(res)
mulpd m0, m1
%if %2
addpd m0, m3
@@ -243,6 +243,19 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
jmp .finish
.store_partial:
+%if mmsize == 64
+ lea tmpq, [offsetq - mmsize]
+ neg tmpd
+ shr tmpd, 3 ; tmp = <number of elements left>
+
+ mov maskd, 1
+ shlx maskd, maskd, tmpd
+ dec maskd ; mask = (1 << tmp) - 1
+
+ kmovw k1, maskd
+ vmovdqu64 [dstq + offsetq - mmsize] {k1}, m0
+ maxpd m12 {k1}, m12, m6
+%else
sub offsetq, ELEM_SIZE
jz .store3
sub offsetq, ELEM_SIZE
@@ -271,6 +284,8 @@ cglobal residual_line_ %+ opname %+ _s %+ stencil,
vpermq m6, m6, 10100100b
maxpd m12, m6
+%endif
+
.finish:
movu [res_maxq], m12
RET
@@ -281,3 +296,9 @@ RESIDUAL_CALC 1, 0
RESIDUAL_CALC 1, 1
RESIDUAL_CALC 2, 0
RESIDUAL_CALC 2, 1
+
+INIT_ZMM avx512
+RESIDUAL_CALC 1, 0
+RESIDUAL_CALC 1, 1
+RESIDUAL_CALC 2, 0
+RESIDUAL_CALC 2, 1