aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-04-17 18:28:15 +0200
committer Anton Khirnov <anton@khirnov.net> 2019-04-19 17:11:40 +0200
commit b2d93a84e3a9c84e8c591c9f55355b5834c03d4b (patch)
tree 189213565869426d3d161e334908e63168c0bd99 /residual_calc.asm
parent 5b94910fc4c6a47856290e9c23f9a905cf63c1eb (diff)
egs: premultiply diff_coeffs with the denominator in init
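Do the premultiplication once at init instead of at every residual calculation. This also allows us to get rid of an extra parameter (fd_factors) and reduces the number of registers used in x86 SIMD, since the splatted finite difference factors in m1-m5 are no longer needed.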
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 21
1 files changed, 4 insertions, 17 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index c51ba5e..3a5b800 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -39,7 +39,6 @@ SECTION .text
; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
-; m1-m5: splatted constant finite difference coefficients
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
@@ -90,7 +89,6 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- mulpd m11, m2
vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
; second derivative
@@ -102,7 +100,6 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- mulpd m11, m5
vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
%endmacro
@@ -139,7 +136,6 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- mulpd m6, m3
vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
%endmacro
@@ -147,15 +143,6 @@ SECTION .text
%macro RESIDUAL_CALC 1
%define stencil %1
- ; load and splat the finite difference factors
- movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m1, m0, 00000000b ; diff factor 01 -> m1
- vpermq m2, m0, 01010101b ; diff factor 10 -> m2
- vpermq m3, m0, 10101010b ; diff factor 11 -> m3
- vpermq m4, m0, 11111111b ; diff factor 02 -> m4
- movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m5, m0, 00000000b ; diff factor 20 -> m5
- %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
; compute the mask for absolute value
pcmpeqq m13, m13
@@ -266,11 +253,11 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1
INIT_YMM fma3
-cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2