about | summary | refs | log | tree | commit | diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2019-04-23 15:36:34 +0200
committer: Anton Khirnov <anton@khirnov.net> 2019-04-24 14:44:36 +0200
commit: 86ad823b9ade211bfa9361b61571933aff1c9d24 (patch)
tree: 7a44acde9b11a9958b953f50927441318c2beb5e /residual_calc.asm
parent: 580740356c44658620bff6f9ddd8a006f04c31fc (diff)
egs: merge residual calc and correct when possible
Also, merge the reflect boundary condition into residual calc+add. Improves performance due to better locality.
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 27
1 files changed, 24 insertions, 3 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 3a5b800..5eea31c 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -39,6 +39,8 @@ SECTION .text
; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
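+; m1: multiplier applied to the residual before the store (broadcast from m0 on entry)
+; m2: multiplier applied to u[x] (add variant only; broadcast from m1 on entry), product kept in m3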
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
@@ -140,9 +142,14 @@ SECTION .text
%endmacro
; %1: stencil
-%macro RESIDUAL_CALC 1
+; %2: 0 - calc (result = res * m1); 1 - add (result = res * m1 + u[x] * m2)
+%macro RESIDUAL_CALC 2
%define stencil %1
+%if %2
+ vpermq m2, m1, 0
+%endif
+ vpermq m1, m0, 0
; compute the mask for absolute value
pcmpeqq m13, m13
@@ -195,6 +202,9 @@ SECTION .text
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+%if %2
+ mulpd m3, m6, m2
+%endif
%if stencil == 1
addpd m6, m6 ; m6 = 2 * u[x]
@@ -207,6 +217,10 @@ SECTION .text
RES_ADD_DIFF_MIXED stencil
andpd m6, m0, m13 ; m6 = abs(res)
+ mulpd m0, m1
+%if %2
+ addpd m0, m3
+%endif
; store the result
add offsetq, mmsize
@@ -255,9 +269,16 @@ SECTION .text
INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
-RESIDUAL_CALC 1
+RESIDUAL_CALC 1, 0
+cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
+RESIDUAL_CALC 1, 1
INIT_YMM fma3
cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
-RESIDUAL_CALC 2
+RESIDUAL_CALC 2, 0
+
+cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
+RESIDUAL_CALC 2, 1