about | summary | refs | log | tree | commit | diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2019-01-09 22:26:44 +0100
committer: Anton Khirnov <anton@khirnov.net> 2019-01-10 09:14:21 +0100
commit: b7aa818e0fc9d61d9fd37d4d4bbdc3394eef4f29 (patch)
tree: 9ff3314025e9b20ebf44d12ea77ef4bb723df1f7 /residual_calc.asm
parent: f989d2831f72ee69d5e96ca73a5a6427b064534f (diff)
residual_calc.asm: templatize the entire residual computation
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 106
1 files changed, 29 insertions, 77 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 6424b2f..289c3fb 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -137,10 +137,10 @@ SECTION .text
vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
%endmacro
-INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
- %define u_downq fd_factorsq
+; %1: stencil
+%macro RESIDUAL_CALC 1
+ %define stencil %1
+ %define u_downq fd_factorsq ; reuse the fd_factors registers after it is no longer needed
; load pointers to the equation coefficients
%define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
@@ -179,6 +179,15 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
lea u_upq, [uq + strideq]
mov u_downq, uq
sub u_downq, strideq
+ %if stencil == 2
+ lea u_up2q, [uq + 2 * strideq]
+ mov u_down2q, u_downq
+ sub u_down2q, strideq
+
+ movu m15, [const30]
+ movu m14, [const8]
+ movu m13, [const16]
+ %endif
.loop:
xorpd m0, m0
@@ -188,11 +197,15 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
movu m6, [uq + offsetq] ; m6 = u[x]
vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
- addpd m6, m6 ; m6 = 2 * u[x]
+ %if stencil == 1
+ addpd m6, m6 ; m6 = 2 * u[x]
+ %else
+ mulpd m6, m15 ; m6 = 30 * u[x]
+ %endif
- RES_ADD_DIFF_SINGLEDIR 1, 0
- RES_ADD_DIFF_SINGLEDIR 1, 1
- RES_ADD_DIFF_MIXED 1
+ RES_ADD_DIFF_SINGLEDIR stencil, 0
+ RES_ADD_DIFF_SINGLEDIR stencil, 1
+ RES_ADD_DIFF_MIXED stencil
; store the result
movu [dstq + offsetq], m0
@@ -200,75 +213,14 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
js .loop
RET
+%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down
- %define u_down2q fd_factorsq ; reuse the fd_factors registers after it is no longer needed
- ; load pointers to the equation coefficients
- %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
- mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
- mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
- mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
- mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
- mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
- mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
- ; setup the data pointers and the loop counter
- shl strideq, 3
- shl linesizeq, 3
- add dstq, linesizeq
- add uq, linesizeq
- add rhsq, linesizeq
- add diff_coeffs00q, linesizeq
- add diff_coeffs01q, linesizeq
- add diff_coeffs10q, linesizeq
- add diff_coeffs11q, linesizeq
- add diff_coeffs02q, linesizeq
- add diff_coeffs20q, linesizeq
- neg linesizeq
- ; from now on, the register that held linesize is used as the offset into data arrays
- %define offsetq linesizeq
-
- ; load and splat the finite difference factors
- movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m1, m0, 00000000b ; diff factor 01 -> m1
- vpermq m2, m0, 01010101b ; diff factor 10 -> m2
- vpermq m3, m0, 10101010b ; diff factor 11 -> m3
- vpermq m4, m0, 11111111b ; diff factor 02 -> m4
- movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m5, m0, 00000000b ; diff factor 20 -> m5
-
- movu m15, [const30]
- movu m14, [const8]
- movu m13, [const16]
-
- ; setup pointers to the lines above and below
- lea u_upq, [uq + strideq]
- lea u_up2q, [uq + 2 * strideq]
-
- mov u_downq, uq
- sub u_downq, strideq
- mov u_down2q, u_downq
- sub u_down2q, strideq
-
-.loop:
- xorpd m0, m0
- subpd m0, [rhsq + offsetq] ; res = -rhs
-
- ; plain value
- movu m6, [uq + offsetq]
- vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
-
- mulpd m6, m15 ; m6 = 30 u[x]
-
- RES_ADD_DIFF_SINGLEDIR 2, 0
- RES_ADD_DIFF_SINGLEDIR 2, 1
- RES_ADD_DIFF_MIXED 2
-
- ; store the result
- movu [dstq + offsetq], m0
- add offsetq, mmsize
- js .loop
+cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+RESIDUAL_CALC 1
- RET
+INIT_YMM fma3
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2
+RESIDUAL_CALC 2