author    Anton Khirnov <anton@khirnov.net>    2019-01-09 21:29:23 +0100
committer Anton Khirnov <anton@khirnov.net>    2019-01-10 09:14:21 +0100
commit    4a69f2b5c22e1df0474330a176198770b42a365b (patch)
tree      b6533e47e9cf1dd88a3ba54582253e2ba7cc61f3 /residual_calc.asm
parent    88db705983c7b518372c93ef50bac5e8ddb1e6bf (diff)
residual_calc.asm: templatize computing non-mixed derivatives
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--    residual_calc.asm    136
1 file changed, 66 insertions(+), 70 deletions(-)
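For reference, the non-mixed derivative terms that the new macro templates are the standard second-order (s1) and fourth-order (s2) central differences. A sketch of the formulas in LaTeX, assuming the 1/(2h)-style scalings live in the splatted fd_factors constants (the diff itself does not show where that scaling is applied); the 2, 8, 16 and 30 constants appear verbatim in the code comments below:

\[
\partial_x u \approx \frac{u_{i+1}-u_{i-1}}{2h}, \qquad
\partial_x^2 u \approx \frac{u_{i+1}-2u_i+u_{i-1}}{h^2} \tag{s1}
\]
\[
\partial_x u \approx \frac{8(u_{i+1}-u_{i-1})-u_{i+2}+u_{i-2}}{12h}, \qquad
\partial_x^2 u \approx \frac{16(u_{i+1}+u_{i-1})-u_{i+2}-u_{i-2}-30u_i}{12h^2} \tag{s2}
\]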
diff --git a/residual_calc.asm b/residual_calc.asm
index 7048da3..bfb6ce3 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -39,6 +39,68 @@ const64: times 8 dq 64.0
SECTION .text
+; mm register allocation (both s1 and s2)
+; m0: accumulator for the residual
+; m1-m5: splatted constant finite difference coefficients
+; m6-m11: working registers
+; (s2 only) m12-m15: splatted constants 64.0, 16.0, 8.0, 30.0
+
+; calculate and add residual contributions from first and second derivatives
+; along a single direction (x or y)
+;
+; parameters:
+; %1: stencil
+; %2: 0 -- x; 1 -- y
+;
+; register use (in addition to register allocation described above):
+; m6: on entry contains u[x] scaled by the central FD coefficient
+;     (2 for s1, 30 for s2); not clobbered
+; m7-m11 used for work (clobbered)
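+;
+; invoked below as, e.g., RES_ADD_DIFF_SINGLEDIR 1, 0 (2nd-order stencil,
+; x direction) or RES_ADD_DIFF_SINGLEDIR 2, 1 (4th-order stencil, y
+; direction); see the two invocation pairs further down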
+%macro RES_ADD_DIFF_SINGLEDIR 2
+ %define stencil %1
+
+    %if %2 == 0
+        %define up1q uq + ELEM_SIZE
+        %define up2q uq + ELEM_SIZE * 2
+        %define um1q uq - ELEM_SIZE
+        %define um2q uq - ELEM_SIZE * 2
+        ; x-direction FD factors and per-point coefficient arrays
+        %define mfd1     m2
+        %define mfd2     m5
+        %define coeffs1q diff_coeffs10q
+        %define coeffs2q diff_coeffs20q
+    %else
+        %define up1q u_upq
+        %define up2q u_up2q
+        %define um1q u_downq
+        %define um2q u_down2q
+        ; y-direction FD factors and per-point coefficient arrays
+        %define mfd1     m1
+        %define mfd2     m4
+        %define coeffs1q diff_coeffs01q
+        %define coeffs2q diff_coeffs02q
+    %endif
+
+ ; load the function values
+ movu m7, [up1q + offsetq] ; m7 = u[x+1]
+ movu m9, [um1q + offsetq] ; m9 = u[x-1]
+ %if stencil == 2
+ movu m8, [up2q + offsetq] ; m8 = u[x+2]
+ movu m10, [um2q + offsetq] ; m10 = u[x-2]
+ %endif
+
+ ; first derivative
+ subpd m11, m7, m9 ; m11 = u[x+1] - u[x-1]
+ %if stencil == 2
+ mulpd m11, m14 ; m11 *= 8
+ subpd m11, m8 ; m11 -= u[x+2]
+ addpd m11, m10 ; m11 += u[x-2]
+ %endif
+    mulpd       m11, mfd1
+    vfmadd231pd m0, m11, [coeffs1q + offsetq]       ; res += d u * first-derivative coeffs
+
+ ; second derivative
+ addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1]
+ %if stencil == 2
+ mulpd m11, m13 ; m11 *= 16
+ subpd m11, m8 ; m11 -= u[x+2]
+ subpd m11, m10 ; m11 -= u[x-2]
+ %endif
+    subpd       m11, m6                             ; m11 -= c0 u[x] (c0 = 2 for s1, 30 for s2)
+    mulpd       m11, mfd2
+    vfmadd231pd m0, m11, [coeffs2q + offsetq]       ; res += d2 u * second-derivative coeffs
+%endmacro
+
INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
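For readers who prefer scalar code, here is a minimal C sketch of what one RES_ADD_DIFF_SINGLEDIR expansion contributes per grid point. All names in it (the function name, fd1/fd2, coeff1/coeff2) are illustrative, not taken from this repository; it only mirrors the 8/16/30 arithmetic visible in the macro:

#include <stddef.h>

/* Scalar model of RES_ADD_DIFF_SINGLEDIR: u points at the current grid
 * element, stride is 1 (x direction) or one line (y direction).
 * fd1/fd2 stand in for the splatted constants in m1/m2 and m4/m5,
 * coeff1/coeff2 for the per-point diff_coeffs values. */
static double res_add_diff_singledir(const double *u, ptrdiff_t stride,
                                     int stencil, double fd1, double fd2,
                                     double coeff1, double coeff2)
{
    double d1, d2;

    if (stencil == 2) {
        /* fourth-order central differences (the 8/16/30 constants) */
        d1 = 8.0 * (u[stride] - u[-stride]) - u[2 * stride] + u[-2 * stride];
        d2 = 16.0 * (u[stride] + u[-stride]) - u[2 * stride] - u[-2 * stride]
             - 30.0 * u[0];
    } else {
        /* second-order central differences */
        d1 = u[stride] - u[-stride];
        d2 = u[stride] + u[-stride] - 2.0 * u[0];
    }

    /* res += d u * first-deriv coeffs + d2 u * second-deriv coeffs */
    return d1 * fd1 * coeff1 + d2 * fd2 * coeff2;
}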
@@ -92,31 +154,8 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
addpd m6, m6 ; m6 = 2 * u[x]
- ; dx, d2x
- movu m8, [uq + offsetq + ELEM_SIZE]
- movu m9, [uq + offsetq - ELEM_SIZE]
-
- subpd m7, m8, m9
- mulpd m7, m2
- vfmadd231pd m0, m7, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
-
- addpd m7, m8, m9
- subpd m7, m6
- mulpd m7, m5
- vfmadd231pd m0, m7, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
-
- ; dy, d2y
- movu m8, [u_upq + offsetq]
- movu m9, [u_downq + offsetq]
-
- subpd m7, m8, m9
- mulpd m7, m1
- vfmadd231pd m0, m7, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
-
- addpd m7, m8, m9
- subpd m7, m6
- mulpd m7, m4
- vfmadd231pd m0, m7, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
+ RES_ADD_DIFF_SINGLEDIR 1, 0
+ RES_ADD_DIFF_SINGLEDIR 1, 1
; mixed d2xy
movu m6, [u_upq + offsetq + ELEM_SIZE]
@@ -196,51 +235,8 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
mulpd m6, m15 ; m6 = 30 u[x]
- ; dx, d2x
- movu m7, [uq + offsetq + ELEM_SIZE] ; m7 = u[x+1]
- movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2]
- movu m9, [uq + offsetq - ELEM_SIZE] ; m9 = u[x-1]
- movu m10, [uq + offsetq - ELEM_SIZE * 2] ; m10 = u[x-2]
-
- mulpd m11, m14, m7 ; m11 = 8 u[x+1]
- vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[x-1]
- subpd m11, m8 ; m11 -= u[x+2]
- addpd m11, m10 ; m11 += u[x-2]
-
-    mulpd m11, m2
- vfmadd231pd m0, m11, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
-
- mulpd m11, m13, m7 ; m11 = 16 u[x+1]
- vfmadd231pd m11, m13, m9 ; m11 += 16 u[x-1]
- subpd m11, m8 ; m11 -= u[x+2]
- subpd m11, m10 ; m11 -= u[x-2]
- subpd m11, m6 ; m11 -= 30 u[x]
-
- mulpd m11, m5
- vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
-
- ; dy, d2y
- movu m7, [u_upq + offsetq] ; m7 = u[y+1]
- movu m8, [u_up2q + offsetq] ; m8 = u[y+2]
- movu m9, [u_downq + offsetq] ; m9 = u[y-1]
- movu m10, [u_down2q + offsetq] ; m10 = u[y-2]
-
- mulpd m11, m14, m7 ; m11 = 8 u[y+1]
- vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[y-1]
- subpd m11, m8 ; m11 -= u[y+2]
- addpd m11, m10 ; m11 += u[y-2]
-
-    mulpd m11, m1
- vfmadd231pd m0, m11, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
-
- mulpd m11, m13, m7 ; m11 = 16 u[y+1]
- vfmadd231pd m11, m13, m9 ; m11 += 16 u[y-1]
- subpd m11, m8 ; m11 -= u[y+2]
- subpd m11, m10 ; m11 -= u[y-2]
- subpd m11, m6 ; m11 -= 30 u[x]
-
- mulpd m11, m4
- vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
+ RES_ADD_DIFF_SINGLEDIR 2, 0
+ RES_ADD_DIFF_SINGLEDIR 2, 1
; mixed d2xy
movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE] ; m6 = u[y+2, x+2]
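In the same scalar picture, the invocation pair that replaces each deleted block above, RES_ADD_DIFF_SINGLEDIR %1, 0 followed by RES_ADD_DIFF_SINGLEDIR %1, 1, accumulates one x and one y contribution per point. Continuing the hypothetical sketch (names still illustrative):

/* one point of the residual line, before the mixed d2xy term is added */
double residual_point_sketch(const double *u, ptrdiff_t linesize, int stencil,
                             const double fd[4], const double coeffs[4])
{
    double res = 0.0;
    res += res_add_diff_singledir(u, 1,        stencil, fd[0], fd[1],
                                  coeffs[0], coeffs[1]); /* x: %2 == 0 */
    res += res_add_diff_singledir(u, linesize, stencil, fd[2], fd[3],
                                  coeffs[2], coeffs[3]); /* y: %2 == 1 */
    return res;
}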