about summary refs log tree commit diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2019-01-09 22:11:52 +0100
committerAnton Khirnov <anton@khirnov.net>2019-01-10 09:14:21 +0100
commitf989d2831f72ee69d5e96ca73a5a6427b064534f (patch)
treec601b29a397c6d0116fd49fc60da76c456536e0f /residual_calc.asm
parent4a69f2b5c22e1df0474330a176198770b42a365b (diff)
residual_calc.asm: templatize computing the mixed derivative
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--  residual_calc.asm | 74
1 files changed, 39 insertions, 35 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index bfb6ce3..6424b2f 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -35,7 +35,6 @@ SECTION .rodata
const8: times 8 dq 8.0
const16: times 8 dq 16.0
const30: times 8 dq 30.0
-const64: times 8 dq 64.0
SECTION .text
@@ -101,6 +100,43 @@ SECTION .text
vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
%endmacro
+; calculate and add residual contributions from the second mixed derivative
+;
+; parameters:
+; %1: stencil
+;
+; register use (in addition to register allocation described above):
+; m6, m7: used for work (clobbered)
+%macro RES_ADD_DIFF_MIXED 1
+ movu m6, [u_upq + 1 * ELEM_SIZE + offsetq] ; m6 = u[y+1, x+1]
+ subpd m6, [u_upq - 1 * ELEM_SIZE + offsetq] ; - u[y+1, x-1]
+ subpd m6, [u_downq + 1 * ELEM_SIZE + offsetq] ; - u[y-1, x+1]
+ addpd m6, [u_downq - 1 * ELEM_SIZE + offsetq] ; + u[y-1, x-1]
+
+ %if %1 == 2
+ movu m7, [u_up2q - 1 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x-1]
+ subpd m7, [u_up2q + 1 * ELEM_SIZE + offsetq] ; - u[y+2, x+1]
+ subpd m7, [u_upq + 2 * ELEM_SIZE + offsetq] ; - u[y+1, x+2]
+ addpd m7, [u_upq - 2 * ELEM_SIZE + offsetq] ; + u[y+1, x-2]
+ addpd m7, [u_downq + 2 * ELEM_SIZE + offsetq] ; + u[y-1, x+2]
+ subpd m7, [u_downq - 2 * ELEM_SIZE + offsetq] ; - u[y-1, x-2]
+ addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ; + u[y-2, x+1]
+ subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ; - u[y-2, x-1]
+
+ vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+
+ movu m7, [u_up2q + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
+ subpd m7, [u_up2q - 2 * ELEM_SIZE + offsetq] ; - u[y+2, x-2]
+ subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ; - u[y-2, x+2]
+ addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ; + u[y-2, x-2]
+
+ vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
+ %endif
+
+ mulpd m6, m3
+ vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+%endmacro
+
INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
@@ -156,15 +192,7 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
RES_ADD_DIFF_SINGLEDIR 1, 0
RES_ADD_DIFF_SINGLEDIR 1, 1
-
- ; mixed d2xy
- movu m6, [u_upq + offsetq + ELEM_SIZE]
- subpd m6, [u_upq + offsetq - ELEM_SIZE]
- subpd m6, [u_downq + offsetq + ELEM_SIZE]
- addpd m6, [u_downq + offsetq - ELEM_SIZE]
-
- mulpd m6, m3
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ RES_ADD_DIFF_MIXED 1
; store the result
movu [dstq + offsetq], m0
@@ -214,7 +242,6 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
movu m15, [const30]
movu m14, [const8]
movu m13, [const16]
- movu m12, [const64]
; setup pointers to the lines above and below
lea u_upq, [uq + strideq]
@@ -237,30 +264,7 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
RES_ADD_DIFF_SINGLEDIR 2, 0
RES_ADD_DIFF_SINGLEDIR 2, 1
-
- ; mixed d2xy
- movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE] ; m6 = u[y+2, x+2]
- vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * ELEM_SIZE] ; m6 -= 8 u[y+2, x+1]
- vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y+2, x-1]
- subpd m6, [u_up2q + offsetq - 2 * ELEM_SIZE] ; m6 -= u[y+2, x-2]
-
- vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * ELEM_SIZE] ; m6 -= 8 u[y+1, x+2]
- vfmadd231pd m6, m12, [u_upq + offsetq + 1 * ELEM_SIZE] ; m6 += 64 u[y+1, x+1]
- vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * ELEM_SIZE] ; m6 -= 64 u[y+1, x-1]
- vfmadd231pd m6, m14, [u_upq + offsetq - 2 * ELEM_SIZE] ; m6 += 8 u[y+1, x-2]
-
- vfmadd231pd m6, m14, [u_downq + offsetq + 2 * ELEM_SIZE] ; m6 += 8 u[y-1, x+2]
- vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * ELEM_SIZE] ; m6 -= 64 u[y-1, x+1]
- vfmadd231pd m6, m12, [u_downq + offsetq - 1 * ELEM_SIZE] ; m6 += 64 u[y-1, x-1]
- vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * ELEM_SIZE] ; m6 -= 8 u[y-1, x-2]
-
- subpd m6, [u_down2q + offsetq + 2 * ELEM_SIZE] ; m6 -= u[y-2, x+2]
- vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x+1]
- vfnmadd231pd m6, m14, [u_downq + offsetq - 1 * ELEM_SIZE] ; m6 -= 8 u[y-2, x-1]
- addpd m6, [u_down2q + offsetq - 2 * ELEM_SIZE] ; m6 += u[y-2, x-2]
-
- mulpd m6, m3
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ RES_ADD_DIFF_MIXED 2
; store the result
movu [dstq + offsetq], m0