aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.asm
diff options
context:
space:
mode:
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 84
1 files changed, 41 insertions, 43 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 9cd530d..42eb50b 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -22,14 +22,6 @@
; double precision
%define ELEM_SIZE 8
-; offsets to FD coefficients for given derivative
-%define OFF_DIFF_COEFF_00 0 * gprsize
-%define OFF_DIFF_COEFF_01 1 * gprsize
-%define OFF_DIFF_COEFF_10 2 * gprsize
-%define OFF_DIFF_COEFF_11 3 * gprsize
-%define OFF_DIFF_COEFF_02 4 * gprsize
-%define OFF_DIFF_COEFF_20 5 * gprsize
-
SECTION .rodata
const8: times 8 dq 8.0
@@ -65,15 +57,15 @@ SECTION .text
%define up2q uq + ELEM_SIZE * 2
%define um1q uq - ELEM_SIZE
%define um2q uq - ELEM_SIZE * 2
- %define coeffs1q diff_coeffs10q
- %define coeffs2q diff_coeffs20q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_10
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_20
%else
%define up1q u_upq
%define up2q u_up2q
%define um1q u_downq
%define um2q u_down2q
- %define coeffs1q diff_coeffs01q
- %define coeffs2q diff_coeffs02q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_01
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_02
%endif
; load the function values
@@ -91,7 +83,7 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
+ vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10
; second derivative
addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1]
@@ -102,7 +94,7 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
+ vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20
%endmacro
; calculate and add residual contributions from the second mixed derivative
@@ -138,13 +130,30 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
%endmacro
; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
- %define stencil %1
+
+%define stencil %1
+
+%if %2
+%define opname add
+%else
+%define opname calc
+%endif
+
+; typedef void ResidualLineCalc/Add(
+; size_t linesize, double *dst, double *res_max,
+; ptrdiff_t u_stride, const double *u, const double *rhs,
+; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
+; double res_mult, [double u_mult (add only)])
+cglobal residual_line_ %+ opname %+ _s %+ stencil, \
+ 8, 13, 14 + stencil * 2, \
+ linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
%if %2
vpermq m2, m1, 0
@@ -156,27 +165,13 @@ SECTION .text
psrlq m13, 1
movu m12, [res_maxq]
- ; load pointers to the equation coefficients
- %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
- mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
- mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
- mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
- mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
- mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
- mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
; setup the data pointers and the loop counter
shl u_strideq, 3
+ shl diff_coeffs_offsetq, 3
shl linesizeq, 3
add dstq, linesizeq
add uq, linesizeq
add rhsq, linesizeq
- add diff_coeffs00q, linesizeq
- add diff_coeffs01q, linesizeq
- add diff_coeffs10q, linesizeq
- add diff_coeffs11q, linesizeq
- add diff_coeffs02q, linesizeq
- add diff_coeffs20q, linesizeq
neg linesizeq
; from now on, the register that held linesize is used as the offset into data arrays
%define offsetq linesizeq
@@ -195,13 +190,26 @@ SECTION .text
movu m14, [const8]
%endif
+ ; offsets to FD coefficients for given derivative
+ %define diff_coeff_offset_01 1 * diff_coeffs_offsetq
+ %define diff_coeff_offset_10 2 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_11 diff_coeffs_off3q
+
+ %define diff_coeff_offset_02 4 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_20 diff_coeffs_off5q
+
+
.loop:
xorpd m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
- vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00
%if %2
mulpd m3, m6, m2
%endif
@@ -223,6 +231,7 @@ SECTION .text
%endif
; store the result
+ add diff_coeffsq, mmsize
add offsetq, mmsize
jg .store_partial
@@ -267,18 +276,7 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 0
-cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 1
-
-INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 0
-
-cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 1