aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.asm
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2024-04-15 21:44:14 +0200
committerAnton Khirnov <anton@khirnov.net>2024-04-15 21:44:14 +0200
commite30cfde7614be7062249954eab6c3f56eeabbb51 (patch)
tree1a27f188ed94b9ae4d566150ca951a8ac7f0fad1 /residual_calc.asm
parent982d71cb08f6ccf564c0558c659ae2756bb39ba1 (diff)
residual_calc: accept all diff coefficients in a single array
In addition, take an offset parameter that gives the distance (in elements) between the arrays of successive coefficients. This avoids passing so many pointers around, which reduces register pressure and simplifies writing SIMD. It also seems to be a little faster.
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--residual_calc.asm84
1 files changed, 41 insertions, 43 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 9cd530d..42eb50b 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -22,14 +22,6 @@
; double precision
%define ELEM_SIZE 8
-; offsets to FD coefficients for given derivative
-%define OFF_DIFF_COEFF_00 0 * gprsize
-%define OFF_DIFF_COEFF_01 1 * gprsize
-%define OFF_DIFF_COEFF_10 2 * gprsize
-%define OFF_DIFF_COEFF_11 3 * gprsize
-%define OFF_DIFF_COEFF_02 4 * gprsize
-%define OFF_DIFF_COEFF_20 5 * gprsize
-
SECTION .rodata
const8: times 8 dq 8.0
@@ -65,15 +57,15 @@ SECTION .text
%define up2q uq + ELEM_SIZE * 2
%define um1q uq - ELEM_SIZE
%define um2q uq - ELEM_SIZE * 2
- %define coeffs1q diff_coeffs10q
- %define coeffs2q diff_coeffs20q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_10
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_20
%else
%define up1q u_upq
%define up2q u_up2q
%define um1q u_downq
%define um2q u_down2q
- %define coeffs1q diff_coeffs01q
- %define coeffs2q diff_coeffs02q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_01
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_02
%endif
; load the function values
@@ -91,7 +83,7 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
+ vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10
; second derivative
addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1]
@@ -102,7 +94,7 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
+ vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20
%endmacro
; calculate and add residual contributions from the second mixed derivative
@@ -138,13 +130,30 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
%endmacro
; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
- %define stencil %1
+
+%define stencil %1
+
+%if %2
+%define opname add
+%else
+%define opname calc
+%endif
+
+; typedef void ResidualLineCalc/Add(
+; size_t linesize, double *dst, double *dst_max,
+; ptrdiff_t u_stride, const double *u, const double *rhs,
+; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
+; double res_mult, [double u_mult (add only)])
+cglobal residual_line_ %+ opname %+ _s %+ stencil, \
+ 8, 13, 14 + stencil * 2, \
+ linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
%if %2
vpermq m2, m1, 0
@@ -156,27 +165,13 @@ SECTION .text
psrlq m13, 1
movu m12, [res_maxq]
- ; load pointers to the equation coefficients
- %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
- mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
- mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
- mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
- mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
- mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
- mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
; setup the data pointers and the loop counter
shl u_strideq, 3
+ shl diff_coeffs_offsetq, 3
shl linesizeq, 3
add dstq, linesizeq
add uq, linesizeq
add rhsq, linesizeq
- add diff_coeffs00q, linesizeq
- add diff_coeffs01q, linesizeq
- add diff_coeffs10q, linesizeq
- add diff_coeffs11q, linesizeq
- add diff_coeffs02q, linesizeq
- add diff_coeffs20q, linesizeq
neg linesizeq
; from now on, the register that held linesize is used as the offset into data arrays
%define offsetq linesizeq
@@ -195,13 +190,26 @@ SECTION .text
movu m14, [const8]
%endif
+ ; offsets to FD coefficients for given derivative
+ %define diff_coeff_offset_01 1 * diff_coeffs_offsetq
+ %define diff_coeff_offset_10 2 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_11 diff_coeffs_off3q
+
+ %define diff_coeff_offset_02 4 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_20 diff_coeffs_off5q
+
+
.loop:
xorpd m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
- vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00
%if %2
mulpd m3, m6, m2
%endif
@@ -223,6 +231,7 @@ SECTION .text
%endif
; store the result
+ add diff_coeffsq, mmsize
add offsetq, mmsize
jg .store_partial
@@ -267,18 +276,7 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 0
-cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 1
-
-INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 0
-
-cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 1