summaryrefslogtreecommitdiff
path: root/residual_calc.asm
diff options
context:
space:
mode:
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--residual_calc.asm29
1 files changed, 29 insertions, 0 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 77d6dc8..3613c82 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -38,6 +38,11 @@ const30: times 8 dq 30.0
SECTION .text
+%define PREFETCH_DIST 2 ; prefetch lookahead; every use in this file scales it as PREFETCH_DIST * mmsize
+%macro PREFETCHT0 1 ; %1 = memory operand to prefetch; NOTE: body below is commented out, so this macro currently expands to no instruction
+ ;prefetcht0 %1
+%endmacro
+
; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m1-m5: splatted constant finite difference coefficients
@@ -205,10 +210,20 @@ SECTION .text
.loop:
xorpd m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
+ PREFETCHT0 [rhsq + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ PREFETCHT0 [diff_coeffs00q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%if stencil == 2
+ PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%endif
%if stencil == 1
addpd m6, m6 ; m6 = 2 * u[x]
@@ -217,8 +232,22 @@ SECTION .text
%endif
RES_ADD_DIFF_SINGLEDIR stencil, 0
+ PREFETCHT0 [diff_coeffs10q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [diff_coeffs20q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%if stencil == 2
+ PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%endif
+
RES_ADD_DIFF_SINGLEDIR stencil, 1
+ PREFETCHT0 [diff_coeffs01q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [diff_coeffs02q + offsetq + PREFETCH_DIST * mmsize]
+
RES_ADD_DIFF_MIXED stencil
+ PREFETCHT0 [diff_coeffs11q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [dstq + offsetq + PREFETCH_DIST * mmsize]
andpd m6, m0, m13 ; m6 = abs(res)