summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-04-17 10:27:23 +0200
committer Anton Khirnov <anton@khirnov.net> 2019-04-17 10:27:37 +0200
commit c64c60f671a004b0495279597d9c5ad1610b8155 (patch)
tree f41600b12bb9097312002f8d4b788074d8c83b95
parent 264d8ce5e39676582f2e6a65cf517924846070b9 (diff)
simd: prefetch (branch: simd)
-rw-r--r-- ndarray.asm 18
-rw-r--r-- residual_calc.asm 29
2 files changed, 45 insertions, 2 deletions
diff --git a/ndarray.asm b/ndarray.asm
index 94a4ec9..b5f4c22 100644
--- a/ndarray.asm
+++ b/ndarray.asm
@@ -34,18 +34,32 @@ cglobal line_madd, 3, 3, 2, linesize, dst, src
vpermq m0, m0, 0
+%define PREFETCH_DIST 4
.loop:
+ ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 0)]
+ ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 1)]
+ ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 0)]
+ ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 1)]
movu m1, [dstq + linesizeq]
+ movu m2, [dstq + linesizeq + mmsize]
vfmadd231pd m1, m0, [srcq + linesizeq]
+ vfmadd231pd m2, m0, [srcq + linesizeq + mmsize]
- add linesizeq, mmsize
+ add linesizeq, 2 * mmsize
jg .store_partial
- movu [dstq + linesizeq - mmsize], m1
+ movu [dstq + linesizeq - 2 * mmsize], m1
+ movu [dstq + linesizeq - mmsize], m2
js .loop
jmp .finish
.store_partial:
+ cmp linesizeq, mmsize
+ jg .store_low
+ movu [dstq + linesizeq - 2 * mmsize], m1
+ jz .finish
+ mova m1, m2
+.store_low:
sub linesizeq, ELEM_SIZE
jz .store3
sub linesizeq, ELEM_SIZE
diff --git a/residual_calc.asm b/residual_calc.asm
index 77d6dc8..3613c82 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -38,6 +38,11 @@ const30: times 8 dq 30.0
SECTION .text
+%define PREFETCH_DIST 2
+%macro PREFETCHT0 1
+ ;prefetcht0 %1
+%endmacro
+
; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m1-m5: splatted constant finite difference coefficients
@@ -205,10 +210,20 @@ SECTION .text
.loop:
xorpd m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
+ PREFETCHT0 [rhsq + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ PREFETCHT0 [diff_coeffs00q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%if stencil == 2
+ PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%endif
%if stencil == 1
addpd m6, m6 ; m6 = 2 * u[x]
@@ -217,8 +232,22 @@ SECTION .text
%endif
RES_ADD_DIFF_SINGLEDIR stencil, 0
+ PREFETCHT0 [diff_coeffs10q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [diff_coeffs20q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%if stencil == 2
+ PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+ PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%endif
+
RES_ADD_DIFF_SINGLEDIR stencil, 1
+ PREFETCHT0 [diff_coeffs01q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [diff_coeffs02q + offsetq + PREFETCH_DIST * mmsize]
+
RES_ADD_DIFF_MIXED stencil
+ PREFETCHT0 [diff_coeffs11q + offsetq + PREFETCH_DIST * mmsize]
+ PREFETCHT0 [dstq + offsetq + PREFETCH_DIST * mmsize]
andpd m6, m0, m13 ; m6 = abs(res)