diff options
author | Anton Khirnov <anton@khirnov.net> | 2019-04-17 10:27:23 +0200 |
---|---|---|
committer | Anton Khirnov <anton@khirnov.net> | 2019-04-17 10:27:37 +0200 |
commit | c64c60f671a004b0495279597d9c5ad1610b8155 (patch) | |
tree | f41600b12bb9097312002f8d4b788074d8c83b95 /residual_calc.asm | |
parent | 264d8ce5e39676582f2e6a65cf517924846070b9 (diff) |
simd: prefetch (branch: simd)
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- | residual_calc.asm | 29 |
1 files changed, 29 insertions, 0 deletions
```diff
diff --git a/residual_calc.asm b/residual_calc.asm
index 77d6dc8..3613c82 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -38,6 +38,11 @@ const30: times 8 dq 30.0
 
 SECTION .text
 
+%define PREFETCH_DIST 2
+%macro PREFETCHT0 1
+    ;prefetcht0 %1
+%endmacro
+
 ; mm register allocation (both s1 and s2)
 ; m0: accumulator for the residual
 ; m1-m5: splatted constant finite difference coefficients
@@ -205,10 +210,20 @@ SECTION .text
 .loop:
     xorpd m0, m0
     subpd m0, [rhsq + offsetq] ; res = -rhs
+    PREFETCHT0 [rhsq + offsetq + PREFETCH_DIST * mmsize]
+    PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+    PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
 
     ; plain value
     movu m6, [uq + offsetq] ; m6 = u[x]
     vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+    PREFETCHT0 [diff_coeffs00q + offsetq + PREFETCH_DIST * mmsize]
+    PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+    PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%if stencil == 2
+    PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+    PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%endif
 
 %if stencil == 1
     addpd m6, m6 ; m6 = 2 * u[x]
@@ -217,8 +232,22 @@ SECTION .text
 %endif
 
     RES_ADD_DIFF_SINGLEDIR stencil, 0
+    PREFETCHT0 [diff_coeffs10q + offsetq + PREFETCH_DIST * mmsize]
+    PREFETCHT0 [diff_coeffs20q + offsetq + PREFETCH_DIST * mmsize]
+    PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+    PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%if stencil == 2
+    PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE]
+    PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE]
+%endif
+
     RES_ADD_DIFF_SINGLEDIR stencil, 1
+    PREFETCHT0 [diff_coeffs01q + offsetq + PREFETCH_DIST * mmsize]
+    PREFETCHT0 [diff_coeffs02q + offsetq + PREFETCH_DIST * mmsize]
+
     RES_ADD_DIFF_MIXED stencil
+    PREFETCHT0 [diff_coeffs11q + offsetq + PREFETCH_DIST * mmsize]
+    PREFETCHT0 [dstq + offsetq + PREFETCH_DIST * mmsize]
 
     andpd m6, m0, m13 ; m6 = abs(res)
 
```