From c64c60f671a004b0495279597d9c5ad1610b8155 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 17 Apr 2019 10:27:23 +0200 Subject: simd: prefetch --- ndarray.asm | 18 ++++++++++++++++-- residual_calc.asm | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/ndarray.asm b/ndarray.asm index 94a4ec9..b5f4c22 100644 --- a/ndarray.asm +++ b/ndarray.asm @@ -34,18 +34,32 @@ cglobal line_madd, 3, 3, 2, linesize, dst, src vpermq m0, m0, 0 +%define PREFETCH_DIST 4 .loop: + ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 0)] + ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 1)] + ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 0)] + ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 1)] movu m1, [dstq + linesizeq] + movu m2, [dstq + linesizeq + mmsize] vfmadd231pd m1, m0, [srcq + linesizeq] + vfmadd231pd m2, m0, [srcq + linesizeq + mmsize] - add linesizeq, mmsize + add linesizeq, 2 * mmsize jg .store_partial - movu [dstq + linesizeq - mmsize], m1 + movu [dstq + linesizeq - 2 * mmsize], m1 + movu [dstq + linesizeq - mmsize], m2 js .loop jmp .finish .store_partial: + cmp linesizeq, mmsize + jg .store_low + movu [dstq + linesizeq - 2 * mmsize], m1 + jz .finish + mova m1, m2 +.store_low: sub linesizeq, ELEM_SIZE jz .store3 sub linesizeq, ELEM_SIZE diff --git a/residual_calc.asm b/residual_calc.asm index 77d6dc8..3613c82 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -38,6 +38,11 @@ const30: times 8 dq 30.0 SECTION .text +%define PREFETCH_DIST 2 +%macro PREFETCHT0 1 + ;prefetcht0 %1 +%endmacro + ; mm register allocation (both s1 and s2) ; m0: accumulator for the residual ; m1-m5: splatted constant finite difference coefficients @@ -205,10 +210,20 @@ SECTION .text .loop: xorpd m0, m0 subpd m0, [rhsq + offsetq] ; res = -rhs + PREFETCHT0 [rhsq + offsetq + PREFETCH_DIST * mmsize] + PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE] + PREFETCHT0 [uq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE] ; plain value movu m6, [uq + offsetq] ; m6 = u[x] vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 + PREFETCHT0 [diff_coeffs00q + offsetq + PREFETCH_DIST * mmsize] + PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE] + PREFETCHT0 [u_upq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE] +%if stencil == 2 + PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE] + PREFETCHT0 [u_up2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE] +%endif %if stencil == 1 addpd m6, m6 ; m6 = 2 * u[x] @@ -217,8 +232,22 @@ SECTION .text %endif RES_ADD_DIFF_SINGLEDIR stencil, 0 + PREFETCHT0 [diff_coeffs10q + offsetq + PREFETCH_DIST * mmsize] + PREFETCHT0 [diff_coeffs20q + offsetq + PREFETCH_DIST * mmsize] + PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE] + PREFETCHT0 [u_downq + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE] +%if stencil == 2 + PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize - stencil * ELEM_SIZE] + PREFETCHT0 [u_down2q + offsetq + PREFETCH_DIST * mmsize + stencil * ELEM_SIZE] +%endif + RES_ADD_DIFF_SINGLEDIR stencil, 1 + PREFETCHT0 [diff_coeffs01q + offsetq + PREFETCH_DIST * mmsize] + PREFETCHT0 [diff_coeffs02q + offsetq + PREFETCH_DIST * mmsize] + RES_ADD_DIFF_MIXED stencil + PREFETCHT0 [diff_coeffs11q + offsetq + PREFETCH_DIST * mmsize] + PREFETCHT0 [dstq + offsetq + PREFETCH_DIST * mmsize] andpd m6, m0, m13 ; m6 = abs(res) -- cgit v1.2.3