diff options
Diffstat (limited to 'ndarray.asm')
-rw-r--r-- | ndarray.asm | 18 |
1 files changed, 16 insertions, 2 deletions
diff --git a/ndarray.asm b/ndarray.asm index 94a4ec9..b5f4c22 100644 --- a/ndarray.asm +++ b/ndarray.asm @@ -34,18 +34,32 @@ cglobal line_madd, 3, 3, 2, linesize, dst, src vpermq m0, m0, 0 +%define PREFETCH_DIST 4 .loop: + ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 0)] + ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 1)] + ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 0)] + ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 1)] movu m1, [dstq + linesizeq] + movu m2, [dstq + linesizeq + mmsize] vfmadd231pd m1, m0, [srcq + linesizeq] + vfmadd231pd m2, m0, [srcq + linesizeq + mmsize] - add linesizeq, mmsize + add linesizeq, 2 * mmsize jg .store_partial - movu [dstq + linesizeq - mmsize], m1 + movu [dstq + linesizeq - 2 * mmsize], m1 + movu [dstq + linesizeq - mmsize], m2 js .loop jmp .finish .store_partial: + cmp linesizeq, mmsize + jg .store_low + movu [dstq + linesizeq - 2 * mmsize], m1 + jz .finish + mova m1, m2 +.store_low: sub linesizeq, ELEM_SIZE jz .store3 sub linesizeq, ELEM_SIZE |