summary | refs | log | tree | commit | diff
path: root/ndarray.asm
diff options
context:
space:
mode:
Diffstat (limited to 'ndarray.asm')
-rw-r--r-- ndarray.asm 18
1 files changed, 16 insertions, 2 deletions
diff --git a/ndarray.asm b/ndarray.asm
index 94a4ec9..b5f4c22 100644
--- a/ndarray.asm
+++ b/ndarray.asm
@@ -34,18 +34,32 @@ cglobal line_madd, 3, 3, 2, linesize, dst, src
vpermq m0, m0, 0
+%define PREFETCH_DIST 4
.loop:
+ ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 0)]
+ ;prefetcht0 [srcq + linesizeq + mmsize * (PREFETCH_DIST + 1)]
+ ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 0)]
+ ;prefetcht0 [dstq + linesizeq + mmsize * (PREFETCH_DIST + 1)]
movu m1, [dstq + linesizeq]
+ movu m2, [dstq + linesizeq + mmsize]
vfmadd231pd m1, m0, [srcq + linesizeq]
+ vfmadd231pd m2, m0, [srcq + linesizeq + mmsize]
- add linesizeq, mmsize
+ add linesizeq, 2 * mmsize
jg .store_partial
- movu [dstq + linesizeq - mmsize], m1
+ movu [dstq + linesizeq - 2 * mmsize], m1
+ movu [dstq + linesizeq - mmsize], m2
js .loop
jmp .finish
.store_partial:
+ cmp linesizeq, mmsize
+ jg .store_low
+ movu [dstq + linesizeq - 2 * mmsize], m1
+ jz .finish
+ mova m1, m2
+.store_low:
sub linesizeq, ELEM_SIZE
jz .store3
sub linesizeq, ELEM_SIZE