From 57d4eec367a6e96323588f74acf5c48974383a45 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Tue, 2 Apr 2019 10:25:08 +0200
Subject: egs: optimize the correction step

---
 ndarray.asm | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 ndarray.asm

(limited to 'ndarray.asm')

diff --git a/ndarray.asm b/ndarray.asm
new file mode 100644
index 0000000..94a4ec9
--- /dev/null
+++ b/ndarray.asm
@@ -0,0 +1,70 @@
+;
+; SIMD for basic linear algebra
+; Copyright 2018 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program.  If not, see <http://www.gnu.org/licenses/>.
+;
+
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .text
+
+; double precision
+%define ELEM_SIZE 8
+
+; line_madd(linesize, dst, src): dst[i] += c * src[i] for i in [0, linesize),
+; where linesize counts doubles and c is read from the low double of xmm0.
+; NOTE(review): c in xmm0 matches a SysV-style call; a Win64 caller passing c
+; as a 4th argument would supply it in xmm3 -- confirm against the C prototype.
+; NOTE(review): the last iteration always loads mmsize bytes from dst and src,
+; i.e. up to 3 elements past the end of the line (only the stores are exact);
+; presumably callers keep that range readable -- confirm.
+INIT_YMM fma3
+cglobal line_madd, 3, 3, 2, linesize, dst, src
+    shl linesizeq, 3                        ; elements -> bytes (ELEM_SIZE == 8)
+    jz .finish                              ; empty line: touch no memory at all
+    add dstq, linesizeq                     ; point both buffers at their end and
+    add srcq, linesizeq                     ; index them with a negative offset
+    neg linesizeq                           ; that counts up towards zero
+
+    ; Broadcast c to all four lanes with AVX-only instructions: the function
+    ; is built for fma3, whereas vpermq would additionally require AVX2.
+    vmovddup    xm0, xm0
+    vinsertf128 m0, m0, xm0, 1
+
+.loop:
+    movu m1, [dstq + linesizeq]
+    vfmadd231pd m1, m0, [srcq + linesizeq]  ; m1 = dst + c * src
+
+    add linesizeq, mmsize
+    jg .store_partial                       ; fewer than mmsize bytes were left
+    movu [dstq + linesizeq - mmsize], m1    ; movu keeps the flags set by add
+    js .loop
+    jmp .finish                             ; offset reached exactly zero
+
+.store_partial:
+    ; linesizeq == mmsize - n * ELEM_SIZE with n = 1..3 elements left to store,
+    ; the first of which belongs at dstq + linesizeq - mmsize
+    cmp linesizeq, mmsize - 2 * ELEM_SIZE
+    jg .store1                              ; n == 1
+    movu [dstq + linesizeq - mmsize], xm1   ; n >= 2: store the low two elements
+    je .finish                              ; n == 2 (flags still from cmp)
+    vextractf128 xm1, m1, 1                 ; n == 3: third element is in the
+    add linesizeq, 2 * ELEM_SIZE            ; upper lane, two slots further on
+.store1:
+    movq [dstq + linesizeq - mmsize], xm1
+.finish:
+    RET
-- 
cgit v1.2.3