aboutsummaryrefslogtreecommitdiff
path: root/ndarray.asm
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-04-02 10:25:08 +0200
committer Anton Khirnov <anton@khirnov.net> 2019-04-02 11:01:58 +0200
commit 57d4eec367a6e96323588f74acf5c48974383a45 (patch)
tree ad86c14d71f89e2281b912692b4a93acc6d03530 /ndarray.asm
parent bcc67122331d63e38768e7c6c9633be4fc7bd09b (diff)
egs: optimize the correction step
Diffstat (limited to 'ndarray.asm')
-rw-r--r-- ndarray.asm 70
1 files changed, 70 insertions, 0 deletions
diff --git a/ndarray.asm b/ndarray.asm
new file mode 100644
index 0000000..94a4ec9
--- /dev/null
+++ b/ndarray.asm
@@ -0,0 +1,70 @@
+;
+; SIMD for basic linear algebra
+; Copyright 2018 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;/
+
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .text
+
+; double precision
+%define ELEM_SIZE 8
+
+INIT_YMM fma3
+;------------------------------------------------------------------------------
+; line_madd(linesize, dst, src), scalar multiplier c in the low double of xmm0
+;
+;     dst[i] += c * src[i]        for 0 <= i < linesize
+;
+; In:    linesize = number of double-precision elements in the line
+;        dst      = destination line, updated in place
+;        src      = source line
+;        xmm0     = c in the low 64 bits (the remaining bits are ignored)
+; Clobb: m0, m1, flags; linesize/dst/src are modified
+;
+; NOTE(review): presumably called from C as
+;     void line_madd(ptrdiff_t linesize, double *dst, const double *src, double c)
+; under the SysV ABI (c is read from xmm0) -- confirm against the prototype.
+;
+; All memory accesses are unaligned-safe (movu / memory operand of a VEX op).
+; The last, partial vector is still LOADED in full from both dst and src, so
+; up to 3 elements past the end of each line are read (never written); the
+; caller must make sure those reads cannot fault.
+;------------------------------------------------------------------------------
+cglobal line_madd, 3, 3, 2, linesize, dst, src
+    ; Nothing to do for an empty line. Without this check the loop below would
+    ; run once, overshoot by a full vector and fall into .store1, writing one
+    ; element past the (empty) destination.
+    test linesizeq, linesizeq
+    jle .finish
+
+    ; Point dst/src at the END of the line and turn linesize into a negative
+    ; byte offset that counts up towards zero.
+    shl linesizeq, 3                            ; elements -> bytes (* ELEM_SIZE)
+    add dstq, linesizeq
+    add srcq, linesizeq
+    neg linesizeq
+
+    ; Broadcast c to all four lanes of m0 using AVX-only instructions, so the
+    ; function needs nothing beyond what its fma3 tag implies (the register
+    ; form of vpermq would additionally require AVX2).
+    vmovddup    xm0, xm0                        ; xm0 = { c, c }
+    vinsertf128 m0, m0, xm0, 1                  ; m0  = { c, c, c, c }
+
+.loop:
+    movu m1, [dstq + linesizeq]
+    vfmadd231pd m1, m0, [srcq + linesizeq]      ; m1 = dst + c * src
+
+    add linesizeq, mmsize
+    jg .store_partial                           ; overshot the end: fewer than
+                                                ; mmsize / ELEM_SIZE elements left
+
+    ; movu does not touch the flags, so js/jmp below still test the add above
+    movu [dstq + linesizeq - mmsize], m1
+    js .loop                                    ; offset still negative: more data
+    jmp .finish                                 ; offset exactly zero: done
+
+.store_partial:
+    ; linesizeq is the overshoot in bytes: 1, 2 or 3 elements, which leaves
+    ; 3, 2 or 1 valid elements in m1 respectively.
+    sub linesizeq, ELEM_SIZE
+    jz .store3
+    sub linesizeq, ELEM_SIZE
+    jz .store2
+
+.store1:
+    ; linesizeq is now (mmsize - 2 * ELEM_SIZE) bytes past the write offset
+    movq [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
+    jmp .finish
+.store2:
+    ; linesizeq is now (mmsize - 2 * ELEM_SIZE) bytes past the write offset
+    movu [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
+    jmp .finish
+.store3:
+    ; linesizeq is now (mmsize - 1 * ELEM_SIZE) bytes past the write offset;
+    ; store the low two elements, then the third from the upper 128-bit lane
+    movu [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1
+    vextractf128 xm1, m1, 1
+    movq [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1
+
+.finish:
+
+    RET