From 57d4eec367a6e96323588f74acf5c48974383a45 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 2 Apr 2019 10:25:08 +0200 Subject: egs: optimize the correction step --- ndarray.asm | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 ndarray.asm (limited to 'ndarray.asm') diff --git a/ndarray.asm b/ndarray.asm new file mode 100644 index 0000000..94a4ec9 --- /dev/null +++ b/ndarray.asm @@ -0,0 +1,70 @@ +; +; SIMD for basic linear algebra +; Copyright 2018 Anton Khirnov +; +; This program is free software: you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation, either version 3 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program. If not, see <http://www.gnu.org/licenses/>.
;/


%include "config.asm"
%include "x86inc.asm"

SECTION .text

; double precision
%define ELEM_SIZE 8

;------------------------------------------------------------------------------
; line_madd(linesize, dst, src), scalar multiplier c in the low qword of m0
;
; Computes dst[i] += c * src[i] for i in [0, linesize), where every element
; is ELEM_SIZE bytes wide (double precision).
;
; In:    linesize  number of elements; nothing is touched when it is <= 0
;        dst       line that is updated in place
;        src       line that is multiplied by c and added to dst
;        m0        low qword holds c
;                  NOTE(review): presumably the first floating-point argument
;                  of the C prototype (SysV: xmm0) -- confirm with the caller
; Clobb: the linesize/dst/src registers, m0, m1, flags
;
; Elements are processed one full vector (mmsize bytes) at a time using
; unaligned accesses. The last iteration always LOADS a full vector from
; both dst and src, even when only 1..3 elements remain; only the valid
; elements are stored back. Callers must therefore keep up to
; mmsize - ELEM_SIZE bytes past the end of both lines readable.
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal line_madd, 3, 3, 2, linesize, dst, src
    ; An empty (or negative) line must not be touched at all. Without this
    ; guard the partial-store path below would write one element at dst[0].
    test        linesizeq, linesizeq
    jle         .finish

    ; Convert the element count to bytes, move both pointers to the end of
    ; their lines and walk a negative byte offset up towards zero.
    shl         linesizeq, 3
    add         dstq, linesizeq
    add         srcq, linesizeq
    neg         linesizeq

    ; Broadcast c (low qword of m0) to all four lanes. This is done with
    ; AVX-only instructions: vpermq is an AVX2 instruction, whereas this
    ; function is declared for fma3 only.
    vmovddup    xm0, xm0                ; low 128 bits = { c, c }
    vinsertf128 m0, m0, xm0, 1          ; high 128 bits = low 128 bits

.loop:
    movu        m1, [dstq + linesizeq]
    vfmadd231pd m1, m0, [srcq + linesizeq]  ; m1 = dst + c * src

    add         linesizeq, mmsize
    jg          .store_partial          ; fewer than mmsize bytes were left

    ; the store does not modify flags, so js/jmp below still test the add
    movu        [dstq + linesizeq - mmsize], m1
    js          .loop                   ; offset still negative: more data
    jmp         .finish                 ; offset hit zero: exact multiple

.store_partial:
    ; Here linesizeq = mmsize - r * ELEM_SIZE, where r in {1, 2, 3} is the
    ; number of valid elements in m1. The first of them belongs at
    ; [dstq + linesizeq - mmsize].
    sub         linesizeq, ELEM_SIZE
    jz          .store3                 ; r == 3
    sub         linesizeq, ELEM_SIZE
    jz          .store2                 ; r == 2
    ; otherwise r == 1

.store1:
    ; linesizeq was lowered by 2 * ELEM_SIZE above; add that back to reach
    ; the write position, then store a single element
    movq        [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
    jmp         .finish
.store2:
    ; linesizeq was lowered by 2 * ELEM_SIZE above; store the low two
    ; elements at the write position
    movu        [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
    jmp         .finish
.store3:
    ; linesizeq was lowered by 1 * ELEM_SIZE above; store the low two
    ; elements, then the third one from the upper half of m1
    movu        [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1
    vextractf128 xm1, m1, 1
    movq        [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1

.finish:
    RET

; -- cgit v1.2.3