;
; SIMD for basic linear algebra
; Copyright 2018 Anton Khirnov
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see http://www.gnu.org/licenses/.
;/

%include "config.asm"
%include "x86inc.asm"

SECTION .text

; double precision
%define ELEM_SIZE 8

;------------------------------------------------------------------------------
; line_madd: dst[i] += c * src[i]   for 0 <= i < linesize   (doubles)
;
; In:   linesize  number of ELEM_SIZE-byte elements to process
;       dst       destination line, read and written (unaligned access is used)
;       src       source line, read only (unaligned access is used)
;       m0        low 64 bits hold the multiplier c; it is not a named cglobal
;                 argument, the code simply expects it to be there on entry
;                 NOTE(review): this matches the first floating-point argument
;                 register of the SysV AMD64 ABI; on Win64 a fourth positional
;                 argument would arrive in xmm3 instead -- confirm against the
;                 C prototype and the platforms this is built for.
; Clobbers: linesize, dst, src, m0, m1, flags
;
; linesize <= 0 is a no-op.
;
; The main loop always loads a full mmsize-byte vector from both dst and src,
; also on the last, partial iteration.  Up to mmsize - ELEM_SIZE bytes past the
; end of each line are therefore READ (never written); callers must make sure
; that memory is readable.
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal line_madd, 3, 3, 2, linesize, dst, src
    ; Bail out on an empty (or negative) line.  Without this check the loop
    ; body would run once and the tail code would store one element at dst[0].
    ; test is used instead of the flags of the shl below, because OF is
    ; undefined after a shift by more than one bit.
    test            linesizeq, linesizeq
    jle             .finish

    ; Convert the element count to a byte count (3 == log2(ELEM_SIZE)), point
    ; dst/src at the END of the line and walk a negative byte offset up to 0.
    shl             linesizeq, 3
    add             dstq, linesizeq
    add             srcq, linesizeq
    neg             linesizeq

    ; broadcast the multiplier from the lowest element to all four lanes
    vpermq          m0, m0, 0

.loop:
    ; m1 = dst[] + c * src[] for the next mmsize bytes
    movu            m1, [dstq + linesizeq]
    vfmadd231pd     m1, m0, [srcq + linesizeq]

    ; Advance first; the flags of this add drive both branches below
    ; (the vector store in between does not modify flags).
    add             linesizeq, mmsize
    jg              .store_partial      ; vector extends past the end of the line

    movu            [dstq + linesizeq - mmsize], m1
    js              .loop               ; offset still negative: more data left
    jmp             .finish             ; offset is exactly 0: line complete

.store_partial:
    ; Here linesizeq is the number of bytes by which the last vector overshoots
    ; the line: 1, 2 or 3 elements, leaving 3, 2 or 1 valid elements to store.
    ; The vector itself starts at dstq + linesizeq - mmsize; each sub below
    ; lowers linesizeq by ELEM_SIZE, which the store addresses add back.
    sub             linesizeq, ELEM_SIZE
    jz              .store3             ; overshoot was 1 element
    sub             linesizeq, ELEM_SIZE
    jz              .store2             ; overshoot was 2 elements

.store1:
    ; overshoot was 3 elements; linesizeq has been lowered by 2 * ELEM_SIZE
    movq            [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
    jmp             .finish

.store2:
    ; linesizeq has been lowered by 2 * ELEM_SIZE; store the low two elements
    movu            [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
    jmp             .finish

.store3:
    ; linesizeq has been lowered by 1 * ELEM_SIZE; store the low two elements,
    ; then the third one from the upper 128-bit half
    movu            [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1
    vextractf128    xm1, m1, 1
    movq            [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1

.finish:
    RET