aboutsummaryrefslogtreecommitdiff
path: root/ndarray.asm
blob: 94a4ec974f959a475bb6c564f97266d4059a9686 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
;
; SIMD for basic linear algebra
; Copyright 2018 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see <http://www.gnu.org/licenses/>.
;/


%include "config.asm"
%include "x86inc.asm"

; code section
SECTION .text

; Size in bytes of one array element: the code below works on
; double-precision (64-bit) floating-point values.
%define ELEM_SIZE 8

INIT_YMM fma3
;------------------------------------------------------------------------------
; line_madd(linesize, dst, src)  -- plus a scalar in the low qword of m0
;
; Computes dst[i] += c * src[i] for 0 <= i < linesize on double-precision
; data, four elements (one ymm register) per iteration.
;
; In:
;   linesize - number of elements (not bytes); 0 is a no-op
;   dst      - destination array, read and written with unaligned accesses
;   src      - source array, read only with unaligned accesses
;   m0       - c is taken from the low qword on entry.
;              NOTE(review): this is not one of the declared cglobal
;              arguments; presumably it is the first floating-point argument
;              of the C prototype (xmm0 under the SysV ABI) -- confirm against
;              the caller, and note that Win64 would pass a 4th argument in
;              xmm3 instead.
; Clobbers: m0, m1, linesize, dst, src, flags
;
; The last iteration always LOADS a full vector from dst and src, so up to
; three elements past the end of both arrays are read (never written) when
; linesize is not a multiple of 4.
;------------------------------------------------------------------------------
cglobal line_madd, 3, 3, 2, linesize, dst, src
    ; element count -> byte count (shift by log2(ELEM_SIZE)); shl sets ZF from
    ; its result, so an empty line can bail out right here.  Without this
    ; guard the tail code below would store one element to dst[0].
    shl linesizeq, 3
    jz .finish

    ; point dst/src at the END of the line and walk a negative byte offset up
    ; towards zero, so a single add both advances and tests for termination
    add dstq, linesizeq
    add srcq, linesizeq
    neg linesizeq

    ; broadcast c to all four lanes using AVX1-only instructions: the function
    ; is declared for fma3, which does not imply AVX2, so the AVX2-only
    ; vpermq on ymm must not be used here
    vmovddup xm0, xm0                   ; xm0 = { c, c }
    vinsertf128 m0, m0, xm0, 1          ; m0  = { c, c, c, c }

.loop:
    movu m1, [dstq + linesizeq]
    vfmadd231pd m1, m0, [srcq + linesizeq]      ; m1 = dst + c * src

    add linesizeq, mmsize
    jg .store_partial                   ; overshot the end: 1-3 elements left

    ; movu does not touch flags, so js/fall-through below still test the add
    movu [dstq + linesizeq - mmsize], m1
    js .loop                            ; offset still negative: more to do
    jmp .finish                         ; offset hit exactly zero: done

.store_partial:
    ; linesizeq is the overshoot in bytes: 1, 2 or 3 elements past the end,
    ; i.e. 3, 2 or 1 valid elements in m1 respectively
    sub linesizeq, ELEM_SIZE
    jz .store3
    sub linesizeq, ELEM_SIZE
    jz .store2

.store1:
    ; one valid element; linesizeq has been reduced by 2 elements, so the
    ; write position is linesizeq - mmsize + 2 * ELEM_SIZE
    movq [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
    jmp .finish
.store2:
    ; two valid elements; linesizeq has been reduced by 2 elements (now 0)
    movu [dstq + linesizeq - mmsize + 2 * ELEM_SIZE], xm1
    jmp .finish
.store3:
    ; three valid elements; linesizeq has been reduced by 1 element (now 0):
    ; store the low two, then the third from the upper 128-bit lane
    movu [dstq + linesizeq - mmsize + 1 * ELEM_SIZE], xm1
    vextractf128 xm1, m1, 1
    movq [dstq + linesizeq - mmsize + 3 * ELEM_SIZE], xm1

.finish:

    RET