; readwrite.asm
;
; Copyright 2019 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see <http://www.gnu.org/licenses/>.
;

%include "config.asm"
%include "x86inc.asm"

SECTION .rodata align=64

; Eight double-precision 1.0 values (64 bytes). mem_write loads one
; vector register from the start of this table and uses it as the
; pattern it stores into the destination buffer.
const1:  times 8 dq  1.0

SECTION .text

;-----------------------------------------------------------------------
; mem_write(dst, dstlen)
;
; Fills the buffer at dst with the vector loaded from const1, using
; aligned full-width stores, eight vectors (8 * mmsize bytes) per loop
; iteration.
;
; In:    dst    - start of the buffer; written with mova, so it has to
;                 be aligned to mmsize
;        dstlen - buffer size in bytes; the loop only stores whole
;                 groups of 8 * mmsize bytes, so the caller must pass a
;                 multiple of that (a partial tail would be overrun).
;                 Zero is accepted and stores nothing.
; Out:   no return value is set by this code
; Clobb: m0, flags; the dst and dstlen registers are modified
;-----------------------------------------------------------------------
INIT_YMM avx
cglobal mem_write, 2, 2, 1, dst, dstlen
    ; Point dst at the end of the buffer and keep a negative byte offset
    ; in dstlen that counts up towards zero: the single add at the
    ; bottom of the loop then both advances and produces the loop flag.
    add dstq, dstlenq
    neg dstlenq
    jz .done                    ; empty buffer: do not touch memory

    mova m0, [const1]

.loop:
    mova [dstq + dstlenq + mmsize * 0], m0
    mova [dstq + dstlenq + mmsize * 1], m0
    mova [dstq + dstlenq + mmsize * 2], m0
    mova [dstq + dstlenq + mmsize * 3], m0
    mova [dstq + dstlenq + mmsize * 4], m0
    mova [dstq + dstlenq + mmsize * 5], m0
    mova [dstq + dstlenq + mmsize * 6], m0
    mova [dstq + dstlenq + mmsize * 7], m0

    add dstlenq, mmsize * 8
    js .loop                    ; offset still negative: more to store

.done:
    RET

;-----------------------------------------------------------------------
; mem_read(src, srclen)
;
; Reads the buffer at src with aligned full-width loads, eight vectors
; (8 * mmsize bytes) per loop iteration. Every load targets m0 and the
; loaded data is never used; only the memory traffic matters.
;
; In:    src    - start of the buffer; read with mova, so it has to be
;                 aligned to mmsize
;        srclen - buffer size in bytes; the loop only reads whole
;                 groups of 8 * mmsize bytes, so the caller must pass a
;                 multiple of that (a partial tail would be over-read).
;                 Zero is accepted and reads nothing.
; Out:   no return value is set by this code
; Clobb: m0, flags; the src and srclen registers are modified
;-----------------------------------------------------------------------
INIT_YMM avx
cglobal mem_read, 2, 2, 1, src, srclen
    ; Point src at the end of the buffer and keep a negative byte offset
    ; in srclen that counts up towards zero: the single add at the
    ; bottom of the loop then both advances and produces the loop flag.
    add srcq, srclenq
    neg srclenq
    jz .done                    ; empty buffer: do not touch memory

.loop:
    mova         m0, [srcq + srclenq + mmsize * 0]
    mova         m0, [srcq + srclenq + mmsize * 1]
    mova         m0, [srcq + srclenq + mmsize * 2]
    mova         m0, [srcq + srclenq + mmsize * 3]
    mova         m0, [srcq + srclenq + mmsize * 4]
    mova         m0, [srcq + srclenq + mmsize * 5]
    mova         m0, [srcq + srclenq + mmsize * 6]
    mova         m0, [srcq + srclenq + mmsize * 7]

    add srclenq, mmsize * 8
    js .loop                    ; offset still negative: more to read

.done:
    RET