summaryrefslogtreecommitdiff
path: root/transfer_interp.asm
blob: a6ae60f91c693ea72aca10190581ba14439d629b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
;
; SIMD for interpolation
; Copyright 2019 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see <http://www.gnu.org/licenses/>.
;/

%include "config.asm"
%include "x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; transfer_interp_line_cont_4(dst, dst_len, src, src_stride, idx_x, fact_x, fact_y)
;
; Interpolates one line of 8-byte floating point values with a 4x4 stencil.
; For every output element i in [0, dst_len) this evaluates (mathematically)
;
;     dst[i] = sum_{j=0..3} fact_y[j] *
;              sum_{k=0..3} fact_x[4 * i + k] * src[idx_x[i] + j * src_stride + k]
;
; In:
;   dst        - output line, dst_len elements of 8 bytes each
;   dst_len    - number of output elements; nothing is done when it is <= 0
;   src        - base of the source data
;   src_stride - distance between consecutive source rows, in 8-byte elements
;   idx_x      - dst_len 8-byte integers: element offset into src of the first
;                stencil column for each output element
;   fact_x     - 4 horizontal weights per output element (4 * dst_len values)
;   fact_y     - 4 vertical weights, shared by the whole line
;
; Vector registers:
;   m0-m3 - fact_y[0..3], each broadcast to all four lanes
;   m4    - the four fact_x weights of the current output element
;   m5    - accumulator
;   m6    - scratch
;
; Seven vector registers are declared so that the x86inc prologue preserves
; xmm6 on ABIs where it is callee-saved (Win64).  Only AVX and FMA3
; instructions are used, matching the cpuflags given to INIT_YMM.
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal transfer_interp_line_cont_4, 7, 8, 7, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                              idx_x_val
    shl src_strideq, 3                                  ; row stride: elements -> bytes
    shl dst_lenq,    3                                  ; line length: elements -> bytes

    ; advance the per-element arrays to their ends, so that a single negative
    ; offset counting up towards zero can index all of them
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]               ; fact_x holds 4 values per element
    neg dst_lenq
    jns .done                                           ; dst_len <= 0: no elements to process
    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    vbroadcastsd m0, [fact_yq + 0 * 8]                  ; fact y + 0 -> m0
    vbroadcastsd m1, [fact_yq + 1 * 8]                  ; fact y + 1 -> m1
    vbroadcastsd m2, [fact_yq + 2 * 8]                  ; fact y + 2 -> m2
    vbroadcastsd m3, [fact_yq + 3 * 8]                  ; fact y + 3 -> m3

.loop:
    mov idx_x_valq, [idx_xq + offsetq]                  ; source column of this element
    shl idx_x_valq, 3                                   ; elements -> bytes

    movu  m4, [fact_xq + 4 * offsetq]                   ; horizontal weights of this element

    ; row 0: acc = (fact_x * src) * fact_y[0]
    mulpd m5, m4, [srcq + idx_x_valq]
    mulpd m5, m0

    ; row 1: acc += (fact_x * src) * fact_y[1]
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m1

    ; row 2: acc += (fact_x * src) * fact_y[2]
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m2

    ; row 3: acc += (fact_x * src) * fact_y[3]
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m3

    ; horizontal sum of the four lanes of the accumulator: (a0 + a1) + (a2 + a3)
    haddpd       m5, m5                                 ; [a0+a1, a0+a1, a2+a3, a2+a3]
    vextractf128 xm6, m5, 1                             ; upper half -> xm6
    addpd        xm5, xm6                               ; low element = (a0+a1) + (a2+a3)

    movq [dstq + offsetq], xm5
    add offsetq, 8
    js .loop                                            ; continue while the offset is negative

.done:
    RET