1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
;
; SIMD for interpolation
; Copyright 2019 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;/
%include "config.asm"
%include "x86inc.asm"
SECTION .text
;------------------------------------------------------------------------------
; transfer_interp_line_cont_4(dst, dst_len, src, src_stride, idx_x, fact_x, fact_y)
;
; Produces one line of dst_len doubles using a 4x4 stencil:
;
;   dst[i] = sum_{j=0..3} fact_y[j] *
;            sum_{k=0..3} fact_x[4 * i + k] * src[idx_x[i] + j * src_stride + k]
;
;   dst        - output line, one 8-byte element written per iteration
;   dst_len    - number of output elements; nothing is done when it is <= 0
;   src        - source data, read as packed doubles
;   src_stride - distance between consecutive source rows, in 8-byte elements
;   idx_x      - one 8-byte element index into src per output element
;   fact_x     - 4 doubles per output element
;   fact_y     - 4 doubles, loaded once and shared by every output element
;
; Vector register roles:
;   m0-m3  fact_y[0..3], each broadcast to all four lanes
;   m4     fact_x weights of the current output element
;   m5     accumulator
;   m6     per-row temporary
;
; Seven vector registers (m0-m6) are declared so that x86inc saves and
; restores xmm6, which is callee-saved in the Win64 ABI.
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal transfer_interp_line_cont_4, 7, 8, 7, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                              idx_x_val
    shl src_strideq, 3                      ; row stride: elements -> bytes
    shl dst_lenq,    3                      ; line length: elements -> bytes

    ; point every per-output array at its end, then walk a negative offset up
    ; to zero, so the loop counter doubles as the array index
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]   ; 4 doubles per output element
    neg dst_lenq
    jns .end                                ; dst_len <= 0: the loop below is do-while, skip it

    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    movu   m0, [fact_yq]
    vpermq m1, m0, 01010101b                ; fact y + 1 -> m1
    vpermq m2, m0, 10101010b                ; fact y + 2 -> m2
    vpermq m3, m0, 11111111b                ; fact y + 3 -> m3
    vpermq m0, m0, 00000000b                ; fact y + 0 -> m0

.loop:
    mov idx_x_valq, [idx_xq + offsetq]
    shl idx_x_valq, 3                       ; source index: elements -> bytes

    movu m4, [fact_xq + 4 * offsetq]        ; fact_x[4 * i .. 4 * i + 3]

    ; row 0
    mulpd m5, m4, [srcq + idx_x_valq]
    mulpd m5, m0

    ; row 1
    add   idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m1

    ; row 2
    add   idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m2

    ; row 3
    add   idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m3

    ; horizontal sum of the four lanes [a b c d] of m5
    haddpd m5, m5                           ; [a+b  a+b  c+d  c+d]
    vpermq m5, m5, 00001000b                ; [a+b  c+d  ...]
    haddpd m5, m5                           ; low lane = a+b+c+d

    movq [dstq + offsetq], xm5

    add offsetq, 8
    js .loop                                ; offset still negative -> more elements left

.end:
    RET
;------------------------------------------------------------------------------
; transfer_interp_line_cont_6(dst, dst_len, src, src_stride, idx_x, fact_x, fact_y)
;
; Produces one line of dst_len doubles using a 6x6 stencil:
;
;   dst[i] = sum_{j=0..5} fact_y[j] *
;            sum_{k=0..5} fact_x[6 * i + k] * src[idx_x[i] + j * src_stride + k]
;
;   dst        - output line, one 8-byte element written per iteration
;   dst_len    - number of output elements; nothing is done when it is <= 0
;   src        - source data, read as packed doubles
;   src_stride - distance between consecutive source rows, in 8-byte elements
;   idx_x      - one 8-byte element index into src per output element
;   fact_x     - 6 doubles per output element
;   fact_y     - 6 doubles, loaded once and shared by every output element
;
; Each row of six values is processed as a 4-wide part (ymm) and a 2-wide
; part (xmm, at byte offset mmsize).
;
; Vector register roles:
;   m0-m5   fact_y[0..5], each broadcast to all four lanes
;   m6/xm7  fact_x weights of the current output element (columns 0-3 / 4-5)
;   m8/m9   accumulators for columns 0-3 / 4-5
;   m10/m11 per-row temporaries for columns 0-3 / 4-5
;
; Twelve vector registers (m0-m11) are declared so that x86inc saves and
; restores xmm6-xmm11, which are callee-saved in the Win64 ABI.
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal transfer_interp_line_cont_6, 7, 9, 12, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                               idx_x_val, offset6
    shl src_strideq, 3                      ; row stride: elements -> bytes
    shl dst_lenq,    3                      ; line length: elements -> bytes

    ; point every per-output array at its end, then walk negative offsets up
    ; to zero, so the loop counter doubles as the array index
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]
    lea fact_xq, [fact_xq + 2 * dst_lenq]   ; 4 + 2 = 6 doubles per output element
    neg dst_lenq
    jns .end                                ; dst_len <= 0: the loop below is do-while, skip it

    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    ; offset6 = 6 * offset, the matching (negative) byte offset into fact_x
    lea offset6q, [offsetq + 2 * offsetq]
    add offset6q, offset6q

    movu   m0, [fact_yq]
    vpermq m1, m0, 01010101b                ; fact y + 1 -> m1
    vpermq m2, m0, 10101010b                ; fact y + 2 -> m2
    vpermq m3, m0, 11111111b                ; fact y + 3 -> m3
    vpermq m0, m0, 00000000b                ; fact y + 0 -> m0

    movu   xm4, [fact_yq + 8 * 4]
    vpermq m5, m4, 01010101b                ; fact y + 5 -> m5
    vpermq m4, m4, 0                        ; fact y + 4 -> m4

.loop:
    mov idx_x_valq, [idx_xq + offsetq]
    shl idx_x_valq, 3                       ; source index: elements -> bytes

    movu m6,  [fact_xq + offset6q]          ; fact_x[6 * i .. 6 * i + 3]
    movu xm7, [fact_xq + offset6q + mmsize] ; fact_x[6 * i + 4 .. 6 * i + 5]

    ; row 0
    mulpd m8,  m6,  [srcq + idx_x_valq]
    mulpd xm9, xm7, [srcq + idx_x_valq + mmsize]
    mulpd m8, m0
    mulpd m9, m0

    ; row 1
    add idx_x_valq, src_strideq
    mulpd m10, m6, [srcq + idx_x_valq]
    vfmadd231pd m8, m10, m1
    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd m9, m11, m1

    ; row 2
    add idx_x_valq, src_strideq
    mulpd m10, m6, [srcq + idx_x_valq]
    vfmadd231pd m8, m10, m2
    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd m9, m11, m2

    ; row 3
    add idx_x_valq, src_strideq
    mulpd m10, m6, [srcq + idx_x_valq]
    vfmadd231pd m8, m10, m3
    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd m9, m11, m3

    ; row 4
    add idx_x_valq, src_strideq
    mulpd m10, m6, [srcq + idx_x_valq]
    vfmadd231pd m8, m10, m4
    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd m9, m11, m4

    ; row 5
    add idx_x_valq, src_strideq
    mulpd m10, m6, [srcq + idx_x_valq]
    vfmadd231pd m8, m10, m5
    mulpd xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd m9, m11, m5

    ; horizontal sum of m8 = [a0 a1 a2 a3] and the low half of m9 = [b0 b1 . .]
    haddpd m8, m9                           ; [a0+a1  b0+b1  a2+a3  ...]
    vpermq m9, m8, 10b                      ; low lane of m9 = a2+a3
    haddpd xm8, xm8                         ; low lane of m8 = a0+a1+b0+b1
    addpd  m8, m9                           ; low lane = sum of all six columns

    movq [dstq + offsetq], xm8

    add offsetq,  8
    add offset6q, 8 * 6
    js .loop                                ; flags come from offset6q, which has the same sign as offsetq

.end:
    RET
|