1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
;
; SIMD for interpolation
; Copyright 2019 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;/
%include "config.asm"
%include "x86inc.asm"
SECTION .text
INIT_YMM fma3
;------------------------------------------------------------------------------
; transfer_interp_line_cont_4(dst, dst_len, src, src_stride,
;                             idx_x, fact_x, fact_y)
;
; Interpolates one line of output with a 4x4 stencil. For every output
; element i in [0, dst_len):
;
;   dst[i] = sum_{r=0..3} sum_{c=0..3}
;                fact_y[r] * fact_x[4 * i + c] * src[idx_x[i] + r * src_stride + c]
;
; Element sizes as used by the code below:
;   dst, src, fact_x, fact_y - 8-byte doubles (packed-double arithmetic)
;   idx_x                    - 8-byte integers, in units of src elements
;   src_stride, dst_len      - counts of elements, converted to bytes here
;   fact_x holds 4 doubles (32 bytes) per output element, fact_y 4 doubles
;   in total.
; All vector loads are unaligned-safe (movu / VEX memory operands).
;
; Vector register roles:
;   m0..m3 - fact_y[0..3], each broadcast to all four lanes
;   m4     - fact_x[4 * i .. 4 * i + 3]
;   m5     - accumulator
;   m6     - per-row product fact_x * src
; Seven vector registers are used, so seven are declared in cglobal; x86inc
; needs the exact count to preserve xmm6 and above on Win64.
;
; NOTE(review): vpermq on ymm registers is an AVX2 instruction, while the
; function is tagged fma3. The tag is left unchanged so the exported symbol
; name stays the same; confirm that the caller only selects this function on
; CPUs that also support AVX2.
;------------------------------------------------------------------------------
cglobal transfer_interp_line_cont_4, 7, 8, 7, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                              idx_x_val
    shl src_strideq, 3                      ; row stride: elements -> bytes
    shl dst_lenq,    3                      ; line length: elements -> bytes

    ; point every per-element array at its end, so that a single negative
    ; offset counting up towards zero can index all of them
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]   ; 32 bytes of fact_x per element
    neg dst_lenq
    jz .finish                              ; empty line: nothing to read or write

    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    movu   m0, [fact_yq]
    vpermq m1, m0, 01010101b                ; fact y + 1 -> m1
    vpermq m2, m0, 10101010b                ; fact y + 2 -> m2
    vpermq m3, m0, 11111111b                ; fact y + 3 -> m3
    vpermq m0, m0, 00000000b                ; fact y + 0 -> m0

.loop:
    mov idx_x_valq, [idx_xq + offsetq]      ; first source column for this element
    shl idx_x_valq, 3                       ; elements -> bytes

    movu m4, [fact_xq + 4 * offsetq]        ; four x factors for this element

    ; row 0
    mulpd m5, m4, [srcq + idx_x_valq]
    mulpd m5, m0

    ; row 1
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m1

    ; row 2
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m2

    ; row 3
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m3

    ; horizontal sum of the four lanes of m5 into its lowest lane
    haddpd m5, m5                           ; pairwise sums within each 128-bit half
    vpermq m5, m5, 00001000b                ; gather both pair sums into the low half
    haddpd m5, m5                           ; low lane = sum of all four lanes

    movq [dstq + offsetq], xm5

    add offsetq, 8
    js .loop                                ; offset is negative until the line is done

.finish:
    RET
|