1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
%include "x86util.asm"
SECTION .text
; compute vec2^T·mat·vec1 = ∑_{i=1}^{len1} ∑_{j=1}^{len2} mat[i, j] vec1[i] vec2[j]
;-----------------------------------------------------------------------------
; SCALARPRODUCT_METRIC
;
; Emits one scalarproduct_metric function for the vector width / cpuflags that
; were selected by the preceding INIT_XMM / INIT_YMM.
;
; Arguments (cglobal order): len1, len2, mat, vec1, vec2
; Result (low double of xmm0):
;     sum over j < len2 of  vec2[j] * ( sum over i < len1 of
;                                       mat[j * len1 + i] * vec1[i] )
;
; Requirements that follow from the code below:
;   - every element is an 8-byte double (both lengths are scaled by 8);
;   - mat is read as len2 consecutive rows of len1 doubles each;
;   - both loops are do-while loops stepping mmsize bytes, so len1 and len2
;     must be non-zero multiples of mmsize / 8 (2 for XMM, 4 for YMM);
;   - vec1 is loaded with mova, i.e. an aligned load of mmsize bytes.
;
; NOTE(review): FMULADD_PD is defined outside this chunk; presumably it does
;   dst = src1 * src2 + addend with the 5th operand as scratch. Its memory
;   operands (mat, vec2) may therefore also need alignment in the non-VEX
;   build -- confirm against its definition.
; NOTE(review): len1 / len2 are used as full-width registers; this assumes
;   the C prototype passes pointer-sized integers -- confirm.
;
; Register roles:
;   m0        running total over all rows (one partial sum per lane)
;   m1 .. m4  per-row accumulators (m3 / m4 only when mmsize == 32)
;   m5        current chunk of vec1
;   m6        scratch operand handed to FMULADD_PD
;   r6        3 * row size in bytes
;   rowpos    negative byte offset into vec1, counts up to zero
;   len2      negative byte offset into vec2, counts up to zero
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT_METRIC 0
cglobal scalarproduct_metric, 5, 7, 7, len1, len2, mat, vec1, vec2, rowpos
    ; Turn the element counts into byte counts, point vec1 / vec2 at their
    ; ends, and walk them with negative offsets so the loop condition is just
    ; the sign flag of the offset.
    shl         len2q, 3
    shl         len1q, 3
    add         vec1q, len1q
    add         vec2q, len2q
    neg         len2q
    lea         r6, [3 * len1q]         ; byte distance to the 4th row of a group
    xorpd       m0, m0                  ; total = 0

.loop_2:                                ; one group of mmsize / 8 rows per pass
    mov         rowposq, len1q
    neg         rowposq                 ; rowpos = -len1 (in bytes)
    xorpd       m1, m1
    xorpd       m2, m2
%if mmsize == 32
    xorpd       m3, m3
    xorpd       m4, m4
%endif

.loop_1:                                ; one chunk of vec1 against each row
    mova        m5, [vec1q + rowposq]
%if mmsize == 32
    FMULADD_PD  m4, m5, [matq + r6q], m4, m6
    FMULADD_PD  m3, m5, [matq + 2 * len1q], m3, m6
%endif
    FMULADD_PD  m2, m5, [matq + 1 * len1q], m2, m6
    FMULADD_PD  m1, m5, [matq + 0 * len1q], m1, m6
    add         matq, mmsize
    add         rowposq, mmsize
    js          .loop_1
    ; Here matq has advanced by len1q bytes, i.e. by exactly one row.

    ; Collapse each row accumulator to a single double and pack the results
    ; so that lane k of m1 holds the dot product of row k of this group.
    haddpd      m1, m2
%if mmsize == 32
    ; haddpd works per 128-bit lane, so the high lane still has to be folded
    ; into the low one before the two row pairs are merged.
    vextractf128 xmm2, ymm1, 1
    addpd       xmm1, xmm2              ; xmm1 = { row0, row1 }
    haddpd      m3, m4
    vextractf128 xmm4, ymm3, 1
    addpd       xmm3, xmm4              ; xmm3 = { row2, row3 }
    vinsertf128 ymm1, ymm1, xmm3, 1     ; ymm1 = { row0, row1, row2, row3 }
%endif
    FMULADD_PD  m0, m1, [vec2q + len2q], m0, m6

    ; Skip the remaining rows of this group (the inner loop covered one).
%if mmsize == 32
    add         matq, r6
%else
    add         matq, len1q
%endif
    add         len2q, mmsize
    js          .loop_2

    ; Horizontal sum of the total into the low double of xmm0.
    haddpd      m0, m0
%if mmsize == 32
    vextractf128 xmm1, ymm0, 1
    addpd       xmm0, xmm1
%endif
    ; No emms: only XMM / YMM registers are used, never MMX, so there is no
    ; MMX state to clear before returning.
    RET
%endmacro
; Instantiate the macro once per supported instruction set. Each INIT_* line
; selects the vector width (mmsize) and cpuflags that the macro body tests.

; 128-bit XMM build (mmsize == 16, two doubles per vector); haddpd needs SSE3.
INIT_XMM sse3
SCALARPRODUCT_METRIC
; 256-bit YMM build (mmsize == 32, four doubles per vector).
INIT_YMM avx
SCALARPRODUCT_METRIC
; 256-bit YMM build with the fma3 cpuflag set; presumably FMULADD_PD then
; expands to a fused multiply-add (it is defined outside this chunk).
INIT_YMM fma3
SCALARPRODUCT_METRIC
|