aboutsummaryrefslogtreecommitdiff
path: root/expansion.asm
diff options
context:
space:
mode:
Diffstat (limited to 'expansion.asm')
-rw-r--r--expansion.asm91
1 files changed, 91 insertions, 0 deletions
diff --git a/expansion.asm b/expansion.asm
new file mode 100644
index 0000000..4ac77bf
--- /dev/null
+++ b/expansion.asm
@@ -0,0 +1,91 @@
+%include "x86util.asm"
+
+SECTION .text
+
+; compute vec2^T·mat·vec1
+;   = ∑_{i=1..len1} ∑_{j=1..len2} mat[i, j] vec1[i] vec2[j]
+; where i indexes vec1 and j indexes vec2
+%macro SCALARPRODUCT_METRIC 0
+;-----------------------------------------------------------------------
+; scalarproduct_metric(len1, len2, mat, vec1, vec2)
+;
+; Double-precision bilinear form: accumulates vec1[i] * M(j, i) * vec2[j]
+; over all i in [0, len1) and j in [0, len2), where M(j, i) is the double
+; stored at byte offset 8 * (j * len1 + i) from mat (rows of len1 doubles,
+; one row per element of vec2).
+;
+; In:      len1  number of doubles in vec1 (also the row length of mat)
+;          len2  number of doubles in vec2 (also the row count of mat)
+;          mat, vec1, vec2  pointers to arrays of doubles
+; Out:     low double of xmm0
+; Scratch: rowpos, r6, m0-m6
+;
+; Both loops are do-while loops stepping by mmsize bytes, so the data is
+; consumed in whole vectors (mmsize/8 doubles) and a zero length still
+; runs one pass.
+; NOTE(review): callers presumably pass lengths that are non-zero
+; multiples of mmsize/8, and suitably aligned buffers (vec1 is read with
+; mova) -- confirm against the C caller.
+; NOTE(review): the result is left in xmm0; the x86-32 C ABI returns
+; double in st0 -- confirm this is only built for x86-64.
+; NOTE(review): FMULADD_PD is provided by x86util.asm; it is used here as
+; dst = a * b + c with the last operand as a scratch register -- confirm.
+;-----------------------------------------------------------------------
+cglobal scalarproduct_metric, 5, 7, 7, len1, len2, mat, vec1, vec2, rowpos
+    shl          len2q, 3                   ; len2 = size of vec2 in bytes
+    shl          len1q, 3                   ; len1 = size of vec1 in bytes = row stride of mat
+
+    add          vec1q, len1q               ; point one past the end so that
+    add          vec2q, len2q               ; negative offsets count up to 0
+    neg          len2q                      ; len2 = -(bytes of vec2 still to do)
+
+    lea          r6, [3 * len1q]            ; three row strides; only used when mmsize == 32
+
+    xorpd        m0, m0                     ; m0 = grand total, one partial sum per lane
+
+.loop_2:                                    ; one pass per group of mmsize/8 matrix rows
+    mov          rowposq, len1q
+    neg          rowposq                    ; rowpos = -(bytes of vec1 still to do)
+
+    xorpd        m1, m1                     ; m1 = partial dot product of row 0 with vec1
+    xorpd        m2, m2                     ; m2 = same for row 1
+
+%if mmsize == 32
+    xorpd        m3, m3                     ; m3 = same for row 2
+    xorpd        m4, m4                     ; m4 = same for row 3
+%endif
+
+.loop_1:                                    ; one pass per vector of vec1
+    mova         m5, [vec1q + rowposq]      ; shared by every row of the group
+
+%if mmsize == 32
+    FMULADD_PD   m4, m5, [matq + r6q], m4, m6
+    FMULADD_PD   m3, m5, [matq + 2 * len1q], m3, m6
+%endif
+
+    FMULADD_PD   m2, m5, [matq + 1 * len1q], m2, m6
+    FMULADD_PD   m1, m5, [matq + 0 * len1q], m1, m6
+
+    add          matq, mmsize
+    add          rowposq, mmsize
+    js           .loop_1                    ; loop while rowpos is still negative
+
+    ; Reduce the per-row accumulators to one scalar per row and pack them
+    ; into m1 in row order, matching the layout of the vec2 vector.
+    haddpd       m1, m2
+
+%if mmsize == 32
+    ; haddpd works within each 128-bit half, so fold the high half onto
+    ; the low half to finish the sums for rows 0 and 1 ...
+    vextractf128 xmm2, ymm1, 1
+    addpd        xmm1, xmm2
+
+    ; ... do the same for rows 2 and 3 ...
+    haddpd       m3, m4
+    vextractf128 xmm4, ymm3, 1
+    addpd        xmm3, xmm4
+
+    ; ... and place them in the high half: m1 = row sums 0, 1, 2, 3.
+    vinsertf128  ymm1, ymm1, xmm3, 1
+%endif
+
+    FMULADD_PD   m0, m1, [vec2q + len2q], m0, m6
+
+    ; The inner loop advanced mat by exactly one row; skip the remaining
+    ; rows of the group that was just processed.
+%if mmsize == 32
+    add          matq, r6
+%else
+    add          matq, len1q
+%endif
+    add          len2q, mmsize
+    js           .loop_2                    ; loop while len2 is still negative
+
+    ; Horizontal sum of the per-lane totals into the low double of xmm0.
+    haddpd       m0, m0
+
+%if mmsize == 32
+    vextractf128 xmm1, ymm0, 1
+    addpd        xmm0, xmm1
+%endif
+
+    ; No emms here: only XMM/YMM registers are touched, so there is no
+    ; MMX state to clear.
+    ; NOTE(review): RET from x86inc is expected to emit vzeroupper for the
+    ; YMM variants -- confirm.
+    RET
+%endmacro
+
+; Expand SCALARPRODUCT_METRIC once per target instruction set. The macro
+; body switches on mmsize, so the INIT_* line in front of each expansion
+; selects the vector width and instruction forms of that copy.
+; NOTE(review): cglobal presumably appends the cpu name to the symbol
+; (x86inc convention) so that the three copies do not collide -- confirm.
+
+; 128-bit variant: two matrix rows per outer pass (mmsize == 32 parts skipped).
+INIT_XMM sse3
+SCALARPRODUCT_METRIC
+
+; 256-bit variant: four matrix rows per outer pass.
+INIT_YMM avx
+SCALARPRODUCT_METRIC
+
+; 256-bit variant built with the fma3 cpu flag set.
+; NOTE(review): FMULADD_PD presumably emits a fused multiply-add here -- confirm in x86util.asm.
+INIT_YMM fma3
+SCALARPRODUCT_METRIC
+