%include "x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; scalarproduct_metric(len1, len2, mat, vec1, vec2)
;
;                              len1 len2
; compute vec2^T·mat·vec1  =    ∑    ∑   mat[i, j] vec1[i] vec2[j]
;                              i=1  j=1
;
; All elements are 8-byte doubles (lengths are scaled by 8, packed-double ops).
;
; Memory layout, as accessed by the code:
;   - index i (paired with vec1) is the contiguous one: the len1 coefficients
;     that multiply vec2[j] start at byte offset j * len1 * 8 from mat.
;
; Result:
;   - the sum is left in the low double of xmm0.
;   - if len1 <= 0 or len2 <= 0 the empty sum 0.0 is returned and no memory
;     is read.
;
; Requirements imposed by the implementation:
;   - len1 and len2 must be multiples of mmsize / 8 (2 for XMM, 4 for YMM):
;     both loops advance by mmsize bytes and stop on the sign of the counter.
;   - vec1 is read with mova and must therefore be mmsize-aligned.
;   - mat and vec2 are used as memory operands of FMULADD_PD; with legacy SSE
;     encodings such operands must be 16-byte aligned.
;   - NOTE(review): len1/len2 are used as full 64-bit registers; this assumes
;     the caller passes 64-bit-clean lengths -- confirm against the C prototype.
;
; FMULADD_PD dst, a, b, c, tmp is defined outside this file; it is used here as
; dst = a * b + c, with tmp presumably a scratch register for non-FMA targets.
;
; Register roles:
;   m0      running total (one partial sum per lane)
;   m1..m4  per-row accumulators (m3/m4 only when mmsize == 32)
;   m5      current chunk of vec1
;   m6      scratch handed to FMULADD_PD
;   r6      3 * len1 in bytes (offset of the fourth row of a row group)
;   rowpos  negative byte offset into vec1, counts up to 0
;   len2    negative byte offset into vec2, counts up to 0
;------------------------------------------------------------------------------
%macro SCALARPRODUCT_METRIC 0
cglobal scalarproduct_metric, 5, 7, 7, len1, len2, mat, vec1, vec2, rowpos
    xorpd           m0, m0                  ; total = 0.0

    ; Both loops below are do-while shaped; bail out before touching memory
    ; when there is nothing to sum.
    test            len1q, len1q
    jle             .reduce
    test            len2q, len2q
    jle             .reduce

    shl             len2q, 3                ; element counts -> byte counts
    shl             len1q, 3
    add             vec1q, len1q            ; point past the end of each vector
    add             vec2q, len2q            ; and index with negative offsets
    neg             len2q
    lea             r6, [3 * len1q]

.loop_2:                                    ; one group of mmsize/8 rows
    mov             rowposq, len1q
    neg             rowposq
    xorpd           m1, m1
    xorpd           m2, m2
%if mmsize == 32
    xorpd           m3, m3
    xorpd           m4, m4
%endif

.loop_1:                                    ; walk along the rows of the group
    mova            m5, [vec1q + rowposq]
%if mmsize == 32
    FMULADD_PD      m4, m5, [matq + r6q], m4, m6
    FMULADD_PD      m3, m5, [matq + 2 * len1q], m3, m6
%endif
    FMULADD_PD      m2, m5, [matq + 1 * len1q], m2, m6
    FMULADD_PD      m1, m5, [matq + 0 * len1q], m1, m6
    add             matq, mmsize
    add             rowposq, mmsize
    js              .loop_1                 ; until rowpos reaches 0

    ; Collapse every accumulator to one scalar per row and pack them so that
    ; lane k of m1 holds the dot product of row k of the group with vec1.
    haddpd          m1, m2
%if mmsize == 32
    vextractf128    xmm2, ymm1, 1
    addpd           xmm1, xmm2
    haddpd          m3, m4
    vextractf128    xmm4, ymm3, 1
    addpd           xmm3, xmm4
    vinsertf128     ymm1, ymm1, xmm3, 1
%endif
    FMULADD_PD      m0, m1, [vec2q + len2q], m0, m6

    ; loop_1 already advanced matq by one row; skip the remaining rows of
    ; the group that was just processed.
%if mmsize == 32
    add             matq, r6
%else
    add             matq, len1q
%endif
    add             len2q, mmsize
    js              .loop_2                 ; until len2 offset reaches 0

.reduce:
    ; Horizontal sum of the lanes of m0 into the low double of xmm0.
    haddpd          m0, m0
%if mmsize == 32
    vextractf128    xmm1, ymm0, 1
    addpd           xmm0, xmm1
%endif
    ; No emms here: only XMM/YMM registers are used, MMX state is untouched.
    RET
%endmacro

INIT_XMM sse3
SCALARPRODUCT_METRIC
INIT_YMM avx
SCALARPRODUCT_METRIC
INIT_YMM fma3
SCALARPRODUCT_METRIC