From 7d7487e85c066bf3f4e5821a49081f520b6bc1e7 Mon Sep 17 00:00:00 2001
From: James Almer
Date: Mon, 10 Mar 2014 17:09:20 -0300
Subject: x86/float_dsp: add ff_vector_{fmul_add, fmac_scalar}_fma3

~7% faster than AVX

Signed-off-by: James Almer
Signed-off-by: Michael Niedermayer
---
 libavutil/x86/float_dsp.asm | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'libavutil/x86/float_dsp.asm')

diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index f762e34f6a..d0f4be8c53 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -80,10 +80,17 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
 .loop:
 %assign a 0
 %rep 32/mmsize
+%if cpuflag(fma3)
+    mova    m1, [dstq+lenq+(a+0)*mmsize]
+    mova    m2, [dstq+lenq+(a+1)*mmsize]
+    fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1
+    fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2
+%else
     mulps   m1, m0, [srcq+lenq+(a+0)*mmsize]
     mulps   m2, m0, [srcq+lenq+(a+1)*mmsize]
     addps   m1, m1, [dstq+lenq+(a+0)*mmsize]
     addps   m2, m2, [dstq+lenq+(a+1)*mmsize]
+%endif
     mova    [dstq+lenq+(a+0)*mmsize], m1
     mova    [dstq+lenq+(a+1)*mmsize], m2
 %assign a a+2
@@ -99,6 +106,10 @@ VECTOR_FMAC_SCALAR
 INIT_YMM avx
 VECTOR_FMAC_SCALAR
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
 
 ;------------------------------------------------------------------------------
 ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
@@ -182,16 +193,23 @@ VECTOR_DMUL_SCALAR
 ;                          const float *src2, int len)
 ;-----------------------------------------------------------------------------
 %macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
     lea     lenq, [lend*4 - 2*mmsize]
 ALIGN 16
 .loop:
     mova    m0, [src0q + lenq]
     mova    m1, [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+    mova    m2, [src2q + lenq]
+    mova    m3, [src2q + lenq + mmsize]
+    fmaddps m0, m0, [src1q + lenq], m2
+    fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
     mulps   m0, m0, [src1q + lenq]
     mulps   m1, m1, [src1q + lenq + mmsize]
     addps   m0, m0, [src2q + lenq]
     addps   m1, m1, [src2q + lenq + mmsize]
+%endif
     mova    [dstq + lenq], m0
     mova    [dstq + lenq + mmsize], m1
 
@@ -206,6 +224,10 @@ VECTOR_FMUL_ADD
 INIT_YMM avx
 VECTOR_FMUL_ADD
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
--
cgit v1.2.3