diff options
Diffstat (limited to 'libavutil/x86/float_dsp.asm')
-rw-r--r-- | libavutil/x86/float_dsp.asm | 46 |
1 file changed, 39 insertions, 7 deletions
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index d96249978a..d0f4be8c53 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -1,20 +1,22 @@ ;***************************************************************************** ;* x86-optimized Float DSP functions ;* -;* This file is part of Libav. +;* Copyright 2006 Loren Merritt ;* -;* Libav is free software; you can redistribute it and/or +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. 
;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -48,8 +50,10 @@ ALIGN 16 INIT_XMM sse VECTOR_FMUL +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMUL +%endif ;------------------------------------------------------------------------------ ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) @@ -76,10 +80,17 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len .loop: %assign a 0 %rep 32/mmsize +%if cpuflag(fma3) + mova m1, [dstq+lenq+(a+0)*mmsize] + mova m2, [dstq+lenq+(a+1)*mmsize] + fmaddps m1, m0, [srcq+lenq+(a+0)*mmsize], m1 + fmaddps m2, m0, [srcq+lenq+(a+1)*mmsize], m2 +%else mulps m1, m0, [srcq+lenq+(a+0)*mmsize] mulps m2, m0, [srcq+lenq+(a+1)*mmsize] addps m1, m1, [dstq+lenq+(a+0)*mmsize] addps m2, m2, [dstq+lenq+(a+1)*mmsize] +%endif mova [dstq+lenq+(a+0)*mmsize], m1 mova [dstq+lenq+(a+1)*mmsize], m2 %assign a a+2 @@ -91,8 +102,14 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len INIT_XMM sse VECTOR_FMAC_SCALAR +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMAC_SCALAR +%endif ;------------------------------------------------------------------------------ ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) @@ -176,16 +193,23 @@ VECTOR_DMUL_SCALAR ; const float *src2, int len) ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_ADD 0 -cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len +cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: mova m0, [src0q + lenq] mova m1, [src0q + lenq + mmsize] +%if cpuflag(fma3) + mova m2, 
[src2q + lenq] + mova m3, [src2q + lenq + mmsize] + fmaddps m0, m0, [src1q + lenq], m2 + fmaddps m1, m1, [src1q + lenq + mmsize], m3 +%else mulps m0, m0, [src1q + lenq] mulps m1, m1, [src1q + lenq + mmsize] addps m0, m0, [src2q + lenq] addps m1, m1, [src2q + lenq + mmsize] +%endif mova [dstq + lenq], m0 mova [dstq + lenq + mmsize], m1 @@ -196,8 +220,14 @@ ALIGN 16 INIT_XMM sse VECTOR_FMUL_ADD +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMUL_ADD +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMUL_ADD +%endif ;----------------------------------------------------------------------------- ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, @@ -233,8 +263,10 @@ ALIGN 16 INIT_XMM sse VECTOR_FMUL_REVERSE +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMUL_REVERSE +%endif ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) INIT_XMM sse @@ -272,8 +304,8 @@ cglobal butterflies_float, 3,3,3, src0, src1, len test lenq, lenq jz .end shl lenq, 2 - lea src0q, [src0q + lenq] - lea src1q, [src1q + lenq] + add src0q, lenq + add src1q, lenq neg lenq .loop: mova m0, [src0q + lenq] |