diff options
Diffstat (limited to 'libavcodec/x86/dcadsp.asm')
-rw-r--r-- | libavcodec/x86/dcadsp.asm | 119 |
1 files changed, 77 insertions, 42 deletions
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index 56039baa6f..4746784795 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -2,20 +2,20 @@ ;* SSE-optimized functions for the DCA decoder ;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -199,66 +199,85 @@ INIT_XMM sse DCA_LFE_FIR 0 DCA_LFE_FIR 1 -INIT_XMM sse2 +%macro SETZERO 1 +%if cpuflag(sse2) && notcpuflag(avx) + pxor %1, %1 +%else + xorps %1, %1, %1 +%endif +%endmacro + +%macro SHUF 3 +%if cpuflag(avx) + mova %3, [%2 - 16] + vperm2f128 %1, %3, %3, 1 + vshufps %1, %1, %1, q0123 +%elif cpuflag(sse2) + pshufd %1, [%2], q0123 +%else + mova %1, [%2] + shufps %1, %1, q0123 +%endif +%endmacro + %macro INNER_LOOP 1 ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i ;~ a += window[i + j] * (-synth_buf[15 - i + j]) ;~ b += window[i + j + 16] * (synth_buf[i + j]) - pshufd m5, [ptr2 + j + (15 - 3) * 4], q0123 + SHUF m5, ptr2 + j + (15 - 3) * 4, m6 mova m6, [ptr1 + j] %if ARCH_X86_64 - pshufd m11, [ptr2 + j + (15 - 3) * 4 - mmsize], q0123 + SHUF m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12 mova m12, [ptr1 + j + mmsize] %endif - mulps m6, [win + %1 + j + 16 * 4] - mulps m5, [win + %1 + j] + FMULADD_PS m2, m6, [win + %1 + j + 16 * 4], m2, m6 + mulps m5, m5, [win + %1 + j] + subps m1, m1, m5 %if ARCH_X86_64 - mulps m12, [win + %1 + j + mmsize + 16 * 4] - mulps m11, [win + %1 + j + mmsize] -%endif - addps m2, m6 - subps m1, m5 -%if ARCH_X86_64 - addps m8, m12 - subps m7, m11 + FMULADD_PS m8, m12, [win + %1 + j + mmsize + 16 * 4], m8, m12 + mulps m11, m11, [win + %1 + j + mmsize] + subps m7, m7, m11 %endif ;~ c += window[i + j + 32] * (synth_buf[16 + i + j]) ;~ d += window[i + j + 48] * (synth_buf[31 - i + j]) - pshufd m6, [ptr2 + j + (31 - 3) * 4], q0123 + SHUF m6, ptr2 + j + (31 - 3) * 4, m5 mova m5, [ptr1 + j + 16 * 4] %if ARCH_X86_64 - pshufd m12, [ptr2 + j + (31 - 3) * 4 - mmsize], q0123 + SHUF m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11 mova m11, [ptr1 + j + mmsize + 16 * 4] %endif - mulps m5, [win + %1 + j + 32 * 4] - mulps m6, [win + %1 + j + 48 * 4] -%if ARCH_X86_64 - mulps m11, [win + %1 + j + mmsize + 32 * 4] - mulps m12, [win + %1 + j + mmsize + 48 * 4] -%endif - addps m3, m5 - addps m4, m6 + FMULADD_PS m3, m5, [win + %1 + j + 32 * 4], m3, m5 + FMULADD_PS m4, m6, [win + %1 + j + 48 * 4], m4, m6 %if ARCH_X86_64 - addps m9, m11 - addps m10, m12 + FMULADD_PS m9, m11, [win + %1 + j + mmsize + 32 * 4], m9, m11 + FMULADD_PS m10, m12, [win + %1 + j + mmsize + 48 * 4], m10, m12 %endif sub j, 64 * 4 %endmacro -; void ff_synth_filter_inner_sse2(float *synth_buf, float synth_buf2[32], -; const float window[512], float out[32], -; intptr_t offset, float scale) +; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32], +; const float window[512], float out[32], +; intptr_t offset, float scale) +%macro SYNTH_FILTER 0 cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ synth_buf, synth_buf2, window, out, off, scale %define scale m0 %if ARCH_X86_32 || WIN64 - movd scale, scalem +%if cpuflag(sse2) && notcpuflag(avx) + movd m0, scalem + SPLATD m0 +%else + VBROADCASTSS m0, scalem +%endif ; Make sure offset is in a register and not on the stack %define OFFQ r4q %else + SPLATD xmm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xmm0, 1 +%endif %define OFFQ offq %endif - pshufd m0, m0, 0 ; prepare inner counter limit 1 mov r5q, 480 sub r5q, offmp @@ -274,8 +293,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ %endif .mainloop ; m1 = a m2 = b m3 = c m4 = d - pxor m3, m3 - pxor m4, m4 + SETZERO m3 + SETZERO m4 mova m1, [buf2 + i] mova m2, [buf2 + i + 16 * 4] %if ARCH_X86_32 @@ -292,8 +311,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ %define ptr2 r7q ; must be loaded %define win r8q %define j r9q - pxor m9, m9 - pxor m10, m10 + SETZERO m9 + SETZERO m10 mova m7, [buf2 + i + mmsize] mova m8, [buf2 + i + mmsize + 16 * 4] lea win, [windowq + i] @@ -325,11 +344,11 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ %endif ;~ out[i] = a * scale; ;~ out[i + 16] = b * scale; - mulps m1, scale - mulps m2, scale + mulps m1, m1, scale + mulps m2, m2, scale %if ARCH_X86_64 - mulps m7, scale - mulps m8, scale + mulps m7, m7, scale + mulps m8, m8, scale %endif ;~ synth_buf2[i] = c; ;~ synth_buf2[i + 16] = d; @@ -350,3 +369,19 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \ sub i, (ARCH_X86_64 + 1) * mmsize jge .mainloop RET +%endmacro + +%if ARCH_X86_32 +INIT_XMM sse +SYNTH_FILTER +%endif +INIT_XMM sse2 +SYNTH_FILTER +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +SYNTH_FILTER +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +SYNTH_FILTER +%endif |