summaryrefslogtreecommitdiff
path: root/libavcodec/x86/dcadsp.asm
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com> 2014-03-01 23:46:27 -0300
committer: Michael Niedermayer <michaelni@gmx.at> 2014-03-02 11:58:10 +0100
commit: 884e085d1ea34f2f773b9589ae8e8aa9ca91b358 (patch)
tree: 1a57da75c20fde135db63eab6730e4b58b0f249a /libavcodec/x86/dcadsp.asm
parent: f5d1d1e4667ba346ea7e0f97e6d2756bc9d4abde (diff)
x86/synth_filter: Revert the switch to float ops with SSE2
This reverts the changes that commits 64672098361361cd15d37e36f747ab44de5b80ca and 68c3ed936a76c3ff7738f602fa90237ac7e3ce08 made to the SSE2 version; those changes caused a slowdown of about 5 cycles.

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/dcadsp.asm')
-rw-r--r--libavcodec/x86/dcadsp.asm21
1 files changed, 17 insertions, 4 deletions
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 972ce1e3be..a6a4582524 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -199,6 +199,14 @@ INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
+%macro SETZERO 1                       ; SETZERO reg: clear register %1 by xor-ing it with itself
+%if cpuflag(sse2) && notcpuflag(avx)   ; taken only when sse2 is enabled and avx is not
+ pxor %1, %1                           ; non-float xor, per the commit's revert of float ops for SSE2
+%else                                  ; every other instruction set this macro is expanded for
+ xorps %1, %1, %1                      ; float xor, written in three-operand form
+%endif
+%endmacro
+
%macro SHUF 3
%if cpuflag(avx)
mova %3, [%2 - 16]
@@ -265,7 +273,12 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
+%if cpuflag(sse2) && notcpuflag(avx)
+ movd m0, scalem
+ SPLATD m0
+%else
VBROADCASTSS m0, scalem
+%endif
; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
@@ -290,8 +303,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%endif
.mainloop
; m1 = a m2 = b m3 = c m4 = d
- xorps m3, m3, m3
- xorps m4, m4, m4
+ SETZERO m3
+ SETZERO m4
mova m1, [buf2 + i]
mova m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
@@ -308,8 +321,8 @@ cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
%define ptr2 r7q ; must be loaded
%define win r8q
%define j r9q
- xorps m9, m9, m9
- xorps m10, m10, m10
+ SETZERO m9
+ SETZERO m10
mova m7, [buf2 + i + mmsize]
mova m8, [buf2 + i + mmsize + 16 * 4]
lea win, [windowq + i]