summaryrefslogtreecommitdiff
path: root/libavcodec/x86/sbrdsp.asm
diff options
context:
space:
mode:
authorChristophe GISQUET <christophe.gisquet@gmail.com>2012-02-23 19:48:58 +0100
committerRonald S. Bultje <rsbultje@gmail.com>2012-02-23 15:50:06 -0800
commit34454c761f01275d4adaf40df6d70a59011c4a6c (patch)
treea25a23c028ddee97c1195567f855ce064bdbe916 /libavcodec/x86/sbrdsp.asm
parent2e74a5abc2fda6cfbc86589852d6194d502332cb (diff)
SBR DSP x86: implement SSE sbr_sum_square_sse
The 32-bit targets have been compiled with -mfpmath=sse to obtain a proper reference. sbr_sum_square timings: C/32-bit: 82c (unrolled) / 102c; C/64-bit: 69c (unrolled) / 82c; SSE/32-bit: 42c; SSE/64-bit: 31c. Using the SSE4.1 dpps instruction to perform the final sum is slower. Not unrolling to perform 8 operations in a loop costs 10 more cycles. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/x86/sbrdsp.asm')
-rw-r--r--libavcodec/x86/sbrdsp.asm74
1 files changed, 74 insertions, 0 deletions
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
new file mode 100644
index 0000000000..71471bd5ab
--- /dev/null
+++ b/libavcodec/x86/sbrdsp.asm
@@ -0,0 +1,74 @@
+;******************************************************************************
+;* AAC Spectral Band Replication decoding functions
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+;SECTION_RODATA
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; sbr_sum_square(x, n) -- sum of squares of packed single-precision floats
+;
+; In:    r0 = pointer to the float data (may be unaligned; loaded with movu)
+;        r1 = element count; each element is 8 bytes (two floats), since the
+;             main loop consumes 64 bytes per 8 elements.
+;             Presumably a 32-bit C int -- only the low 32 bits are read,
+;             because the upper half of a 64-bit argument register is
+;             undefined for an int argument. TODO confirm against the C
+;             prototype.
+; Out:   scalar float sum in the low lane of m0 (xmm0); on x86-32 it is
+;        additionally pushed on the x87 stack (st0).
+; Clobb: r0, r1, r2, m0-m5, flags; on x86-32 also the r0 argument stack slot.
+; Note:  the tail loop handles two elements per iteration, so an odd trailing
+;        element is ignored; the count is expected to be a multiple of 2.
+;-----------------------------------------------------------------------------
+INIT_XMM sse
+cglobal sbr_sum_square, 2, 3, 6
+    mov         r2d, r1d                ; 32-bit copy: ignore undefined upper bits
+    xorps       m0, m0                  ; accumulator A = 0
+    xorps       m1, m1                  ; accumulator B = 0
+    sar         r2d, 3                  ; r2d = number of 8-element (64-byte) chunks
+    jz          .prepare
+
+; Main loop, unrolled 4x: 16 floats per iteration, split over two
+; accumulators.
+.loop:
+    movu        m2, [r0 +  0]
+    movu        m3, [r0 + 16]
+    movu        m4, [r0 + 32]
+    movu        m5, [r0 + 48]
+    mulps       m2, m2
+    mulps       m3, m3
+    mulps       m4, m4
+    mulps       m5, m5
+    addps       m0, m2
+    addps       m1, m3
+    addps       m0, m4
+    addps       m1, m5
+    add         r0, 64
+    dec         r2d
+    jnz         .loop
+
+.prepare:
+    and         r1d, 7                  ; elements left after the 8-element chunks
+    sar         r1d, 1                  ; r1d = remaining 2-element (16-byte) groups
+    jz          .end
+
+; len is a multiple of 2, thus there are at least 4 floats to process
+.endloop:
+    movu        m2, [r0]
+    add         r0, 16
+    mulps       m2, m2
+    dec         r1d                     ; sets ZF for the jnz below ...
+    addps       m0, m2                  ; ... addps leaves the flags untouched
+    jnz         .endloop
+
+.end:
+    ; Horizontal reduction of both accumulators into lane 0 of m0.
+    addps       m0, m1                  ; merge accumulators A and B
+    movhlps     m2, m0                  ; m2[0..1] = m0[2..3]
+    addps       m0, m2                  ; lanes 0 and 1 now hold the two partial sums
+    movss       m1, m0                  ; keep lane 0
+    shufps      m0, m0, 1               ; move lane 1 into lane 0
+    addss       m0, m1                  ; lane 0 = total
+%if ARCH_X86_64 == 0
+    ; x86-32 returns float in st0. Use movss (SSE1) for the store: movd with
+    ; an XMM operand is an SSE2 instruction and would fault on SSE-only CPUs.
+    movss       r0m, m0
+    fld         dword r0m
+%endif
+    RET