summaryrefslogtreecommitdiff
path: root/libavcodec/x86/dcadsp_init.c
diff options
context:
space:
mode:
authorChristophe Gisquet <christophe.gisquet@gmail.com>2014-02-14 15:03:13 +0000
committerJanne Grunau <janne-libav@jannau.net>2014-02-28 13:03:22 +0100
commit4cb6964244fd6c099383d8b7e99731e72cc844b9 (patch)
tree8b2fd29f4af8ba5d052e04d4dd1f4e97efaf73f2 /libavcodec/x86/dcadsp_init.c
parent7686afd049be98d18663682b92d983340fa2c305 (diff)
dcadec: simplify decoding of VQ high frequencies
The vector dequantization has a test in a loop preventing effective SIMD implementation. By moving it out of the loop, this loop can be DSPized. Therefore, modify the current DSP implementation. In particular, the DSP implementation no longer has to handle null loop sizes. The decode_hf implementations have following timings: For x86 Arrandale: C SSE SSE2 SSE4 win32: 260 162 119 104 win64: 242 N/A 89 72 The arm NEON optimizations follow in a later patch as external asm. The now unused check for the y modifier in arm inline asm is removed from configure.
Diffstat (limited to 'libavcodec/x86/dcadsp_init.c')
-rw-r--r--libavcodec/x86/dcadsp_init.c18
1 files changed, 12 insertions, 6 deletions
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 5f6e8c5a19..3821892166 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,9 +23,15 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
-void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
+void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
@@ -35,18 +41,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
if (EXTERNAL_SSE(cpu_flags)) {
#if ARCH_X86_32
- s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
+ s->decode_hf = ff_decode_hf_sse;
#endif
s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
- s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2;
+ s->decode_hf = ff_decode_hf_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
- s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4;
+ s->decode_hf = ff_decode_hf_sse4;
}
}