summaryrefslogtreecommitdiff
path: root/libavcodec/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/dcadsp.asm53
-rw-r--r--libavcodec/x86/dcadsp_init.c18
2 files changed, 50 insertions, 21 deletions
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index c1db118676..56039baa6f 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -26,18 +26,35 @@ pf_inv16: times 4 dd 0x3D800000 ; 1/16
SECTION_TEXT
-; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
-%macro INT8X8_FMUL_INT32 0
-cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
- cvtsi2ss m0, scalem
+; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
+; const int8_t hf_vq[1024][32], intptr_t vq_offset,
+; int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
+
+%macro DECODE_HF 0
+cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
+ lea srcq, [srcq + offsetq]
+ shl startq, 2
+ mov offsetd, endm
+%define DICT offsetq
+ shl offsetq, 2
+ mov endm, offsetq
+.loop:
+%if ARCH_X86_64
+ mov offsetd, [scaleq + 2 * startq]
+ cvtsi2ss m0, offsetd
+%else
+ cvtsi2ss m0, [scaleq + 2 * startq]
+%endif
+ mov offsetd, [numq + startq]
mulss m0, [pf_inv16]
+ shl DICT, 5
shufps m0, m0, 0
%if cpuflag(sse2)
%if cpuflag(sse4)
- pmovsxbd m1, [srcq+0]
- pmovsxbd m2, [srcq+4]
+ pmovsxbd m1, [srcq + DICT + 0]
+ pmovsxbd m2, [srcq + DICT + 4]
%else
- movq m1, [srcq]
+ movq m1, [srcq + DICT]
punpcklbw m1, m1
mova m2, m1
punpcklwd m1, m1
@@ -48,8 +65,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
cvtdq2ps m1, m1
cvtdq2ps m2, m2
%else
- movd mm0, [srcq+0]
- movd mm1, [srcq+4]
+ movd mm0, [srcq + DICT + 0]
+ movd mm1, [srcq + DICT + 4]
punpcklbw mm0, mm0
punpcklbw mm1, mm1
movq mm2, mm0
@@ -67,27 +84,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
cvtpi2ps m3, mm2
cvtpi2ps m4, mm3
shufps m0, m0, 0
- emms
shufps m1, m3, q1010
shufps m2, m4, q1010
%endif
mulps m1, m0
mulps m2, m0
- mova [dstq+ 0], m1
- mova [dstq+16], m2
+ mova [dstq + 8 * startq + 0], m1
+ mova [dstq + 8 * startq + 16], m2
+ add startq, 4
+ cmp startq, endm
+ jl .loop
+.end:
+%if notcpuflag(sse2)
+ emms
+%endif
REP_RET
%endmacro
%if ARCH_X86_32
INIT_XMM sse
-INT8X8_FMUL_INT32
+DECODE_HF
%endif
INIT_XMM sse2
-INT8X8_FMUL_INT32
+DECODE_HF
INIT_XMM sse4
-INT8X8_FMUL_INT32
+DECODE_HF
; %1=v0/v1 %2=in1 %3=in2
%macro FIR_LOOP 2-3
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 5f6e8c5a19..3821892166 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,9 +23,15 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
-void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
-void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
+void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
@@ -35,18 +41,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
if (EXTERNAL_SSE(cpu_flags)) {
#if ARCH_X86_32
- s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
+ s->decode_hf = ff_decode_hf_sse;
#endif
s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
- s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2;
+ s->decode_hf = ff_decode_hf_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
- s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4;
+ s->decode_hf = ff_decode_hf_sse4;
}
}