From b75726cb7903c30663faadf615d41486cbb5e828 Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Mon, 23 Apr 2012 15:10:35 -0400 Subject: lavr: add x86-optimized function for mixing 2 to 1 s16p with q8 coeffs --- libavresample/x86/audio_mix.asm | 41 ++++++++++++++++++++++++++++++++++++++ libavresample/x86/audio_mix_init.c | 5 +++++ 2 files changed, 46 insertions(+) (limited to 'libavresample/x86') diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm index c2e310b3d5..8a4cf061cd 100644 --- a/libavresample/x86/audio_mix.asm +++ b/libavresample/x86/audio_mix.asm @@ -109,3 +109,44 @@ INIT_XMM sse2 MIX_2_TO_1_S16P_FLT INIT_XMM sse4 MIX_2_TO_1_S16P_FLT + +;----------------------------------------------------------------------------- +; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len, +; int out_ch, int in_ch); +;----------------------------------------------------------------------------- + +INIT_XMM sse2 +cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1 + mov src1q, [srcq+gprsize] + mov srcq, [srcq] + sub src1q, srcq + mov matrixq, [matrixq] + movd m4, [matrixq] + movd m5, [matrixq] + SPLATW m4, m4, 0 + SPLATW m5, m5, 1 + pxor m0, m0 + punpcklwd m4, m0 + punpcklwd m5, m0 + ALIGN 16 +.loop: + mova m0, [srcq ] + mova m2, [srcq+src1q] + punpckhwd m1, m0, m0 + punpcklwd m0, m0 + punpckhwd m3, m2, m2 + punpcklwd m2, m2 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + psrad m0, 8 + psrad m1, 8 + packssdw m0, m1 + mova [srcq], m0 + add srcq, mmsize + sub lend, mmsize/2 + jg .loop + REP_RET diff --git a/libavresample/x86/audio_mix_init.c b/libavresample/x86/audio_mix_init.c index 947c3146ca..fa204d6d36 100644 --- a/libavresample/x86/audio_mix_init.c +++ b/libavresample/x86/audio_mix_init.c @@ -32,6 +32,9 @@ extern void ff_mix_2_to_1_s16p_flt_sse2(int16_t **src, float **matrix, int len, extern void ff_mix_2_to_1_s16p_flt_sse4(int16_t **src, float **matrix, int len, int out_ch, int in_ch); +extern void ff_mix_2_to_1_s16p_q8_sse2(int16_t **src, int16_t **matrix, + int len, int out_ch, int in_ch); + av_cold void ff_audio_mix_init_x86(AudioMix *am) { #if HAVE_YASM @@ -44,6 +47,8 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am) if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT, 2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_flt_sse2); + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_Q8, + 2, 1, 16, 8, "SSE2", ff_mix_2_to_1_s16p_q8_sse2); } if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT, -- cgit v1.2.3