From 5904f25b9f0bc5d79685b37c9befb8d12e430352 Mon Sep 17 00:00:00 2001
From: Justin Ruggles
Date: Fri, 20 Apr 2012 13:49:53 -0400
Subject: lavr: Add x86-optimized functions for s32 to s16 conversion

---
Reviewer notes (free-form text; not part of the commit or of any hunk):

  Purpose. The patch adds an assembly macro, CONV_S32_TO_S16, that converts
  32-bit signed samples to 16-bit signed samples, and registers the two
  functions built from it in ff_audio_convert_init_x86(). The macro is
  expanded once after "INIT_MMX mmx" and once after "INIT_XMM sse2"; the C
  side declares the results as ff_conv_s32_to_s16_mmx and
  ff_conv_s32_to_s16_sse2, both taking (int16_t *dst, const int32_t *src,
  int len).

  Pointer setup. lenq is set to 2*len, i.e. the output size in bytes for
  int16_t samples. srcq is advanced by 2*lenq (4 bytes per input sample) and
  dstq by lenq, so both point at the end of their data. lenq is then negated
  and used as a negative offset that counts up towards zero: the input is
  addressed as [srcq+2*lenq] and the output as [dstq+lenq].

  Loop body. Each iteration loads four registers of input, shifts every
  32-bit lane right arithmetically by 16 (keeping the upper half of each
  sample, sign included), packs the four registers pairwise into two
  registers of 16-bit values with packssdw, and stores those two registers.
  After the shift every lane already fits in 16 bits, so the saturation
  performed by packssdw should never alter a value.

  Loop step. lenq grows by mmsize*2 output bytes per iteration, which is
  mmsize samples. The "%if mmsize == 8" branch is the MMX build, so that
  version handles 8 samples per iteration; the SSE2 build presumably has
  mmsize == 16 and handles 16 (confirm against x86inc). These figures match
  the 8 and 16 passed as the sixth argument of the new
  ff_audio_convert_set_func() calls. That argument is presumably the
  required sample-count multiple, and the fifth argument (1 for MMX, 16 for
  SSE2) presumably the required pointer alignment, which would fit the use
  of mova on XMM registers. TODO: confirm both against the definition of
  ff_audio_convert_set_func(), which is not visible in this patch.

  Loop condition. "jl .loop" is evaluated after the body, so one full
  iteration always runs. A len of zero, or one that is not a multiple of the
  per-iteration sample count, would therefore read and write past the
  buffers unless the caller rules it out. Presumably the registration
  arguments above are what rule it out; verify in the caller.

  Epilogue. The mmsize == 8 build executes emms before RET; the other build
  returns with REP_RET and no emms.

  Registration. The MMX version is registered whenever the MMX branch is
  taken. The SSE2 version is registered only when AV_CPU_FLAG_SSE2SLOW is
  not set.

  NOTE(review): the SSE2 branch is guarded by "HAVE_SSE" while the runtime
  flag tested is AV_CPU_FLAG_SSE2. This is an unchanged context line; check
  whether a SSE2-specific build macro was intended.

  NOTE(review): the hunk heading shows ff_conv_s16_to_s32_sse2 declared as
  (int16_t *dst, const int32_t *src, int len), which looks swapped for an
  s16-to-s32 conversion. Also not touched by this patch; confirm against the
  assembly prototype.

 libavresample/x86/audio_convert.asm | 38 ++++++++++++++++++++++++++++++++++
 libavresample/x86/audio_convert_init.c | 9 ++++++++
 2 files changed, 47 insertions(+)
(limited to 'libavresample')

diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 6e14892928..8435edefb0 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -83,6 +83,44 @@ CONV_S16_TO_FLT
 INIT_XMM sse4
 CONV_S16_TO_FLT
 
+;------------------------------------------------------------------------------
+; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
+;------------------------------------------------------------------------------
+
+%macro CONV_S32_TO_S16 0
+cglobal conv_s32_to_s16, 3,3,4, dst, src, len
+    lea lenq, [2*lend]
+    lea srcq, [srcq+2*lenq]
+    add dstq, lenq
+    neg lenq
+.loop:
+    mova m0, [srcq+2*lenq ]
+    mova m1, [srcq+2*lenq+ mmsize]
+    mova m2, [srcq+2*lenq+2*mmsize]
+    mova m3, [srcq+2*lenq+3*mmsize]
+    psrad m0, 16
+    psrad m1, 16
+    psrad m2, 16
+    psrad m3, 16
+    packssdw m0, m1
+    packssdw m2, m3
+    mova [dstq+lenq ], m0
+    mova [dstq+lenq+mmsize], m2
+    add lenq, mmsize*2
+    jl .loop
+%if mmsize == 8
+    emms
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_MMX mmx
+CONV_S32_TO_S16
+INIT_XMM sse2
+CONV_S32_TO_S16
+
 ;-----------------------------------------------------------------------------
 ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
 ; int channels);
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index 2488cd3724..c4fe3aee07 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -27,6 +27,9 @@ extern void ff_conv_s16_to_s32_sse2(int16_t *dst, const int32_t *src, int len);
 extern void ff_conv_s16_to_flt_sse2(float *dst, const int16_t *src, int len);
 extern void ff_conv_s16_to_flt_sse4(float *dst, const int16_t *src, int len);
 
+extern void ff_conv_s32_to_s16_mmx (int16_t *dst, const int32_t *src, int len);
+extern void ff_conv_s32_to_s16_sse2(int16_t *dst, const int32_t *src, int len);
+
 extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
 extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
 extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
@@ -37,6 +40,8 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
     int mm_flags = av_get_cpu_flags();
 
     if (mm_flags & AV_CPU_FLAG_MMX && HAVE_MMX) {
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S32,
+                                  0, 1, 8, "MMX", ff_conv_s32_to_s16_mmx);
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
                                   6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
     }
@@ -49,6 +54,10 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
                                   6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
     }
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
+        if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+            ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S32,
+                                      0, 16, 16, "SSE2", ff_conv_s32_to_s16_sse2);
+        }
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_S16,
                                   0, 16, 8, "SSE2", ff_conv_s16_to_s32_sse2);
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16,
-- 
cgit v1.2.3