From 97ce1ba8673636c96d3cc002bb76221c60324d95 Mon Sep 17 00:00:00 2001
From: Justin Ruggles
Date: Mon, 23 Apr 2012 18:29:58 -0400
Subject: lavr: Add x86-optimized functions for s32 to flt conversion

---
Notes (review) - free-form text in this section is not part of the diff.

  NOTE(review): runs of spaces were collapsed in the copy this was taken
  from; the column alignment inside the hunks below is reconstructed.
  Confirm against the upstream commit before applying.

  audio_convert.asm
    * pf_s32_inv_scale is eight dwords of 0x30000000, which read as IEEE-754
      single precision is 2^-31. It is loaded once into m0 and used as the
      mulps factor, so every int32 sample is converted with cvtdq2ps and
      then multiplied by 2^-31.
    * len is turned into a byte count (4*len), src and dst are advanced by
      that amount and the count is negated, so the loop indexes with a
      negative offset that climbs towards zero. "add lenq, mmsize*2" both
      steps the offset and sets the flags tested by "jl .loop".
    * Each iteration handles two registers' worth of data (2*mmsize bytes).
      The exit test sits at the bottom of the loop, so the body runs at
      least once and a tail shorter than 2*mmsize bytes gets no special
      handling. Presumably callers only pass a positive len that is a
      multiple of that step; confirm against ff_audio_convert_set_func().
    * The mmsize == 32 build issues vzeroupper before RET; the other build
      ends with REP_RET.
    * The macro is instantiated for sse2 unconditionally and for avx only
      under %if HAVE_AVX.

  audio_convert_init.c
    * Declares both new entry points and registers them for the
      (AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S32) pair: the SSE2 one with
      (0, 16, 8, "SSE2") next to the existing "SSE2" registrations, the AVX
      one with (0, 32, 16, "AVX") in a new block placed after the SSE4
      block and guarded by AV_CPU_FLAG_AVX && HAVE_AVX.
    * The numeric arguments look like pointer alignment and samples per
      iteration (16/8 and 32/16 match mmsize and 2*mmsize/4 in the asm).
      TODO confirm against the ff_audio_convert_set_func() prototype.
    * Presumably a later registration replaces an earlier one, so AVX wins
      over SSE2 when both CPU flags are set; verify.

 libavresample/x86/audio_convert.asm    | 37 ++++++++++++++++++++++++++++++++++
 libavresample/x86/audio_convert_init.c |  9 +++++++++
 2 files changed, 46 insertions(+)

(limited to 'libavresample/x86')

diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm
index 8435edefb0..53091acf99 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -26,6 +26,7 @@
 
 SECTION_RODATA 32
 
+pf_s32_inv_scale: times 8 dd 0x30000000
 pf_s16_inv_scale: times 4 dd 0x38000000
 
 SECTION_TEXT
@@ -121,6 +122,42 @@ CONV_S32_TO_S16
 INIT_XMM sse2
 CONV_S32_TO_S16
 
+;------------------------------------------------------------------------------
+; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
+;------------------------------------------------------------------------------
+
+%macro CONV_S32_TO_FLT 0
+cglobal conv_s32_to_flt, 3,3,3, dst, src, len
+    lea      lenq, [4*lend]
+    add      srcq, lenq
+    add      dstq, lenq
+    neg      lenq
+    mova       m0, [pf_s32_inv_scale]
+    ALIGN 16
+.loop:
+    cvtdq2ps   m1, [srcq+lenq       ]
+    cvtdq2ps   m2, [srcq+lenq+mmsize]
+    mulps      m1, m1, m0
+    mulps      m2, m2, m0
+    mova  [dstq+lenq       ], m1
+    mova  [dstq+lenq+mmsize], m2
+    add      lenq, mmsize*2
+    jl .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+CONV_S32_TO_FLT
+%if HAVE_AVX
+INIT_YMM avx
+CONV_S32_TO_FLT
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
 ;                              int channels);
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index c4fe3aee07..f0a1a1a55f 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -30,6 +30,9 @@ extern void ff_conv_s16_to_flt_sse4(float *dst, const int16_t *src, int len);
 extern void ff_conv_s32_to_s16_mmx (int16_t *dst, const int32_t *src, int len);
 extern void ff_conv_s32_to_s16_sse2(int16_t *dst, const int32_t *src, int len);
 
+extern void ff_conv_s32_to_flt_sse2(float *dst, const int32_t *src, int len);
+extern void ff_conv_s32_to_flt_avx (float *dst, const int32_t *src, int len);
+
 extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
 extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
 extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
@@ -62,10 +65,16 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
                                   0, 16, 8, "SSE2", ff_conv_s16_to_s32_sse2);
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16,
                                   0, 16, 8, "SSE2", ff_conv_s16_to_flt_sse2);
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S32,
+                                  0, 16, 8, "SSE2", ff_conv_s32_to_flt_sse2);
     }
     if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16,
                                   0, 16, 8, "SSE4", ff_conv_s16_to_flt_sse4);
     }
+    if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
+        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S32,
+                                  0, 32, 16, "AVX", ff_conv_s32_to_flt_avx);
+    }
 #endif
 }
-- 
cgit v1.2.3