From 42d324694883cdf1fff1612ac70fa403692a1ad4 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 20 Jan 2013 13:20:30 -0800 Subject: floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp. Now, nellymoserenc and aacenc no longer depend on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also. --- libavutil/arm/float_dsp_init_neon.c | 4 +++ libavutil/arm/float_dsp_init_vfp.c | 4 +++ libavutil/arm/float_dsp_neon.S | 24 +++++++++++++ libavutil/arm/float_dsp_vfp.S | 69 +++++++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+) (limited to 'libavutil/arm') diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c index 41e513fcdc..c6f02bd2c5 100644 --- a/libavutil/arm/float_dsp_init_neon.c +++ b/libavutil/arm/float_dsp_init_neon.c @@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0, void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) { fdsp->vector_fmul = ff_vector_fmul_neon; @@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon; fdsp->vector_fmul_window = ff_vector_fmul_window_neon; fdsp->vector_fmul_add = ff_vector_fmul_add_neon; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon; } diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index ef808d8a5b..7247d56762 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -25,10 +25,14 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp) { int cpu_flags 
= av_get_cpu_flags(); if (!have_vfpv3(cpu_flags)) fdsp->vector_fmul = ff_vector_fmul_vfp; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; } diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S index 100eb02455..d00e59de8f 100644 --- a/libavutil/arm/float_dsp_neon.S +++ b/libavutil/arm/float_dsp_neon.S @@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1 2: vst1.32 {q12-q13},[r0,:128]! bx lr endfunc + +function ff_vector_fmul_reverse_neon, export=1 + add r2, r2, r3, lsl #2 + sub r2, r2, #32 + mov r12, #-32 + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 +1: pld [r1, #32] + vrev64.32 q3, q3 + vmul.f32 d16, d0, d7 + vmul.f32 d17, d1, d6 + pld [r2, #-32] + vrev64.32 q2, q2 + vmul.f32 d18, d2, d5 + vmul.f32 d19, d3, d4 + subs r3, r3, #8 + beq 2f + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 + vst1.32 {q8-q9}, [r0,:128]! + b 1b +2: vst1.32 {q8-q9}, [r0,:128]! + bx lr +endfunc diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index 3931828381..82952807de 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1 vpop {d8-d15} bx lr endfunc + +/** + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. 
+ * Assume that len is a positive number and is a multiple of 8 + */ +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, +@ const float *src1, int len) +function ff_vector_fmul_reverse_vfp, export=1 + vpush {d8-d15} + add r2, r2, r3, lsl #2 + vldmdb r2!, {s0-s3} + vldmia r1!, {s8-s11} + vldmdb r2!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s3, s8 + vmul.f32 s9, s2, s9 + vmul.f32 s10, s1, s10 + vmul.f32 s11, s0, s11 +1: + subs r3, r3, #16 + it ge + vldmdbge r2!, {s16-s19} + vmul.f32 s12, s7, s12 + it ge + vldmiage r1!, {s24-s27} + vmul.f32 s13, s6, s13 + it ge + vldmdbge r2!, {s20-s23} + vmul.f32 s14, s5, s14 + it ge + vldmiage r1!, {s28-s31} + vmul.f32 s15, s4, s15 + it ge + vmulge.f32 s24, s19, s24 + it gt + vldmdbgt r2!, {s0-s3} + it ge + vmulge.f32 s25, s18, s25 + vstmia r0!, {s8-s13} + it ge + vmulge.f32 s26, s17, s26 + it gt + vldmiagt r1!, {s8-s11} + itt ge + vmulge.f32 s27, s16, s27 + vmulge.f32 s28, s23, s28 + it gt + vldmdbgt r2!, {s4-s7} + it ge + vmulge.f32 s29, s22, s29 + vstmia r0!, {s14-s15} + ittt ge + vmulge.f32 s30, s21, s30 + vmulge.f32 s31, s20, s31 + vmulge.f32 s8, s3, s8 + it gt + vldmiagt r1!, {s12-s15} + itttt ge + vmulge.f32 s9, s2, s9 + vmulge.f32 s10, s1, s10 + vstmiage r0!, {s24-s27} + vmulge.f32 s11, s0, s11 + it ge + vstmiage r0!, {s28-s31} + bgt 1b + + vpop {d8-d15} + bx lr +endfunc -- cgit v1.2.3