summaryrefslogtreecommitdiff
path: root/libavutil/arm
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2013-01-20 13:20:30 -0800
committerRonald S. Bultje <rsbultje@gmail.com>2013-01-22 11:55:42 -0800
commit42d324694883cdf1fff1612ac70fa403692a1ad4 (patch)
tree99b6a8629388614ec2395c847b79a2c448583341 /libavutil/arm
parent55aa03b9f8f11ebb7535424cc0e5635558590f49 (diff)
floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp.
Now, nellymoserenc and aacenc no longer depends on dsputil. Independent of this patch, wmaprodec also does not depend on dsputil, so I removed it from there also.
Diffstat (limited to 'libavutil/arm')
-rw-r--r--libavutil/arm/float_dsp_init_neon.c4
-rw-r--r--libavutil/arm/float_dsp_init_vfp.c4
-rw-r--r--libavutil/arm/float_dsp_neon.S24
-rw-r--r--libavutil/arm/float_dsp_vfp.S69
4 files changed, 101 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c
index 41e513fcdc..c6f02bd2c5 100644
--- a/libavutil/arm/float_dsp_init_neon.c
+++ b/libavutil/arm/float_dsp_init_neon.c
@@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
const float *src2, int len);
+void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
+ const float *src1, int len);
+
void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
{
fdsp->vector_fmul = ff_vector_fmul_neon;
@@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
fdsp->vector_fmul_window = ff_vector_fmul_window_neon;
fdsp->vector_fmul_add = ff_vector_fmul_add_neon;
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
}
diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c
index ef808d8a5b..7247d56762 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -25,10 +25,14 @@
void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1,
int len);
+void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+ const float *src1, int len);
+
void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
if (!have_vfpv3(cpu_flags))
fdsp->vector_fmul = ff_vector_fmul_vfp;
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
}
diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S
index 100eb02455..d00e59de8f 100644
--- a/libavutil/arm/float_dsp_neon.S
+++ b/libavutil/arm/float_dsp_neon.S
@@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1
2: vst1.32 {q12-q13},[r0,:128]!
bx lr
endfunc
+
+function ff_vector_fmul_reverse_neon, export=1
+ add r2, r2, r3, lsl #2
+ sub r2, r2, #32
+ mov r12, #-32
+ vld1.32 {q0-q1}, [r1,:128]!
+ vld1.32 {q2-q3}, [r2,:128], r12
+1: pld [r1, #32]
+ vrev64.32 q3, q3
+ vmul.f32 d16, d0, d7
+ vmul.f32 d17, d1, d6
+ pld [r2, #-32]
+ vrev64.32 q2, q2
+ vmul.f32 d18, d2, d5
+ vmul.f32 d19, d3, d4
+ subs r3, r3, #8
+ beq 2f
+ vld1.32 {q0-q1}, [r1,:128]!
+ vld1.32 {q2-q3}, [r2,:128], r12
+ vst1.32 {q8-q9}, [r0,:128]!
+ b 1b
+2: vst1.32 {q8-q9}, [r0,:128]!
+ bx lr
+endfunc
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index 3931828381..82952807de 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
+ * Assume that len is a positive number and is multiple of 8
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@ const float *src1, int len)
+function ff_vector_fmul_reverse_vfp, export=1
+ vpush {d8-d15}
+ add r2, r2, r3, lsl #2
+ vldmdb r2!, {s0-s3}
+ vldmia r1!, {s8-s11}
+ vldmdb r2!, {s4-s7}
+ vldmia r1!, {s12-s15}
+ vmul.f32 s8, s3, s8
+ vmul.f32 s9, s2, s9
+ vmul.f32 s10, s1, s10
+ vmul.f32 s11, s0, s11
+1:
+ subs r3, r3, #16
+ it ge
+ vldmdbge r2!, {s16-s19}
+ vmul.f32 s12, s7, s12
+ it ge
+ vldmiage r1!, {s24-s27}
+ vmul.f32 s13, s6, s13
+ it ge
+ vldmdbge r2!, {s20-s23}
+ vmul.f32 s14, s5, s14
+ it ge
+ vldmiage r1!, {s28-s31}
+ vmul.f32 s15, s4, s15
+ it ge
+ vmulge.f32 s24, s19, s24
+ it gt
+ vldmdbgt r2!, {s0-s3}
+ it ge
+ vmulge.f32 s25, s18, s25
+ vstmia r0!, {s8-s13}
+ it ge
+ vmulge.f32 s26, s17, s26
+ it gt
+ vldmiagt r1!, {s8-s11}
+ itt ge
+ vmulge.f32 s27, s16, s27
+ vmulge.f32 s28, s23, s28
+ it gt
+ vldmdbgt r2!, {s4-s7}
+ it ge
+ vmulge.f32 s29, s22, s29
+ vstmia r0!, {s14-s15}
+ ittt ge
+ vmulge.f32 s30, s21, s30
+ vmulge.f32 s31, s20, s31
+ vmulge.f32 s8, s3, s8
+ it gt
+ vldmiagt r1!, {s12-s15}
+ itttt ge
+ vmulge.f32 s9, s2, s9
+ vmulge.f32 s10, s1, s10
+ vstmiage r0!, {s24-s27}
+ vmulge.f32 s11, s0, s11
+ it ge
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ vpop {d8-d15}
+ bx lr
+endfunc