summaryrefslogtreecommitdiff
path: root/libavutil/arm
diff options
context:
space:
mode:
author Michael Niedermayer <michaelni@gmx.at> 2013-01-23 14:04:50 +0100
committer Michael Niedermayer <michaelni@gmx.at> 2013-01-23 14:04:50 +0100
commit 6e6e1708984e45881b9a5d4e26c3e7de852c54d5 (patch)
tree 5e04d38f8e152faf98921843ca5e4530cbdc46a4 /libavutil/arm
parent b1b870fbd7185bffbe27c5918001b40a8ff8b920 (diff)
parent 42d324694883cdf1fff1612ac70fa403692a1ad4 (diff)
Merge commit '42d324694883cdf1fff1612ac70fa403692a1ad4'
* commit '42d324694883cdf1fff1612ac70fa403692a1ad4':
  floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp.

Conflicts:
  libavcodec/arm/dsputil_init_vfp.c
  libavcodec/arm/dsputil_vfp.S
  libavcodec/dsputil.c
  libavcodec/ppc/float_altivec.c
  libavcodec/x86/dsputil.asm
  libavutil/x86/float_dsp.asm

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavutil/arm')
-rw-r--r-- libavutil/arm/float_dsp_init_neon.c | 4
-rw-r--r-- libavutil/arm/float_dsp_init_vfp.c | 4
-rw-r--r-- libavutil/arm/float_dsp_neon.S | 24
-rw-r--r-- libavutil/arm/float_dsp_vfp.S | 69
4 files changed, 101 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c
index 41e513fcdc..c6f02bd2c5 100644
--- a/libavutil/arm/float_dsp_init_neon.c
+++ b/libavutil/arm/float_dsp_init_neon.c
@@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
const float *src2, int len);
+void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
+ const float *src1, int len); /* NEON routine from float_dsp_neon.S: dst[i] = src0[i] * src1[len - 1 - i] */
+
void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
{
fdsp->vector_fmul = ff_vector_fmul_neon;
@@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
fdsp->vector_fmul_window = ff_vector_fmul_window_neon;
fdsp->vector_fmul_add = ff_vector_fmul_add_neon;
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
}
diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c
index 7abc3322cf..f7e2f54601 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -25,10 +25,14 @@
void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1,
int len);
+void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+ const float *src1, int len); /* VFP routine from float_dsp_vfp.S: dst[i] = src0[i] * src1[len - 1 - i] */
+
void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp) /* points the fdsp function pointers at the VFP assembly routines */
{
int cpu_flags = av_get_cpu_flags(); /* CPU feature flags, consulted by the check below */
if (!have_vfpv3(cpu_flags)) /* unbraced: this condition guards only the vector_fmul assignment */
fdsp->vector_fmul = ff_vector_fmul_vfp;
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; /* assigned regardless of the have_vfpv3() result */
}
diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S
index 100eb02455..d00e59de8f 100644
--- a/libavutil/arm/float_dsp_neon.S
+++ b/libavutil/arm/float_dsp_neon.S
@@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1
2: vst1.32 {q12-q13},[r0,:128]!
bx lr
endfunc
+
+@ NEON routine computing dst[i] = src0[i] * src1[len - 1 - i], eight floats per pass.
+@ Going by the C prototype and the AAPCS: r0 = dst, r1 = src0, r2 = src1, r3 = len.
+@ Eight elements are loaded before any count check and the loop leaves only when the
+@ count reaches exactly zero, so len has to be a positive multiple of 8.
+@ The :128 qualifiers require all three pointers to be 16-byte aligned.
+function ff_vector_fmul_reverse_neon, export=1
+ add r2, r2, r3, lsl #2 @ r2 = src1 + len * 4 bytes, one past the last float of src1
+ sub r2, r2, #32 @ back up 32 bytes to the final group of 8 floats
+ mov r12, #-32 @ post-index step: src1 is walked backwards, 32 bytes per load
+ vld1.32 {q0-q1}, [r1,:128]! @ q0-q1 = next 8 floats of src0, r1 advances by 32
+ vld1.32 {q2-q3}, [r2,:128], r12 @ q2-q3 = 8 floats from the tail of src1, r2 moves back by 32
+1: pld [r1, #32] @ prefetch ahead in src0
+ vrev64.32 q3, q3 @ swap the two floats inside each 64-bit half (d6, d7)
+ vmul.f32 d16, d0, d7 @ pairing d0 with d7 and d1 with d6 completes the reversal of the upper four src1 floats
+ vmul.f32 d17, d1, d6
+ pld [r2, #-32] @ prefetch further back in src1
+ vrev64.32 q2, q2 @ same treatment for the lower four src1 floats (d4, d5)
+ vmul.f32 d18, d2, d5 @ d2 with d5 and d3 with d4, as above
+ vmul.f32 d19, d3, d4
+ subs r3, r3, #8 @ 8 elements done; Z is set when none remain
+ beq 2f @ last group: skip the reloads and go straight to the store
+ vld1.32 {q0-q1}, [r1,:128]! @ fetch the next src0 group before storing the current results
+ vld1.32 {q2-q3}, [r2,:128], r12 @ fetch the next (earlier) src1 group
+ vst1.32 {q8-q9}, [r0,:128]! @ write 8 products to dst, r0 advances by 32
+ b 1b
+2: vst1.32 {q8-q9}, [r0,:128]! @ store the final 8 products
+ bx lr @ return to caller
+endfunc
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index db63e5a675..8695fbd981 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
+
+/**
+ * ARM VFP optimized implementation of the 'vector_fmul_reverse_c' function:
+ * dst[i] = src0[i] * src1[len - 1 - i].
+ *
+ * len is assumed to be positive and a multiple of 8.
+ *
+ * The loop is software pipelined. Each pass finishes one group of 8 products,
+ * processes a second group of 8 when at least 16 elements were outstanding on
+ * entry to the pass (condition ge), and preloads plus partly multiplies the
+ * group for the following pass when more than 16 were outstanding (gt).
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@ const float *src1, int len)
+function ff_vector_fmul_reverse_vfp, export=1
+ vpush {d8-d15} @ s16-s31 (d8-d15) are used below; they are callee-saved under the AAPCS
+ add r2, r2, r3, lsl #2 @ r2 = src1 + len * 4 bytes, one past the last float of src1
+ vldmdb r2!, {s0-s3} @ s0-s3 = src1[len-4 .. len-1], r2 steps back 16 bytes
+ vldmia r1!, {s8-s11} @ s8-s11 = src0[0 .. 3], r1 advances 16 bytes
+ vldmdb r2!, {s4-s7} @ s4-s7 = src1[len-8 .. len-5]
+ vldmia r1!, {s12-s15} @ s12-s15 = src0[4 .. 7]
+ vmul.f32 s8, s3, s8 @ products overwrite the src0 registers; src1 registers are taken in descending order
+ vmul.f32 s9, s2, s9
+ vmul.f32 s10, s1, s10
+ vmul.f32 s11, s0, s11
+1:
+ subs r3, r3, #16 @ ge: a second group of 8 belongs to this pass; gt: more data follows after it
+ it ge @ IT prefixes are what Thumb-2 needs to encode the conditional instructions
+ vldmdbge r2!, {s16-s19} @ second group: next 4 src1 floats, still walking backwards
+ vmul.f32 s12, s7, s12 @ unconditional: products 4-7 of the current group, interleaved with the loads
+ it ge
+ vldmiage r1!, {s24-s27} @ second group: next 4 src0 floats
+ vmul.f32 s13, s6, s13
+ it ge
+ vldmdbge r2!, {s20-s23} @ second group: remaining 4 src1 floats
+ vmul.f32 s14, s5, s14
+ it ge
+ vldmiage r1!, {s28-s31} @ second group: remaining 4 src0 floats
+ vmul.f32 s15, s4, s15
+ it ge
+ vmulge.f32 s24, s19, s24 @ begin the products of the second group
+ it gt
+ vldmdbgt r2!, {s0-s3} @ gt only: preload src1 for the following pass (s0-s3 are no longer needed)
+ it ge
+ vmulge.f32 s25, s18, s25
+ vstmia r0!, {s8-s13} @ store products 0-5 of the current group
+ it ge
+ vmulge.f32 s26, s17, s26
+ it gt
+ vldmiagt r1!, {s8-s11} @ gt only: preload src0 for the following pass (s8-s11 were just stored)
+ itt ge
+ vmulge.f32 s27, s16, s27
+ vmulge.f32 s28, s23, s28
+ it gt
+ vldmdbgt r2!, {s4-s7} @ gt only: second half of the src1 preload
+ it ge
+ vmulge.f32 s29, s22, s29
+ vstmia r0!, {s14-s15} @ store products 6-7 of the current group
+ ittt ge
+ vmulge.f32 s30, s21, s30
+ vmulge.f32 s31, s20, s31
+ vmulge.f32 s8, s3, s8 @ first product of the following pass; the result is only consumed when the loop continues (gt)
+ it gt
+ vldmiagt r1!, {s12-s15} @ gt only: second half of the src0 preload (s12-s15 were already stored)
+ itttt ge
+ vmulge.f32 s9, s2, s9
+ vmulge.f32 s10, s1, s10
+ vstmiage r0!, {s24-s27} @ store products 0-3 of the second group
+ vmulge.f32 s11, s0, s11
+ it ge
+ vstmiage r0!, {s28-s31} @ store products 4-7 of the second group
+ bgt 1b @ another pass while elements remain
+
+ vpop {d8-d15} @ restore the callee-saved VFP registers
+ bx lr @ return to caller
+endfunc