Merge commit '42d324694883cdf1fff1612ac70fa403692a1ad4'

* commit '42d324694883cdf1fff1612ac70fa403692a1ad4':
  floatdsp: move vector_fmul_reverse from dsputil to avfloatdsp.

Conflicts:
    libavcodec/arm/dsputil_init_vfp.c
    libavcodec/arm/dsputil_vfp.S
    libavcodec/dsputil.c
    libavcodec/ppc/float_altivec.c
    libavcodec/x86/dsputil.asm
    libavutil/x86/float_dsp.asm

Merged-by: Michael Niedermayer <michaelni@gmx.at>
author: Michael Niedermayer <michaelni@gmx.at> 2013-01-23 14:04:50 +0100
committer: Michael Niedermayer <michaelni@gmx.at> 2013-01-23 14:04:50 +0100
commit: 6e6e1708984e45881b9a5d4e26c3e7de852c54d5 (patch)
tree: 5e04d38f8e152faf98921843ca5e4530cbdc46a4 /libavutil/arm
parent: b1b870fbd7185bffbe27c5918001b40a8ff8b920 (diff)
parent: 42d324694883cdf1fff1612ac70fa403692a1ad4 (diff)
4 files changed, 101 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c
index 41e513fcdc..c6f02bd2c5 100644
--- a/libavutil/arm/float_dsp_init_neon.c
+++ b/libavutil/arm/float_dsp_init_neon.c
@@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
 void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
                              const float *src2, int len);
 
+void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
+                                 const float *src1, int len);
+
 void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
 {
     fdsp->vector_fmul = ff_vector_fmul_neon;
@@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
     fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
     fdsp->vector_fmul_window = ff_vector_fmul_window_neon;
     fdsp->vector_fmul_add    = ff_vector_fmul_add_neon;
+    fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
 }
diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c
index 7abc3322cf..f7e2f54601 100644
--- a/libavutil/arm/float_dsp_init_vfp.c
+++ b/libavutil/arm/float_dsp_init_vfp.c
@@ -25,10 +25,14 @@
 void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1,
                         int len);
 
+void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+                                const float *src1, int len);
+
 void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (!have_vfpv3(cpu_flags))
         fdsp->vector_fmul = ff_vector_fmul_vfp;
+    fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; /* not gated on the VFPv3 check above */
 }
diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S
index 100eb02455..d00e59de8f 100644
--- a/libavutil/arm/float_dsp_neon.S
+++ b/libavutil/arm/float_dsp_neon.S
@@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1
 2:      vst1.32         {q12-q13},[r0,:128]!
         bx              lr
 endfunc
+
+function ff_vector_fmul_reverse_neon, export=1  @ dst[i] = src0[i] * src1[len-1-i]; r0-r3 = dst, src0, src1, len (C prototype order)
+        add             r2,  r2,  r3,  lsl #2           @ r2 = src1 + 4*len bytes, i.e. one past the last float
+        sub             r2,  r2,  #32                   @ step back 32 bytes: r2 -> last 8 floats of src1
+        mov             r12, #-32                       @ post-index step that walks src1 downwards
+        vld1.32         {q0-q1},  [r1,:128]!            @ q0-q1 = 8 floats of src0, r1 += 32 (address given as 128-bit aligned)
+        vld1.32         {q2-q3},  [r2,:128], r12        @ q2-q3 = 8 floats of src1, r2 -= 32
+1:      pld             [r1, #32]                       @ preload hint ahead in src0
+        vrev64.32       q3,  q3                         @ swap the two floats inside each of d6 and d7
+        vmul.f32        d16, d0,  d7                    @ src0 floats 0-1 times src1 floats 7,6 of the current blocks
+        vmul.f32        d17, d1,  d6                    @ src0 floats 2-3 times src1 floats 5,4
+        pld             [r2, #-32]                      @ preload hint further down src1
+        vrev64.32       q2,  q2                         @ swap the two floats inside each of d4 and d5
+        vmul.f32        d18, d2,  d5                    @ src0 floats 4-5 times src1 floats 3,2
+        vmul.f32        d19, d3,  d4                    @ src0 floats 6-7 times src1 floats 1,0
+        subs            r3,  r3,  #8                    @ 8 elements handled per pass
+        beq             2f                              @ leaves the loop only when the count hits exactly 0
+        vld1.32         {q0-q1},  [r1,:128]!            @ fetch the next src0 block before storing the results
+        vld1.32         {q2-q3},  [r2,:128], r12        @ fetch the next (lower) src1 block
+        vst1.32         {q8-q9},  [r0,:128]!            @ store 8 products (d16-d19), r0 += 32
+        b               1b
+2:      vst1.32         {q8-q9},  [r0,:128]!            @ store the final 8 products
+        bx              lr
+endfunc
diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S
index db63e5a675..8695fbd981 100644
--- a/libavutil/arm/float_dsp_vfp.S
+++ b/libavutil/arm/float_dsp_vfp.S
@@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1
         vpop            {d8-d15}
         bx              lr
 endfunc
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
+ * Assumes that len is a positive multiple of 8.
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@                                 const float *src1, int len)
+function ff_vector_fmul_reverse_vfp, export=1
+        vpush           {d8-d15}                @ save d8-d15 (s16-s31 are used below); restored before return
+        add             r2,  r2,  r3, lsl #2    @ r2 = src1 + len floats, one past the end
+        vldmdb          r2!, {s0-s3}            @ s0-s3 = last 4 floats of src1, s3 being src1[len-1]
+        vldmia          r1!, {s8-s11}           @ s8-s11 = src0[0..3]
+        vldmdb          r2!, {s4-s7}            @ s4-s7 = the 4 floats of src1 just below those
+        vldmia          r1!, {s12-s15}          @ s12-s15 = src0[4..7]
+        vmul.f32        s8,  s3,  s8            @ products 0..3 of group A, src1 taken in reverse order
+        vmul.f32        s9,  s2,  s9
+        vmul.f32        s10, s1,  s10
+        vmul.f32        s11, s0,  s11
+1:
+        subs            r3,  r3,  #16           @ ge: a second group of 8 (group B) exists; gt: more data follows group B
+        it              ge                      @ each it/itt/ittt/itttt covers the next 1-4 conditional instructions
+        vldmdbge        r2!, {s16-s19}          @ group B: 4 floats of src1, walking down
+        vmul.f32        s12, s7,  s12           @ products 4..7 of group A, interleaved with the group B loads
+        it              ge
+        vldmiage        r1!, {s24-s27}          @ group B: 4 floats of src0
+        vmul.f32        s13, s6,  s13
+        it              ge
+        vldmdbge        r2!, {s20-s23}          @ group B: next 4 floats of src1
+        vmul.f32        s14, s5,  s14
+        it              ge
+        vldmiage        r1!, {s28-s31}          @ group B: next 4 floats of src0
+        vmul.f32        s15, s4,  s15
+        it              ge
+        vmulge.f32      s24, s19, s24           @ products of group B start here
+        it              gt
+        vldmdbgt        r2!, {s0-s3}            @ refill s0-s3 from src1 for the next iteration
+        it              ge
+        vmulge.f32      s25, s18, s25
+        vstmia          r0!, {s8-s13}           @ store the first 6 results of group A
+        it              ge
+        vmulge.f32      s26, s17, s26
+        it              gt
+        vldmiagt        r1!, {s8-s11}           @ s8-s11 were just stored; refill from src0 for the next iteration
+        itt             ge
+        vmulge.f32      s27, s16, s27
+        vmulge.f32      s28, s23, s28
+        it              gt
+        vldmdbgt        r2!, {s4-s7}            @ refill s4-s7 from src1 for the next iteration
+        it              ge
+        vmulge.f32      s29, s22, s29
+        vstmia          r0!, {s14-s15}          @ store the last 2 results of group A
+        ittt            ge
+        vmulge.f32      s30, s21, s30
+        vmulge.f32      s31, s20, s31
+        vmulge.f32      s8,  s3,  s8            @ head start on the next iteration; with flags eq the result is never stored
+        it              gt
+        vldmiagt        r1!, {s12-s15}          @ refill s12-s15 from src0 for the next iteration
+        itttt           ge
+        vmulge.f32      s9,  s2,  s9
+        vmulge.f32      s10, s1,  s10
+        vstmiage        r0!, {s24-s27}          @ store the first 4 results of group B
+        vmulge.f32      s11, s0,  s11
+        it              ge
+        vstmiage        r0!, {s28-s31}          @ store the last 4 results of group B
+        bgt             1b                      @ loop again only if data remained after group B
+
+        vpop            {d8-d15}                @ restore the registers saved on entry
+        bx              lr
+endfunc
author	Michael Niedermayer <michaelni@gmx.at>	2013-01-23 14:04:50 +0100
committer	Michael Niedermayer <michaelni@gmx.at>	2013-01-23 14:04:50 +0100
commit	6e6e1708984e45881b9a5d4e26c3e7de852c54d5 (patch)
tree	5e04d38f8e152faf98921843ca5e4530cbdc46a4 /libavutil/arm
parent	b1b870fbd7185bffbe27c5918001b40a8ff8b920 (diff)
parent	42d324694883cdf1fff1612ac70fa403692a1ad4 (diff)