summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Måns Rullgård <mans@mansr.com> 2009-09-22 20:37:55 +0000
committer Måns Rullgård <mans@mansr.com> 2009-09-22 20:37:55 +0000
commit 275cfd1501c385857d0fecb827aa3a3a4177b102 (patch)
tree ec1090ff74844ec5822602d1f9ff5d97744e5d4c
parent 0cb71412829dfe60421961a19a1deb1a8a52066a (diff)
ARM: NEON optimised scalarproduct_float
Originally committed as revision 19971 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r-- libavcodec/arm/dsputil_neon.c 2
-rw-r--r-- libavcodec/arm/dsputil_neon_s.S 13
2 files changed, 15 insertions, 0 deletions
diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index 03e17466b6..bfea27ff13 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -168,6 +168,7 @@ void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul,
void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
int len);
void ff_butterflies_float_neon(float *v1, float *v2, int len);
+float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
@@ -282,6 +283,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->vector_fmul_window = ff_vector_fmul_window_neon;
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
c->butterflies_float = ff_butterflies_float_neon;
+ c->scalarproduct_float = ff_scalarproduct_float_neon;
c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon;
c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon;
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
index 8764cedc4e..b9779354fa 100644
--- a/libavcodec/arm/dsputil_neon_s.S
+++ b/libavcodec/arm/dsputil_neon_s.S
@@ -1010,3 +1010,16 @@ function ff_butterflies_float_neon, export=1
bgt 1b
bx lr
.endfunc
+
+function ff_scalarproduct_float_neon, export=1    @ sums the lane-wise products of the floats read via r0 and r1; r2 is the element count (presumably v1, v2, len of the C prototype under the AAPCS - confirm)
+ vmov.f32 q2, #0.0    @ clear the four-lane accumulator q2
+1: vld1.32 {q0},[r0,:128]!    @ load 4 floats from [r0] with a 128-bit alignment hint, then advance r0 past them
+ vld1.32 {q1},[r1,:128]!    @ same for the second vector at [r1]
+ vmla.f32 q2, q0, q1    @ q2 += q0 * q1, lane by lane
+ subs r2, r2, #4    @ four elements consumed per pass; sets the flags tested by the branch
+ bgt 1b    @ repeat while the remaining count is > 0; the body always runs at least once, so callers presumably pass a positive multiple of 4
+ vadd.f32 d0, d4, d5    @ d4 and d5 are the two halves of q2: fold four partial sums into two
+ vpadd.f32 d0, d0, d0    @ pairwise add: both lanes of d0 now hold the complete sum
+NOVFP vmov.32 r0, d0[0]    @ copy the result into r0; NOVFP is defined elsewhere, presumably active only for the soft-float calling convention (otherwise the value stays in d0 lane 0, i.e. s0)
+ bx lr    @ return to caller
+ .endfunc