summary | refs | log | tree | commit | diff
path: root/libavcodec/arm
diff options
context:
space:
mode:
author: Justin Ruggles <justin.ruggles@gmail.com> 2011-01-13 15:28:06 -0500
committer: Mans Rullgard <mans@mansr.com> 2011-01-22 17:53:27 +0000
commit: 6eabb0d3ad42b91c1b4c298718c29961f7c1653a (patch)
tree: 0cb7ebc7b25fcb4bf3f91fe2735ff9f264dff015 /libavcodec/arm
parent: fcb7e535dd9ad142c079af62af9c1d0f4b001057 (diff)
Change DSPContext.vector_fmul() from dst=dst*src to dst=src0*src1.
Signed-off-by: Mans Rullgard <mans@mansr.com>
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- libavcodec/arm/dsputil_init_neon.c | 2
-rw-r--r-- libavcodec/arm/dsputil_init_vfp.c | 3
-rw-r--r-- libavcodec/arm/dsputil_neon.S | 45
-rw-r--r-- libavcodec/arm/dsputil_vfp.S | 29
4 files changed, 39 insertions, 40 deletions
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 04ebb00576..221183cef8 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -138,7 +138,7 @@ void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
-void ff_vector_fmul_neon(float *dst, const float *src, int len);
+void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
void ff_vector_fmul_window_neon(float *dst, const float *src0,
const float *src1, const float *win,
float add_bias, int len);
diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c
index 9f8c1b7840..76ef6b4171 100644
--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@@ -21,7 +21,8 @@
#include "libavcodec/dsputil.h"
#include "dsputil_arm.h"
-void ff_vector_fmul_vfp(float *dst, const float *src, int len);
+void ff_vector_fmul_vfp(float *dst, const float *src0,
+ const float *src1, int len);
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 2bcdb397f9..42fb38de52 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -738,42 +738,41 @@ function ff_float_to_int16_interleave_neon, export=1
endfunc
function ff_vector_fmul_neon, export=1
- mov r3, r0
- subs r2, r2, #8
- vld1.64 {d0-d3}, [r0,:128]!
- vld1.64 {d4-d7}, [r1,:128]!
+ subs r3, r3, #8
+ vld1.64 {d0-d3}, [r1,:128]!
+ vld1.64 {d4-d7}, [r2,:128]!
vmul.f32 q8, q0, q2
vmul.f32 q9, q1, q3
beq 3f
- bics ip, r2, #15
+ bics ip, r3, #15
beq 2f
1: subs ip, ip, #16
- vld1.64 {d0-d1}, [r0,:128]!
- vld1.64 {d4-d5}, [r1,:128]!
+ vld1.64 {d0-d1}, [r1,:128]!
+ vld1.64 {d4-d5}, [r2,:128]!
vmul.f32 q10, q0, q2
- vld1.64 {d2-d3}, [r0,:128]!
- vld1.64 {d6-d7}, [r1,:128]!
+ vld1.64 {d2-d3}, [r1,:128]!
+ vld1.64 {d6-d7}, [r2,:128]!
vmul.f32 q11, q1, q3
- vst1.64 {d16-d19},[r3,:128]!
- vld1.64 {d0-d1}, [r0,:128]!
- vld1.64 {d4-d5}, [r1,:128]!
+ vst1.64 {d16-d19},[r0,:128]!
+ vld1.64 {d0-d1}, [r1,:128]!
+ vld1.64 {d4-d5}, [r2,:128]!
vmul.f32 q8, q0, q2
- vld1.64 {d2-d3}, [r0,:128]!
- vld1.64 {d6-d7}, [r1,:128]!
+ vld1.64 {d2-d3}, [r1,:128]!
+ vld1.64 {d6-d7}, [r2,:128]!
vmul.f32 q9, q1, q3
- vst1.64 {d20-d23},[r3,:128]!
+ vst1.64 {d20-d23},[r0,:128]!
bne 1b
- ands r2, r2, #15
+ ands r3, r3, #15
beq 3f
-2: vld1.64 {d0-d1}, [r0,:128]!
- vld1.64 {d4-d5}, [r1,:128]!
- vst1.64 {d16-d17},[r3,:128]!
+2: vld1.64 {d0-d1}, [r1,:128]!
+ vld1.64 {d4-d5}, [r2,:128]!
+ vst1.64 {d16-d17},[r0,:128]!
vmul.f32 q8, q0, q2
- vld1.64 {d2-d3}, [r0,:128]!
- vld1.64 {d6-d7}, [r1,:128]!
- vst1.64 {d18-d19},[r3,:128]!
+ vld1.64 {d2-d3}, [r1,:128]!
+ vld1.64 {d6-d7}, [r2,:128]!
+ vst1.64 {d18-d19},[r0,:128]!
vmul.f32 q9, q1, q3
-3: vst1.64 {d16-d19},[r3,:128]!
+3: vst1.64 {d16-d19},[r0,:128]!
bx lr
endfunc
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index b704ba9144..a65b69e20a 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -41,34 +41,33 @@
* ARM VFP optimized implementation of 'vector_fmul_c' function.
* Assume that len is a positive number and is multiple of 8
*/
-@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
+@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
vpush {d8-d15}
- mov r3, r0
fmrx r12, fpscr
orr r12, r12, #(3 << 16) /* set vector size to 4 */
fmxr fpscr, r12
- vldmia r3!, {s0-s3}
- vldmia r1!, {s8-s11}
- vldmia r3!, {s4-s7}
- vldmia r1!, {s12-s15}
+ vldmia r1!, {s0-s3}
+ vldmia r2!, {s8-s11}
+ vldmia r1!, {s4-s7}
+ vldmia r2!, {s12-s15}
vmul.f32 s8, s0, s8
1:
- subs r2, r2, #16
+ subs r3, r3, #16
vmul.f32 s12, s4, s12
- vldmiage r3!, {s16-s19}
- vldmiage r1!, {s24-s27}
- vldmiage r3!, {s20-s23}
- vldmiage r1!, {s28-s31}
+ vldmiage r1!, {s16-s19}
+ vldmiage r2!, {s24-s27}
+ vldmiage r1!, {s20-s23}
+ vldmiage r2!, {s28-s31}
vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15}
vmulge.f32 s28, s20, s28
- vldmiagt r3!, {s0-s3}
- vldmiagt r1!, {s8-s11}
- vldmiagt r3!, {s4-s7}
- vldmiagt r1!, {s12-s15}
+ vldmiagt r1!, {s0-s3}
+ vldmiagt r2!, {s8-s11}
+ vldmiagt r1!, {s4-s7}
+ vldmiagt r2!, {s12-s15}
vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31}