From 6eabb0d3ad42b91c1b4c298718c29961f7c1653a Mon Sep 17 00:00:00 2001 From: Justin Ruggles Date: Thu, 13 Jan 2011 15:28:06 -0500 Subject: Change DSPContext.vector_fmul() from dst=dst*src to dest=src0*src1. Signed-off-by: Mans Rullgard --- libavcodec/arm/dsputil_init_neon.c | 2 +- libavcodec/arm/dsputil_init_vfp.c | 3 ++- libavcodec/arm/dsputil_neon.S | 45 +++++++++++++++++++------------------- libavcodec/arm/dsputil_vfp.S | 29 ++++++++++++------------ 4 files changed, 39 insertions(+), 40 deletions(-) (limited to 'libavcodec/arm') diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 04ebb00576..221183cef8 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -138,7 +138,7 @@ void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); -void ff_vector_fmul_neon(float *dst, const float *src, int len); +void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_window_neon(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c index 9f8c1b7840..76ef6b4171 100644 --- a/libavcodec/arm/dsputil_init_vfp.c +++ b/libavcodec/arm/dsputil_init_vfp.c @@ -21,7 +21,8 @@ #include "libavcodec/dsputil.h" #include "dsputil_arm.h" -void ff_vector_fmul_vfp(float *dst, const float *src, int len); +void ff_vector_fmul_vfp(float *dst, const float *src0, + const float *src1, int len); void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index 2bcdb397f9..42fb38de52 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -738,42 +738,41 @@ function ff_float_to_int16_interleave_neon, export=1 endfunc function ff_vector_fmul_neon, export=1 - mov r3, r0 - subs r2, r2, #8 - vld1.64 {d0-d3}, [r0,:128]! - vld1.64 {d4-d7}, [r1,:128]! + subs r3, r3, #8 + vld1.64 {d0-d3}, [r1,:128]! + vld1.64 {d4-d7}, [r2,:128]! vmul.f32 q8, q0, q2 vmul.f32 q9, q1, q3 beq 3f - bics ip, r2, #15 + bics ip, r3, #15 beq 2f 1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! + vld1.64 {d0-d1}, [r1,:128]! + vld1.64 {d4-d5}, [r2,:128]! vmul.f32 q10, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! + vld1.64 {d2-d3}, [r1,:128]! + vld1.64 {d6-d7}, [r2,:128]! vmul.f32 q11, q1, q3 - vst1.64 {d16-d19},[r3,:128]! - vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! + vst1.64 {d16-d19},[r0,:128]! + vld1.64 {d0-d1}, [r1,:128]! + vld1.64 {d4-d5}, [r2,:128]! vmul.f32 q8, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! + vld1.64 {d2-d3}, [r1,:128]! + vld1.64 {d6-d7}, [r2,:128]! vmul.f32 q9, q1, q3 - vst1.64 {d20-d23},[r3,:128]! + vst1.64 {d20-d23},[r0,:128]! bne 1b - ands r2, r2, #15 + ands r3, r3, #15 beq 3f -2: vld1.64 {d0-d1}, [r0,:128]! - vld1.64 {d4-d5}, [r1,:128]! - vst1.64 {d16-d17},[r3,:128]! +2: vld1.64 {d0-d1}, [r1,:128]! + vld1.64 {d4-d5}, [r2,:128]! + vst1.64 {d16-d17},[r0,:128]! vmul.f32 q8, q0, q2 - vld1.64 {d2-d3}, [r0,:128]! - vld1.64 {d6-d7}, [r1,:128]! - vst1.64 {d18-d19},[r3,:128]! + vld1.64 {d2-d3}, [r1,:128]! + vld1.64 {d6-d7}, [r2,:128]! + vst1.64 {d18-d19},[r0,:128]! vmul.f32 q9, q1, q3 -3: vst1.64 {d16-d19},[r3,:128]! +3: vst1.64 {d16-d19},[r0,:128]! bx lr endfunc diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S index b704ba9144..a65b69e20a 100644 --- a/libavcodec/arm/dsputil_vfp.S +++ b/libavcodec/arm/dsputil_vfp.S @@ -41,34 +41,33 @@ * ARM VFP optimized implementation of 'vector_fmul_c' function. * Assume that len is a positive number and is multiple of 8 */ -@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) +@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len) function ff_vector_fmul_vfp, export=1 vpush {d8-d15} - mov r3, r0 fmrx r12, fpscr orr r12, r12, #(3 << 16) /* set vector size to 4 */ fmxr fpscr, r12 - vldmia r3!, {s0-s3} - vldmia r1!, {s8-s11} - vldmia r3!, {s4-s7} - vldmia r1!, {s12-s15} + vldmia r1!, {s0-s3} + vldmia r2!, {s8-s11} + vldmia r1!, {s4-s7} + vldmia r2!, {s12-s15} vmul.f32 s8, s0, s8 1: - subs r2, r2, #16 + subs r3, r3, #16 vmul.f32 s12, s4, s12 - vldmiage r3!, {s16-s19} - vldmiage r1!, {s24-s27} - vldmiage r3!, {s20-s23} - vldmiage r1!, {s28-s31} + vldmiage r1!, {s16-s19} + vldmiage r2!, {s24-s27} + vldmiage r1!, {s20-s23} + vldmiage r2!, {s28-s31} vmulge.f32 s24, s16, s24 vstmia r0!, {s8-s11} vstmia r0!, {s12-s15} vmulge.f32 s28, s20, s28 - vldmiagt r3!, {s0-s3} - vldmiagt r1!, {s8-s11} - vldmiagt r3!, {s4-s7} - vldmiagt r1!, {s12-s15} + vldmiagt r1!, {s0-s3} + vldmiagt r2!, {s8-s11} + vldmiagt r1!, {s4-s7} + vldmiagt r2!, {s12-s15} vmulge.f32 s8, s0, s8 vstmiage r0!, {s24-s27} vstmiage r0!, {s28-s31} -- cgit v1.2.3