From eb6a6cd788a172f146534c5fab9b98d6cbf59520 Mon Sep 17 00:00:00 2001
From: David Conrad
Date: Sat, 17 Apr 2010 02:04:30 +0000
Subject: vp3: DC-only IDCT

2-4% faster overall decode

Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/arm/dsputil_init_neon.c |  2 ++
 libavcodec/arm/vp3dsp_neon.S       | 44 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

(limited to 'libavcodec/arm')

diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 1f2169ead5..0e44160392 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_neon(DCTELEM *data);
 void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
 
 void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
 void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
@@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     if (CONFIG_VP3_DECODER) {
         c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
         c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon;
     }
 
     c->vector_fmul = ff_vector_fmul_neon;
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 6deae4725e..ade19984c2 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1
     vst1.64 {d7}, [r2,:64], r1
     bx lr
 endfunc
+// Add one scaled, rounded value derived from data[0] to 8 rows of 8 bytes at dest, saturating each to 0..255.
+function ff_vp3_idct_dc_add_neon, export=1
+    ldrsh r2, [r2]                      // r2 = signed 16-bit data[0]; no other coefficient is read (r0=dest, r1=line_size, r2=data per the C prototype and AAPCS)
+    movw r3, #46341                     // 46341 ~= 65536/sqrt(2); NOTE(review): presumably the VP3 IDCT DC scale factor - confirm against the C reference
+    mul r2, r3, r2                      // r2 = 46341 * dc (low 32 bits)
+    smulwt r2, r3, r2                   // r2 = (46341 * (r2 >> 16)) >> 16: second multiply uses the top halfword of the first product
+    mov r3, r0                          // r3 = store pointer; r0 is post-incremented by the loads below
+    vdup.16 q15, r2                     // broadcast the low 16 bits of r2 to all eight 16-bit lanes
+    vrshr.s16 q15, q15, #4              // rounding shift: q15 = (q15 + 8) >> 4
+// Rows: load 8 bytes, widen+add q15, saturate to u8, store. Steps of different rows are interleaved; each ":64" asserts a 64-bit aligned row address.
+    vld1.8 {d0}, [r0,:64], r1           // row 0 -> d0, then r0 += line_size
+    vld1.8 {d1}, [r0,:64], r1           // row 1
+    vld1.8 {d2}, [r0,:64], r1           // row 2
+    vaddw.u8 q8, q15, d0                // q8 = q15 + zero-extended row 0 bytes
+    vld1.8 {d3}, [r0,:64], r1           // row 3
+    vaddw.u8 q9, q15, d1                // row 1 sum
+    vld1.8 {d4}, [r0,:64], r1           // row 4
+    vaddw.u8 q10, q15, d2               // row 2 sum
+    vld1.8 {d5}, [r0,:64], r1           // row 5
+    vaddw.u8 q11, q15, d3               // row 3 sum
+    vld1.8 {d6}, [r0,:64], r1           // row 6
+    vaddw.u8 q12, q15, d4               // row 4 sum
+    vld1.8 {d7}, [r0,:64], r1           // row 7 (last load)
+    vaddw.u8 q13, q15, d5               // row 5 sum
+    vqmovun.s16 d0, q8                  // narrow row 0 with unsigned saturation: clamps to 0..255
+    vaddw.u8 q14, q15, d6               // row 6 sum
+    vqmovun.s16 d1, q9                  // row 1 -> u8
+    vaddw.u8 q15, q15, d7               // row 7 sum overwrites q15; the broadcast value is not needed after this
+    vqmovun.s16 d2, q10                 // row 2 -> u8
+    vst1.8 {d0}, [r3,:64], r1           // write row 0 back to dest, then r3 += line_size
+    vqmovun.s16 d3, q11                 // row 3 -> u8
+    vst1.8 {d1}, [r3,:64], r1           // store row 1
+    vqmovun.s16 d4, q12                 // row 4 -> u8
+    vst1.8 {d2}, [r3,:64], r1           // store row 2
+    vqmovun.s16 d5, q13                 // row 5 -> u8
+    vst1.8 {d3}, [r3,:64], r1           // store row 3
+    vqmovun.s16 d6, q14                 // row 6 -> u8
+    vst1.8 {d4}, [r3,:64], r1           // store row 4
+    vqmovun.s16 d7, q15                 // row 7 -> u8
+    vst1.8 {d5}, [r3,:64], r1           // store row 5
+    vst1.8 {d6}, [r3,:64], r1           // store row 6
+    vst1.8 {d7}, [r3,:64], r1           // store row 7
+    bx lr                               // return; no stack or callee-saved core registers are touched
+endfunc
-- 
cgit v1.2.3