summaryrefslogtreecommitdiff
path: root/libavcodec/arm
diff options
context:
space:
mode:
author David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
committer David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
commit eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch)
tree 23225d7976eefaf0292342e6ee8b4ac946efcb8e /libavcodec/arm
parent f32f7d8b24d1228df447be85046b9346292d936e (diff)
vp3: DC-only IDCT
2-4% faster overall decode.
Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- libavcodec/arm/dsputil_init_neon.c | 2
-rw-r--r-- libavcodec/arm/vp3dsp_neon.S | 44
2 files changed, 46 insertions, 0 deletions
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 1f2169ead5..0e44160392 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
void ff_vp3_idct_neon(DCTELEM *data);
void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
@@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
if (CONFIG_VP3_DECODER) {
c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+ c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon;
}
c->vector_fmul = ff_vector_fmul_neon;
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 6deae4725e..ade19984c2 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1
vst1.64 {d7}, [r2,:64], r1
bx lr
endfunc
+
+function ff_vp3_idct_dc_add_neon, export=1  @ add one scaled DC value to 8 rows of 8 pixels; per the C prototype r0 = dest, r1 = line_size, r2 = data
+ ldrsh r2, [r2]  @ r2 = first coefficient (signed 16-bit) read through the data pointer
+ movw r3, #46341  @ 46341 = round(65536 / sqrt(2)); NOTE(review): presumably the VP3 IDCT cos(pi/4) constant - confirm against the C IDCT
+ mul r2, r3, r2  @ r2 = 46341 * dc (low 32 bits of the product)
+ smulwt r2, r3, r2  @ r2 = (46341 * top halfword of r2) >> 16, i.e. 46341 * ((46341 * dc) >> 16) >> 16
+ mov r3, r0  @ keep a second copy of dest for the stores; r0 is post-incremented by the loads
+ vdup.16 q15, r2  @ broadcast the low 16 bits of r2 into all eight 16-bit lanes of q15
+ vrshr.s16 q15, q15, #4  @ rounding shift right by 4: each lane becomes (x + 8) >> 4
+
+ vld1.8 {d0}, [r0,:64], r1  @ load 8 pixels of row 0 (64-bit aligned access), then r0 += r1
+ vld1.8 {d1}, [r0,:64], r1  @ row 1
+ vld1.8 {d2}, [r0,:64], r1  @ row 2
+ vaddw.u8 q8, q15, d0  @ q8 = dc + row 0 pixels zero-extended to 16 bits
+ vld1.8 {d3}, [r0,:64], r1  @ row 3
+ vaddw.u8 q9, q15, d1  @ q9 = dc + row 1
+ vld1.8 {d4}, [r0,:64], r1  @ row 4
+ vaddw.u8 q10, q15, d2  @ q10 = dc + row 2
+ vld1.8 {d5}, [r0,:64], r1  @ row 5
+ vaddw.u8 q11, q15, d3  @ q11 = dc + row 3
+ vld1.8 {d6}, [r0,:64], r1  @ row 6
+ vaddw.u8 q12, q15, d4  @ q12 = dc + row 4
+ vld1.8 {d7}, [r0,:64], r1  @ row 7
+ vaddw.u8 q13, q15, d5  @ q13 = dc + row 5
+ vqmovun.s16 d0, q8  @ narrow row 0 back to bytes with unsigned saturation (clamps to 0..255)
+ vaddw.u8 q14, q15, d6  @ q14 = dc + row 6
+ vqmovun.s16 d1, q9  @ narrow row 1
+ vaddw.u8 q15, q15, d7  @ q15 = dc + row 7; overwrites the dc vector, which is not read again
+ vqmovun.s16 d2, q10  @ narrow row 2
+ vst1.8 {d0}, [r3,:64], r1  @ store row 0 back through the saved dest pointer, then r3 += r1
+ vqmovun.s16 d3, q11  @ narrow row 3
+ vst1.8 {d1}, [r3,:64], r1  @ store row 1
+ vqmovun.s16 d4, q12  @ narrow row 4
+ vst1.8 {d2}, [r3,:64], r1  @ store row 2
+ vqmovun.s16 d5, q13  @ narrow row 5
+ vst1.8 {d3}, [r3,:64], r1  @ store row 3
+ vqmovun.s16 d6, q14  @ narrow row 6
+ vst1.8 {d4}, [r3,:64], r1  @ store row 4
+ vqmovun.s16 d7, q15  @ narrow row 7
+ vst1.8 {d5}, [r3,:64], r1  @ store row 5
+ vst1.8 {d6}, [r3,:64], r1  @ store row 6
+ vst1.8 {d7}, [r3,:64], r1  @ store row 7
+ bx lr  @ return to caller
+endfunc