From aeaf268e52fc11c1f64914a319e0edddf1346d6a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 18 Jan 2013 16:43:04 +0100 Subject: vp3: integrate clear_blocks with idct of previous block. This is identical to what e.g. vp8 does, and prevents the function call overhead (plus dependency on dsputil for this particular function). Arm asm updated by Janne Grunau. Signed-off-by: Janne Grunau --- libavcodec/arm/vp3dsp_neon.S | 22 +++++++++++++++------- libavcodec/ppc/vp3dsp_altivec.c | 2 ++ libavcodec/vp3.c | 5 ++--- libavcodec/vp3dsp.c | 5 ++++- libavcodec/vp3dsp.h | 2 +- libavcodec/x86/vp3dsp.asm | 27 ++++++++++++++++++++------- libavcodec/x86/vp3dsp_init.c | 2 +- 7 files changed, 45 insertions(+), 20 deletions(-) (limited to 'libavcodec') diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S index e09de57281..e5ecfc337e 100644 --- a/libavcodec/arm/vp3dsp_neon.S +++ b/libavcodec/arm/vp3dsp_neon.S @@ -108,14 +108,20 @@ endfunc function vp3_idct_start_neon vpush {d8-d15} + vmov.i16 q4, #0 + vmov.i16 q5, #0 movrel r3, vp3_idct_constants vld1.64 {d0-d1}, [r3,:128] - vld1.64 {d16-d19}, [r2,:128]! - vld1.64 {d20-d23}, [r2,:128]! - vld1.64 {d24-d27}, [r2,:128]! + vld1.64 {d16-d19}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vld1.64 {d20-d23}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vld1.64 {d24-d27}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! vadd.s16 q1, q8, q12 vsub.s16 q8, q8, q12 - vld1.64 {d28-d31}, [r2,:128]! + vld1.64 {d28-d31}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! 
vp3_idct_core_neon: vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 @@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1 endfunc function ff_vp3_idct_dc_add_neon, export=1 - ldrsh r2, [r2] + ldrsh r12, [r2] mov r3, r0 - add r2, r2, #15 - vdup.16 q15, r2 + add r12, r12, #15 + vdup.16 q15, r12 + mov r12, 0 + strh r12, [r2] vshr.s16 q15, q15, #5 vld1.8 {d0}, [r0,:64], r1 diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c index 75a36779ce..6adf9aefac 100644 --- a/libavcodec/ppc/vp3dsp_altivec.c +++ b/libavcodec/ppc/vp3dsp_altivec.c @@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64]) PUT(b5) dst += stride; PUT(b6) dst += stride; PUT(b7) + memset(block, 0, sizeof(*block) * 64); } static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64]) @@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64]) ADD(b5) dst += stride; ADD(b6) dst += stride; ADD(b7) + memset(block, 0, sizeof(*block) * 64); } #endif /* HAVE_ALTIVEC */ diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c index 0340c22bb2..9417535314 100644 --- a/libavcodec/vp3.c +++ b/libavcodec/vp3.c @@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext { DSPContext dsp; VideoDSPContext vdsp; VP3DSPContext vp3dsp; + DECLARE_ALIGNED(16, DCTELEM, block)[64]; int flipped_image; int last_slice_end; int skip_loop_filter; @@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int static void render_slice(Vp3DecodeContext *s, int slice) { int x, y, i, j, fragment; - LOCAL_ALIGNED_16(DCTELEM, block, [64]); + DCTELEM *block = s->block; int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef; int motion_halfpel_index; uint8_t *motion_source; @@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice) } } - s->dsp.clear_block(block); - /* invert DCT and place (or add) in final output */ if (s->all_fragments[i].coding_method == MODE_INTRA) { diff 
--git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c index 9b0b5d0a9c..9e6209dfdd 100644 --- a/libavcodec/vp3dsp.c +++ b/libavcodec/vp3dsp.c @@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){ idct(dest, line_size, block, 1); + memset(block, 0, sizeof(*block) * 64); } static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){ idct(dest, line_size, block, 2); + memset(block, 0, sizeof(*block) * 64); } static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, - const DCTELEM *block/*align 16*/){ + DCTELEM *block/*align 16*/){ int i, dc = (block[0] + 15) >> 5; for(i = 0; i < 8; i++){ @@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, dest[7] = av_clip_uint8(dest[7] + dc); dest += line_size; } + block[0] = 0; } static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h index 3781bbf3a7..feb300017a 100644 --- a/libavcodec/vp3dsp.h +++ b/libavcodec/vp3dsp.h @@ -25,7 +25,7 @@ typedef struct VP3DSPContext { void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block); void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block); - void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block); + void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block); void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values); void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values); diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index fc1e776a13..d2c464c5cf 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -561,6 +561,13 @@ cglobal vp3_idct_put, 3, 4, 9 movhps [r0+r3 ], m3 %endif %assign %%i %%i+64 +%endrep + + pxor m0, m0 +%assign %%offset 0 +%rep 128/mmsize + mova [r2+%%offset], m0 +%assign %%offset 
%%offset+mmsize %endrep RET @@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9 movhps [r0+r1], m0 %endif lea r0, [r0+r1*2] +%assign %%offset 0 +%rep 32/mmsize + mova [r2+%%offset], m4 +%assign %%offset %%offset+mmsize +%endrep add r2, 32 dec r3 jg .loop @@ -620,7 +632,7 @@ vp3_idct_funcs paddusb m2, m0 movq m4, [r0+r1*2] paddusb m3, m0 - movq m5, [r0+r3 ] + movq m5, [r0+r2 ] paddusb m4, m0 paddusb m5, m0 psubusb m2, m1 @@ -630,7 +642,7 @@ vp3_idct_funcs movq [r0+r1 ], m3 psubusb m5, m1 movq [r0+r1*2], m4 - movq [r0+r3 ], m5 + movq [r0+r2 ], m5 %endmacro INIT_MMX mmxext @@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4 %if ARCH_X86_64 movsxd r1, r1d %endif - lea r3, [r1*3] - movsx r2, word [r2] - add r2, 15 - sar r2, 5 - movd m0, r2d + movsx r3, word [r2] + mov word [r2], 0 + lea r2, [r1*3] + add r3, 15 + sar r3, 5 + movd m0, r3d pshufw m0, m0, 0x0 pxor m1, m1 psubw m1, m0 diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index bbe74ba44a..95beeabfec 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size, - const DCTELEM *block); + DCTELEM *block); void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, int *bounding_values); -- cgit v1.2.3