diff options
-rw-r--r-- | libavcodec/arm/vp3dsp_neon.S | 22 | ||||
-rw-r--r-- | libavcodec/ppc/vp3dsp_altivec.c | 2 | ||||
-rw-r--r-- | libavcodec/vp3.c | 5 | ||||
-rw-r--r-- | libavcodec/vp3dsp.c | 5 | ||||
-rw-r--r-- | libavcodec/vp3dsp.h | 2 | ||||
-rw-r--r-- | libavcodec/x86/vp3dsp.asm | 27 | ||||
-rw-r--r-- | libavcodec/x86/vp3dsp_init.c | 2 |
7 files changed, 45 insertions, 20 deletions
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S index e09de57281..e5ecfc337e 100644 --- a/libavcodec/arm/vp3dsp_neon.S +++ b/libavcodec/arm/vp3dsp_neon.S @@ -108,14 +108,20 @@ endfunc function vp3_idct_start_neon vpush {d8-d15} + vmov.i16 q4, #0 + vmov.i16 q5, #0 movrel r3, vp3_idct_constants vld1.64 {d0-d1}, [r3,:128] - vld1.64 {d16-d19}, [r2,:128]! - vld1.64 {d20-d23}, [r2,:128]! - vld1.64 {d24-d27}, [r2,:128]! + vld1.64 {d16-d19}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vld1.64 {d20-d23}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! + vld1.64 {d24-d27}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! vadd.s16 q1, q8, q12 vsub.s16 q8, q8, q12 - vld1.64 {d28-d31}, [r2,:128]! + vld1.64 {d28-d31}, [r2,:128] + vst1.64 {q4-q5}, [r2,:128]! vp3_idct_core_neon: vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 @@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1 endfunc function ff_vp3_idct_dc_add_neon, export=1 - ldrsh r2, [r2] + ldrsh r12, [r2] mov r3, r0 - add r2, r2, #15 - vdup.16 q15, r2 + add r12, r12, #15 + vdup.16 q15, r12 + mov r12, 0 + strh r12, [r2] vshr.s16 q15, q15, #5 vld1.8 {d0}, [r0,:64], r1 diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c index 75a36779ce..6adf9aefac 100644 --- a/libavcodec/ppc/vp3dsp_altivec.c +++ b/libavcodec/ppc/vp3dsp_altivec.c @@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64]) PUT(b5) dst += stride; PUT(b6) dst += stride; PUT(b7) + memset(block, 0, sizeof(*block) * 64); } static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64]) @@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64]) ADD(b5) dst += stride; ADD(b6) dst += stride; ADD(b7) + memset(block, 0, sizeof(*block) * 64); } #endif /* HAVE_ALTIVEC */ diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c index 0340c22bb2..9417535314 100644 --- a/libavcodec/vp3.c +++ b/libavcodec/vp3.c @@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext { DSPContext dsp; VideoDSPContext vdsp; VP3DSPContext vp3dsp; + DECLARE_ALIGNED(16, DCTELEM, block)[64]; int flipped_image; int last_slice_end; int skip_loop_filter; @@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int static void render_slice(Vp3DecodeContext *s, int slice) { int x, y, i, j, fragment; - LOCAL_ALIGNED_16(DCTELEM, block, [64]); + DCTELEM *block = s->block; int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef; int motion_halfpel_index; uint8_t *motion_source; @@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice) } } - s->dsp.clear_block(block); - /* invert DCT and place (or add) in final output */ if (s->all_fragments[i].coding_method == MODE_INTRA) { diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c index 9b0b5d0a9c..9e6209dfdd 100644 --- a/libavcodec/vp3dsp.c +++ b/libavcodec/vp3dsp.c @@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){ idct(dest, line_size, block, 1); + memset(block, 0, sizeof(*block) * 64); } static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){ idct(dest, line_size, block, 2); + memset(block, 0, sizeof(*block) * 64); } static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, - const DCTELEM *block/*align 16*/){ + DCTELEM *block/*align 16*/){ int i, dc = (block[0] + 15) >> 5; for(i = 0; i < 8; i++){ @@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, dest[7] = av_clip_uint8(dest[7] + dc); dest += line_size; } + block[0] = 0; } static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h index 3781bbf3a7..feb300017a 100644 --- a/libavcodec/vp3dsp.h +++ b/libavcodec/vp3dsp.h @@ -25,7 +25,7 @@ typedef struct VP3DSPContext { void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block); void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block); - void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block); + void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block); void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values); void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values); diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index fc1e776a13..d2c464c5cf 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -562,6 +562,13 @@ cglobal vp3_idct_put, 3, 4, 9 %endif %assign %%i %%i+64 %endrep + + pxor m0, m0 +%assign %%offset 0 +%rep 128/mmsize + mova [r2+%%offset], m0 +%assign %%offset %%offset+mmsize +%endrep RET cglobal vp3_idct_add, 3, 4, 9 @@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9 movhps [r0+r1], m0 %endif lea r0, [r0+r1*2] +%assign %%offset 0 +%rep 32/mmsize + mova [r2+%%offset], m4 +%assign %%offset %%offset+mmsize +%endrep add r2, 32 dec r3 jg .loop @@ -620,7 +632,7 @@ vp3_idct_funcs paddusb m2, m0 movq m4, [r0+r1*2] paddusb m3, m0 - movq m5, [r0+r3 ] + movq m5, [r0+r2 ] paddusb m4, m0 paddusb m5, m0 psubusb m2, m1 @@ -630,7 +642,7 @@ vp3_idct_funcs movq [r0+r1 ], m3 psubusb m5, m1 movq [r0+r1*2], m4 - movq [r0+r3 ], m5 + movq [r0+r2 ], m5 %endmacro INIT_MMX mmxext @@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4 %if ARCH_X86_64 movsxd r1, r1d %endif - lea r3, [r1*3] - movsx r2, word [r2] - add r2, 15 - sar r2, 5 - movd m0, r2d + movsx r3, word [r2] + mov word [r2], 0 + lea r2, [r1*3] + add r3, 15 + sar r3, 5 + movd m0, r3d pshufw m0, m0, 0x0 pxor m1, m1 psubw m1, m0 diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index bbe74ba44a..95beeabfec 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size, - const DCTELEM *block); + DCTELEM *block); void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, int *bounding_values); |