From aeaf268e52fc11c1f64914a319e0edddf1346d6a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 18 Jan 2013 16:43:04 +0100 Subject: vp3: integrate clear_blocks with idct of previous block. This is identical to what e.g. vp8 does, and prevents the function call overhead (plus dependency on dsputil for this particular function). Arm asm updated by Janne Grunau. Signed-off-by: Janne Grunau --- libavcodec/x86/vp3dsp.asm | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'libavcodec/x86/vp3dsp.asm') diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index fc1e776a13..d2c464c5cf 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -561,6 +561,13 @@ cglobal vp3_idct_put, 3, 4, 9 movhps [r0+r3 ], m3 %endif %assign %%i %%i+64 +%endrep + + pxor m0, m0 +%assign %%offset 0 +%rep 128/mmsize + mova [r2+%%offset], m0 +%assign %%offset %%offset+mmsize %endrep RET @@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9 movhps [r0+r1], m0 %endif lea r0, [r0+r1*2] +%assign %%offset 0 +%rep 32/mmsize + mova [r2+%%offset], m4 +%assign %%offset %%offset+mmsize +%endrep add r2, 32 dec r3 jg .loop @@ -620,7 +632,7 @@ vp3_idct_funcs paddusb m2, m0 movq m4, [r0+r1*2] paddusb m3, m0 - movq m5, [r0+r3 ] + movq m5, [r0+r2 ] paddusb m4, m0 paddusb m5, m0 psubusb m2, m1 @@ -630,7 +642,7 @@ vp3_idct_funcs movq [r0+r1 ], m3 psubusb m5, m1 movq [r0+r1*2], m4 - movq [r0+r3 ], m5 + movq [r0+r2 ], m5 %endmacro INIT_MMX mmxext @@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4 %if ARCH_X86_64 movsxd r1, r1d %endif - lea r3, [r1*3] - movsx r2, word [r2] - add r2, 15 - sar r2, 5 - movd m0, r2d + movsx r3, word [r2] + mov word [r2], 0 + lea r2, [r1*3] + add r3, 15 + sar r3, 5 + movd m0, r3d pshufw m0, m0, 0x0 pxor m1, m1 psubw m1, m0 -- cgit v1.2.3