From 729f90e26802057f06905ab15a34612168eeac80 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 4 Apr 2012 20:03:15 +0000 Subject: x86inc improvements for 64-bit Add support for all x86-64 registers Prefer caller-saved register over callee-saved on WIN64 Support up to 15 function arguments Also (by Ronald S. Bultje) Fix up our asm to work with new x86inc.asm. Signed-off-by: Ronald S. Bultje Signed-off-by: Justin Ruggles --- libavcodec/x86/h264_deblock.asm | 60 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'libavcodec/x86/h264_deblock.asm') diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 0f61922276..1982dc4bd3 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -328,11 +328,11 @@ cglobal deblock_v_luma_8_%1, 5,5,10 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal deblock_h_luma_8_%1, 5,7 - movsxd r10, r1d - lea r11, [r10+r10*2] +cglobal deblock_h_luma_8_%1, 5,9 + movsxd r7, r1d + lea r8, [r7+r7*2] lea r6, [r0-4] - lea r5, [r0-4+r11] + lea r5, [r0-4+r8] %if WIN64 sub rsp, 0x98 %define pix_tmp rsp+0x30 @@ -342,14 +342,14 @@ cglobal deblock_h_luma_8_%1, 5,7 %endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp + lea r6, [r6+r7*8] + lea r5, [r5+r7*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %if WIN64 @@ -364,17 +364,17 @@ cglobal deblock_h_luma_8_%1, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) %if WIN64 add rsp, 0x98 @@ -705,32 +705,32 @@ INIT_MMX ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_8_%1, 4,7 - movsxd r10, r1d - lea r11, [r10*3] +cglobal deblock_h_luma_intra_8_%1, 4,9 + movsxd r7, r1d + lea r8, [r7*3] lea r6, [r0-4] - lea r5, [r0-4+r11] + lea r5, [r0-4+r8] sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r7*8] + lea r5, [r5+r7*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) lea r0, [pix_tmp+0x40] mov r1, 0x10 call deblock_v_luma_intra_8_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r5, [r6+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) + lea r5, [r6+r8] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) + shl r7, 3 + sub r6, r7 + sub r5, r7 + shr r7, 3 + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) add rsp, 0x88 RET %else -- cgit v1.2.3