author     Henrik Gramner <hengar-6@student.ltu.se>      2012-04-04 20:03:15 +0000
committer  Justin Ruggles <justin.ruggles@gmail.com>     2012-04-11 15:47:00 -0400
commit     729f90e26802057f06905ab15a34612168eeac80 (patch)
tree       41f8c4cedf10851b5b437aeeb558ce3d0f8db1dc
parent     e1ce756844e684876318570dcebc74bc66c084f0 (diff)
x86inc improvements for 64-bit
Add support for all x86-64 registers.
Prefer caller-saved registers over callee-saved ones on WIN64.
Support up to 15 function arguments.

Also (by Ronald S. Bultje): fix up our asm to work with the new x86inc.asm.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
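As a rough illustration of what the updated interface allows (a hypothetical sketch, not code from this commit; the function name and body are made up), a 64-bit function can now simply request extra general-purpose registers on its cglobal line and let PROLOGUE/RET save and restore whichever of them are callee-saved on the target ABI, instead of hard-coding r10/r11 as the old code did:

; Hypothetical usage sketch (x86-64 only; names are illustrative).
; Requesting 9 GPRs makes r7 and r8 usable: on UNIX64 they map to the
; caller-saved r10/r11 (no push needed), on WIN64 to rdi/rsi, which
; PROLOGUE pushes and RET pops automatically.
INIT_XMM
cglobal copy_row3_to_row4, 3, 9, 1, dst, src, stride
    lea   r7, [strideq*3]   ; r7 = 3*stride
    lea   r8, [strideq*4]   ; r8 = 4*stride
    movq  m0, [srcq+r7]     ; load 8 bytes from row 3 of src
    movq  [dstq+r8], m0     ; store them into row 4 of dst
    RET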
-rw-r--r--  libavcodec/x86/dsputil_yasm.asm     |  36
-rw-r--r--  libavcodec/x86/fft_mmx.asm          |  25
-rw-r--r--  libavcodec/x86/fmtconvert.asm       |   6
-rw-r--r--  libavcodec/x86/h264_chromamc.asm    |  48
-rw-r--r--  libavcodec/x86/h264_deblock.asm     |  60
-rw-r--r--  libavcodec/x86/h264_idct.asm        | 156
-rw-r--r--  libavcodec/x86/h264_idct_10bit.asm  |  24
-rw-r--r--  libavcodec/x86/h264_intrapred.asm   |  30
-rw-r--r--  libavcodec/x86/h264_qpel_10bit.asm  |  20
-rw-r--r--  libavcodec/x86/h264_weight.asm      |  14
-rw-r--r--  libavutil/x86/x86inc.asm            | 218
-rw-r--r--  libswscale/x86/output.asm           |   4
-rw-r--r--  libswscale/x86/scale.asm            |  18

13 files changed, 318 insertions(+), 341 deletions(-)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 00dc18b469..bec4063260 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -497,9 +497,9 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
-%define w_reg r10
-cglobal emu_edge_core, 6, 7, 1
- mov r11, r5 ; save block_h
+%define w_reg r7
+cglobal emu_edge_core, 6, 9, 1
+ mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
@@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0
sub r0, w_reg
%if ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
- mov r5, r11
+ mov r5, r8
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m
@@ -550,7 +550,7 @@ cglobal emu_edge_core, 2, 7, 0
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar w_reg, 1
sal w_reg, 6
- ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
+ ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
@@ -560,7 +560,7 @@ cglobal emu_edge_core, 2, 7, 0
%endif
call w_reg
- ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
+ ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
mov r0, r0m
@@ -591,7 +591,7 @@ cglobal emu_edge_core, 2, 7, 0
%define vall al
%define valh ah
%define valw ax
-%define valw2 r10w
+%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r4w
@@ -618,7 +618,7 @@ cglobal emu_edge_core, 2, 7, 0
; - else if (%2 & 8) fills 8 bytes into mm0
; - if (%2 & 7 == 4) fills the last 4 bytes into rax
; - else if (%2 & 4) fills 4 bytes into mm0-1
-; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
+; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
; (note that we're using r3 for body/bottom because it's a shorter
; opcode, and then the loop fits in 128 bytes)
; - else fills remaining bytes into rax
@@ -848,7 +848,7 @@ ALIGN 64
%endrep
%endmacro ; LEFT_EXTEND
-; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
+; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
@@ -858,7 +858,7 @@ ALIGN 64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
- dec r11
+ dec r8
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
@@ -937,11 +937,11 @@ ALIGN 64
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
-; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
+; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
- push r11 ; save old value of block_h
+ push r8 ; save old value of block_h
test r3, r3
-%define cnt_reg r11
+%define cnt_reg r8
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3
%else
@@ -955,7 +955,7 @@ ALIGN 64
V_COPY_ROW body, r4
%if ARCH_X86_64
- pop r11 ; restore old value of block_h
+ pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5
@@ -974,7 +974,7 @@ ALIGN 64
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
-; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
+; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg]
@@ -1002,11 +1002,11 @@ ALIGN 64
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
-; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
-; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
+; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
+; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
-%define bh_reg r11
+%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index a2f26cca33..225c66635d 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -749,14 +749,11 @@ INIT_XMM
%endmacro
%macro DECL_IMDCT 2
-cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
+cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
-%define rrevtab r10
-%define rtcos r11
-%define rtsin r12
- push r12
- push r13
- push r14
+%define rrevtab r7
+%define rtcos r8
+%define rtsin r9
%else
%define rrevtab r6
%define rtsin r6
@@ -798,12 +795,12 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
%if ARCH_X86_64
movzx r5, word [rrevtab+r4-4]
movzx r6, word [rrevtab+r4-2]
- movzx r13, word [rrevtab+r3]
- movzx r14, word [rrevtab+r3+2]
+ movzx r10, word [rrevtab+r3]
+ movzx r11, word [rrevtab+r3+2]
movlps [r1+r5 *8], xmm0
movhps [r1+r6 *8], xmm0
- movlps [r1+r13*8], xmm1
- movhps [r1+r14*8], xmm1
+ movlps [r1+r10*8], xmm1
+ movhps [r1+r11*8], xmm1
add r4, 4
%else
mov r6, [esp]
@@ -839,11 +836,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
mov r1, -mmsize
sub r1, r0
%2 r0, r1, r6, rtcos, rtsin
-%if ARCH_X86_64
- pop r14
- pop r13
- pop r12
-%else
+%if ARCH_X86_64 == 0
add esp, 12
%endif
%ifidn avx_enabled, 1
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 3f39c7e564..63befc94f6 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -179,9 +179,8 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
+cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
- %define lend r10d
mov lend, r2d
%else
%define lend dword r2m
@@ -240,9 +239,8 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
;-----------------------------------------------------------------------------
%macro FLOAT_INTERLEAVE6 2
-cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
+cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
- %define lend r10d
mov lend, r2d
%else
%define lend dword r2m
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 8b621fa8bb..64a4efe057 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -91,9 +91,22 @@ SECTION .text
%endmacro
%macro chroma_mc8_mmx_func 3
+%ifidn %2, rv40
+%ifdef PIC
+%define rnd_1d_rv40 r8
+%define rnd_2d_rv40 r8
+%define extra_regs 2
+%else ; no-PIC
+%define rnd_1d_rv40 rnd_rv40_1d_tbl
+%define rnd_2d_rv40 rnd_rv40_2d_tbl
+%define extra_regs 1
+%endif ; PIC
+%else
+%define extra_regs 0
+%endif ; rv40
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
-cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
+cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
@@ -106,19 +119,12 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
.at_least_one_non_zero
%ifidn %2, rv40
-%ifdef PIC
-%define rnd_1d_rv40 r11
-%define rnd_2d_rv40 r11
-%else ; no-PIC
-%define rnd_1d_rv40 rnd_rv40_1d_tbl
-%define rnd_2d_rv40 rnd_rv40_2d_tbl
-%endif
%if ARCH_X86_64
- mov r10, r5
- and r10, 6 ; &~1 for mx/my=[0,7]
- lea r10, [r10*4+r4]
- sar r10d, 1
-%define rnd_bias r10
+ mov r7, r5
+ and r7, 6 ; &~1 for mx/my=[0,7]
+ lea r7, [r7*4+r4]
+ sar r7d, 1
+%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
mov r0, r5
@@ -145,7 +151,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%ifidn %2, rv40
%ifdef PIC
- lea r11, [rnd_rv40_1d_tbl]
+ lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
@@ -196,7 +202,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
- lea r11, [rnd_rv40_2d_tbl]
+ lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
@@ -278,7 +284,13 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%endmacro
%macro chroma_mc4_mmx_func 3
-cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
+%define extra_regs 0
+%ifidn %2, rv40
+%ifdef PIC
+%define extra_regs 1
+%endif ; PIC
+%endif ; rv40
+cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
@@ -296,8 +308,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%ifidn %2, rv40
%ifdef PIC
- lea r11, [rnd_rv40_2d_tbl]
-%define rnd_2d_rv40 r11
+ lea r6, [rnd_rv40_2d_tbl]
+%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 0f61922276..1982dc4bd3 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -328,11 +328,11 @@ cglobal deblock_v_luma_8_%1, 5,5,10
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal deblock_h_luma_8_%1, 5,7
- movsxd r10, r1d
- lea r11, [r10+r10*2]
+cglobal deblock_h_luma_8_%1, 5,9
+ movsxd r7, r1d
+ lea r8, [r7+r7*2]
lea r6, [r0-4]
- lea r5, [r0-4+r11]
+ lea r5, [r0-4+r8]
%if WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
@@ -342,14 +342,14 @@ cglobal deblock_h_luma_8_%1, 5,7
%endif
; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%if WIN64
@@ -364,17 +364,17 @@ cglobal deblock_h_luma_8_%1, 5,7
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
%if WIN64
add rsp, 0x98
@@ -705,32 +705,32 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8_%1, 4,7
- movsxd r10, r1d
- lea r11, [r10*3]
+cglobal deblock_h_luma_intra_8_%1, 4,9
+ movsxd r7, r1d
+ lea r8, [r7*3]
lea r6, [r0-4]
- lea r5, [r0-4+r11]
+ lea r5, [r0-4+r8]
sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call deblock_v_luma_intra_8_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- lea r5, [r6+r11]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
+ lea r5, [r6+r8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
add rsp, 0x88
RET
%else
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 5e8c0edfa6..cc83806884 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -45,8 +45,10 @@ scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
-%define scan8 r11
+%define npicregs 1
+%define scan8 picregq
%else
+%define npicregs 0
%define scan8 scan8_mem
%endif
@@ -301,10 +303,10 @@ cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_mmx, 5, 7, 0
+cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -323,13 +325,13 @@ cglobal h264_idct_add16_8_mmx, 5, 7, 0
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_mmx, 5, 7, 0
+cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -355,10 +357,10 @@ cglobal h264_idct8_add4_8_mmx, 5, 7, 0
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_mmx2, 5, 7, 0
+cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -371,16 +373,13 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
test r6, r6
jz .no_dc
DC_ADD_MMX2_INIT r2, r3, r6
-%if ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
%endif
- mov dst_regd, dword [r1+r5*4]
- lea dst_reg, [r0+dst_reg]
- DC_ADD_MMX2_OP movh, dst_reg, r3, r6
+ mov dst2d, dword [r1+r5*4]
+ lea dst2q, [r0+dst2q]
+ DC_ADD_MMX2_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
@@ -402,10 +401,10 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
+cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -425,10 +424,10 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
+cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -448,16 +447,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
test r6, r6
jz .skipblock
DC_ADD_MMX2_INIT r2, r3, r6
-%if ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
%endif
- mov dst_regd, dword [r1+r5*4]
- add dst_reg, r0
- DC_ADD_MMX2_OP movh, dst_reg, r3, r6
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ DC_ADD_MMX2_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
@@ -470,13 +466,13 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
+cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -489,18 +485,15 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
test r6, r6
jz .no_dc
DC_ADD_MMX2_INIT r2, r3, r6
-%if ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
-%endif
- mov dst_regd, dword [r1+r5*4]
- lea dst_reg, [r0+dst_reg]
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
- lea dst_reg, [dst_reg+r3*4]
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ lea dst2q, [r0+dst2q]
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
+ lea dst2q, [dst2q+r3*4]
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
@@ -533,10 +526,10 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_sse2, 5, 7, 10
+cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
@@ -550,18 +543,15 @@ cglobal h264_idct8_add4_8_sse2, 5, 7, 10
jz .no_dc
INIT_MMX
DC_ADD_MMX2_INIT r2, r3, r6
-%if ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
-%endif
- mov dst_regd, dword [r1+r5*4]
- add dst_reg, r0
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
- lea dst_reg, [dst_reg+r3*4]
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
+ lea dst2q, [dst2q+r3*4]
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
@@ -572,9 +562,9 @@ INIT_MMX
REP_RET
.no_dc
INIT_XMM
- mov dst_regd, dword [r1+r5*4]
- add dst_reg, r0
- IDCT8_ADD_SSE dst_reg, r2, r3, r6
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
@@ -595,7 +585,7 @@ h264_idct_add8_mmx_plane:
jz .skipblock
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
- add r0, [r10]
+ add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
@@ -611,20 +601,20 @@ h264_idct_add8_mmx_plane:
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_mmx, 5, 7, 0
+cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
%if ARCH_X86_64
- mov r10, r0
+ mov dst2q, r0
%endif
call h264_idct_add8_mmx_plane
mov r5, 32
add r2, 384
%if ARCH_X86_64
- add r10, gprsize
+ add dst2q, gprsize
%else
add r0mp, gprsize
%endif
@@ -639,7 +629,7 @@ h264_idct_add8_mmx2_plane
jz .try_dc
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
- add r0, [r10]
+ add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
@@ -658,7 +648,7 @@ h264_idct_add8_mmx2_plane
DC_ADD_MMX2_INIT r2, r3, r6
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
- add r0, [r10]
+ add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
@@ -674,20 +664,20 @@ h264_idct_add8_mmx2_plane
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_mmx2, 5, 7, 0
+cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
%if ARCH_X86_64
- mov r10, r0
+ mov dst2q, r0
%endif
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmx2_plane
mov r5, 32
add r2, 384
%if ARCH_X86_64
- add r10, gprsize
+ add dst2q, gprsize
%else
add r0mp, gprsize
%endif
@@ -739,7 +729,7 @@ x264_add8x4_idct_sse2:
jz .cycle%1end
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
- add r0, r10
+ add r0, r5
%else
add r0, r0m
%endif
@@ -752,9 +742,9 @@ x264_add8x4_idct_sse2:
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_sse2, 5, 5, 8
+cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
- mov r10, r0
+ mov r5, r0
%endif
; unrolling of the loop leads to an average performance gain of
; 20-25%
@@ -774,7 +764,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
jz .try%1dc
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
- add r0, r10
+ add r0, r7
%else
add r0, r0m
%endif
@@ -786,7 +776,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
jz .cycle%1end
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
- add r0, r10
+ add r0, r7
%else
add r0, r0m
%endif
@@ -799,9 +789,9 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
+cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
- mov r10, r0
+ mov r7, r0
%endif
add16intra_sse2_cycle 0, 0xc
add16intra_sse2_cycle 1, 0x14
@@ -819,7 +809,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
jz .try%1dc
%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
- add r0, [r10]
+ add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
@@ -833,7 +823,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
jz .cycle%1end
%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
- add r0, [r10]
+ add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
@@ -850,15 +840,15 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_sse2, 5, 7, 8
+cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
add r2, 512
%if ARCH_X86_64
- mov r10, r0
+ mov r7, r0
%endif
add8_sse2_cycle 0, 0x34
add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
- add r10, gprsize
+ add r7, gprsize
%else
add r0mp, gprsize
%endif
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 501c2a4da1..934a7ff633 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -29,24 +29,6 @@ SECTION_RODATA
pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32: times 4 dd 32
-scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
- db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
- db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
- db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
- db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
- db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
- db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
- db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
- db 4+11*8, 5+11*8, 4+12*8, 5+12*8
- db 6+11*8, 7+11*8, 6+12*8, 7+12*8
- db 4+13*8, 5+13*8, 4+14*8, 5+14*8
- db 6+13*8, 7+13*8, 6+14*8, 7+14*8
-
-%ifdef PIC
-%define scan8 r11
-%else
-%define scan8 scan8_mem
-%endif
SECTION .text
@@ -315,9 +297,9 @@ IDCT_ADD16INTRA_10 avx
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 1
-cglobal h264_idct_add8_10_%1,5,7,7
+cglobal h264_idct_add8_10_%1,5,8,7
%if ARCH_X86_64
- mov r10, r0
+ mov r7, r0
%endif
add r2, 1024
mov r0, [r0]
@@ -325,7 +307,7 @@ cglobal h264_idct_add8_10_%1,5,7,7
ADD16_OP_INTRA %1, 18, 4+ 7*8
add r2, 1024-128*2
%if ARCH_X86_64
- mov r0, [r10+gprsize]
+ mov r0, [r7+gprsize]
%else
mov r0, r0m
mov r0, [r0+gprsize]
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index c6b4386627..3beb3b9d6d 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -289,7 +289,7 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6
;-----------------------------------------------------------------------------
%macro H264_PRED16x16_PLANE 3
-cglobal pred16x16_plane_%3_%1, 2, 7, %2
+cglobal pred16x16_plane_%3_%1, 2, 9, %2
mov r2, r1 ; +stride
neg r1 ; -stride
@@ -349,7 +349,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
add r4, r2
%if ARCH_X86_64
-%define e_reg r11
+%define e_reg r8
%else
%define e_reg r0
%endif
@@ -370,8 +370,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx e_reg, byte [r3 ]
%if ARCH_X86_64
- movzx r10, byte [r4+r2 ]
- sub r10, e_reg
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
%else
movzx r6, byte [r4+r2 ]
sub r6, e_reg
@@ -386,7 +386,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx r6, byte [r3 ]
sub r6, r4
%if ARCH_X86_64
- lea r6, [r10+r6*2]
+ lea r6, [r7+r6*2]
lea r5, [r5+r6*2]
add r5, r6
%else
@@ -396,9 +396,9 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx r4, byte [e_reg ]
%if ARCH_X86_64
- movzx r10, byte [r3 +r2 ]
- sub r10, r4
- sub r5, r10
+ movzx r7, byte [r3 +r2 ]
+ sub r7, r4
+ sub r5, r7
%else
movzx r6, byte [r3 +r2 ]
sub r6, r4
@@ -410,7 +410,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx r6, byte [r3 +r2*2]
sub r6, r4
%if ARCH_X86_64
- add r6, r10
+ add r6, r7
%endif
lea r5, [r5+r6*8]
@@ -588,7 +588,7 @@ H264_PRED16x16_PLANE ssse3, 8, svq3
;-----------------------------------------------------------------------------
%macro H264_PRED8x8_PLANE 2
-cglobal pred8x8_plane_%1, 2, 7, %2
+cglobal pred8x8_plane_%1, 2, 9, %2
mov r2, r1 ; +stride
neg r1 ; -stride
@@ -642,7 +642,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
add r4, r2
%if ARCH_X86_64
-%define e_reg r11
+%define e_reg r8
%else
%define e_reg r0
%endif
@@ -653,9 +653,9 @@ cglobal pred8x8_plane_%1, 2, 7, %2
movzx e_reg, byte [r3 ]
%if ARCH_X86_64
- movzx r10, byte [r4+r2 ]
- sub r10, e_reg
- sub r5, r10
+ movzx r7, byte [r4+r2 ]
+ sub r7, e_reg
+ sub r5, r7
%else
movzx r6, byte [r4+r2 ]
sub r6, e_reg
@@ -667,7 +667,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
movzx r6, byte [r4+r2*2 ]
sub r6, e_reg
%if ARCH_X86_64
- add r6, r10
+ add r6, r7
%endif
lea r5, [r5+r6*4]
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index bdacf9f472..788d715d61 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -121,8 +121,8 @@ MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endmacro
%macro MCAxA_OP 8
-cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
%if ARCH_X86_32
+cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
@@ -141,17 +141,19 @@ cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
call stub_%2_h264_qpel%4_%3_10_%1
RET
%else ; ARCH_X86_64
- mov r10, r0
- mov r11, r1
+cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8
+ mov r%7, r0
+%assign p1 %7+1
+ mov r %+ p1, r1
call stub_%2_h264_qpel%4_%3_10_%1
- lea r0, [r10+%4*2]
- lea r1, [r11+%4*2]
+ lea r0, [r%7+%4*2]
+ lea r1, [r %+ p1+%4*2]
call stub_%2_h264_qpel%4_%3_10_%1
- lea r0, [r10+r2*%4]
- lea r1, [r11+r2*%4]
+ lea r0, [r%7+r2*%4]
+ lea r1, [r %+ p1+r2*%4]
call stub_%2_h264_qpel%4_%3_10_%1
- lea r0, [r10+r2*%4+%4*2]
- lea r1, [r11+r2*%4+%4*2]
+ lea r0, [r%7+r2*%4+%4*2]
+ lea r1, [r %+ p1+r2*%4+%4*2]
%if UNIX64 == 0 ; fall through to function
call stub_%2_h264_qpel%4_%3_10_%1
RET
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 1c40e49eaa..22ce72d19f 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -127,7 +127,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
-%define off_regd r11d
+%define off_regd r7d
%else
%define off_regd r3d
%endif
@@ -175,7 +175,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
%endmacro
INIT_MMX
-cglobal h264_biweight_16_mmx2, 7, 7, 0
+cglobal h264_biweight_16_mmx2, 7, 8, 0
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow
@@ -194,7 +194,7 @@ cglobal h264_biweight_16_mmx2, 7, 7, 0
REP_RET
%macro BIWEIGHT_FUNC_MM 3
-cglobal h264_biweight_%1_%3, 7, 7, %2
+cglobal h264_biweight_%1_%3, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow
@@ -215,7 +215,7 @@ INIT_XMM
BIWEIGHT_FUNC_MM 16, 8, sse2
%macro BIWEIGHT_FUNC_HALF_MM 3
-cglobal h264_biweight_%1_%3, 7, 7, %2
+cglobal h264_biweight_%1_%3, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
sar r3, 1
@@ -245,7 +245,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SSSE3_SETUP 0
%if ARCH_X86_64
-%define off_regd r11d
+%define off_regd r7d
%else
%define off_regd r3d
%endif
@@ -277,7 +277,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
%endmacro
INIT_XMM
-cglobal h264_biweight_16_ssse3, 7, 7, 8
+cglobal h264_biweight_16_ssse3, 7, 8, 8
BIWEIGHT_SSSE3_SETUP
movifnidn r3d, r3m
@@ -296,7 +296,7 @@ cglobal h264_biweight_16_ssse3, 7, 7, 8
REP_RET
INIT_XMM
-cglobal h264_biweight_8_ssse3, 7, 7, 8
+cglobal h264_biweight_8_ssse3, 7, 8, 8
BIWEIGHT_SSSE3_SETUP
movifnidn r3d, r3m
sar r3, 1
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 7db1e9c311..ea9f9a1550 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1,11 +1,12 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
+;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
@@ -95,6 +96,9 @@
default rel
%endif
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
@@ -128,18 +132,20 @@
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
-%macro DECLARE_REG 6
+%macro DECLARE_REG 5-6
%define r%1q %2
%define r%1d %3
%define r%1w %4
%define r%1b %5
- %define r%1m %6
- %ifid %6 ; i.e. it's a register
+ %if %0 == 5
+ %define r%1m %3
%define r%1mp %2
%elif ARCH_X86_64 ; memory
- %define r%1mp qword %6
+ %define r%1m [rsp + stack_offset + %6]
+ %define r%1mp qword r %+ %1m
%else
- %define r%1mp dword %6
+ %define r%1m [esp + stack_offset + %6]
+ %define r%1mp dword r %+ %1m
%endif
%define r%1 %2
%endmacro
@@ -187,7 +193,7 @@ DECLARE_REG_SIZE bp, bpl
%endrep
%endmacro
-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if ARCH_X86_64
%define gprsize 8
@@ -205,6 +211,33 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%assign stack_offset stack_offset-gprsize
%endmacro
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
%macro SUB 2
sub %1, %2
%ifidn %1, rsp
@@ -272,39 +305,34 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%if WIN64 ; Windows x64 ;=================================================
-DECLARE_REG 0, rcx, ecx, cx, cl, ecx
-DECLARE_REG 1, rdx, edx, dx, dl, edx
-DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
-DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
-%define r7m [rsp + stack_offset + 64]
-%define r8m [rsp + stack_offset + 72]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp + stack_offset + 8 + %1*8]
- %endif
-%endmacro
+DECLARE_REG 0, rcx, ecx, cx, cl
+DECLARE_REG 1, rdx, edx, dx, dl
+DECLARE_REG 2, R8, R8D, R8W, R8B
+DECLARE_REG 3, R9, R9D, R9W, R9B
+DECLARE_REG 4, R10, R10D, R10W, R10B, 40
+DECLARE_REG 5, R11, R11D, R11W, R11B, 48
+DECLARE_REG 6, rax, eax, ax, al, 56
+DECLARE_REG 7, rdi, edi, di, dil, 64
+DECLARE_REG 8, rsi, esi, si, sil, 72
+DECLARE_REG 9, rbx, ebx, bx, bl, 80
+DECLARE_REG 10, rbp, ebp, bp, bpl, 88
+DECLARE_REG 11, R12, R12D, R12W, R12B, 96
+DECLARE_REG 12, R13, R13D, R13W, R13B, 104
+DECLARE_REG 13, R14, R14D, R14W, R14B, 112
+DECLARE_REG 14, R15, R15D, R15W, R15B, 120
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
+ %assign num_args %1
%assign regs_used %2
- ASSERT regs_used <= 7
- %if regs_used > 4
- push r4
- push r5
- %assign stack_offset stack_offset+16
- %endif
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
%if mmsize == 8
%assign xmm_regs_used 0
%else
WIN64_SPILL_XMM %3
%endif
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4
%endmacro
@@ -312,12 +340,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
- sub rsp, (xmm_regs_used-6)*16+16
- %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+ SUB rsp, (xmm_regs_used-6)*16+16
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
%endrep
%endif
%endmacro
@@ -327,7 +354,7 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
%endrep
add %1, (xmm_regs_used-6)*16+16
%endif
@@ -341,15 +368,12 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
- %if regs_used > 4
- pop r5
- pop r4
- %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
ret
%endmacro
%macro REP_RET 0
- %if regs_used > 4 || xmm_regs_used > 6
+ %if regs_used > 7 || xmm_regs_used > 6
RET
%else
rep ret
@@ -358,92 +382,80 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%elif ARCH_X86_64 ; *nix x64 ;=============================================
-DECLARE_REG 0, rdi, edi, di, dil, edi
-DECLARE_REG 1, rsi, esi, si, sil, esi
-DECLARE_REG 2, rdx, edx, dx, dl, edx
-DECLARE_REG 3, rcx, ecx, cx, cl, ecx
-DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
-%define r7m [rsp + stack_offset + 16]
-%define r8m [rsp + stack_offset + 24]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp - 40 + %1*8]
- %endif
-%endmacro
+DECLARE_REG 0, rdi, edi, di, dil
+DECLARE_REG 1, rsi, esi, si, sil
+DECLARE_REG 2, rdx, edx, dx, dl
+DECLARE_REG 3, rcx, ecx, cx, cl
+DECLARE_REG 4, R8, R8D, R8W, R8B
+DECLARE_REG 5, R9, R9D, R9W, R9B
+DECLARE_REG 6, rax, eax, ax, al, 8
+DECLARE_REG 7, R10, R10D, R10W, R10B, 16
+DECLARE_REG 8, R11, R11D, R11W, R11B, 24
+DECLARE_REG 9, rbx, ebx, bx, bl, 32
+DECLARE_REG 10, rbp, ebp, bp, bpl, 40
+DECLARE_REG 11, R12, R12D, R12W, R12B, 48
+DECLARE_REG 12, R13, R13D, R13W, R13B, 56
+DECLARE_REG 13, R14, R14D, R14W, R14B, 64
+DECLARE_REG 14, R15, R15D, R15W, R15B, 72
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
- ASSERT %2 <= 7
- LOAD_IF_USED 6, %1
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4
%endmacro
%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
ret
%endmacro
%macro REP_RET 0
- rep ret
+ %if regs_used > 9
+ RET
+ %else
+ rep ret
+ %endif
%endmacro
%else ; X86_32 ;==============================================================
-DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
-DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
-DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
-DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
-DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
-DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
-DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
-%define r7m [esp + stack_offset + 32]
-%define r8m [esp + stack_offset + 36]
+DECLARE_REG 0, eax, eax, ax, al, 4
+DECLARE_REG 1, ecx, ecx, cx, cl, 8
+DECLARE_REG 2, edx, edx, dx, dl, 12
+DECLARE_REG 3, ebx, ebx, bx, bl, 16
+DECLARE_REG 4, esi, esi, si, null, 20
+DECLARE_REG 5, edi, edi, di, null, 24
+DECLARE_REG 6, ebp, ebp, bp, null, 28
%define rsp esp
-%macro PUSH_IF_USED 1 ; reg_id
- %if %1 < regs_used
- push r%1
- %assign stack_offset stack_offset+4
- %endif
-%endmacro
-
-%macro POP_IF_USED 1 ; reg_id
- %if %1 < regs_used
- pop r%1
- %endif
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
%endmacro
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [esp + stack_offset + 4 + %1*4]
- %endif
-%endmacro
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
+ %assign num_args %1
%assign regs_used %2
- ASSERT regs_used <= 7
- PUSH_IF_USED 3
- PUSH_IF_USED 4
- PUSH_IF_USED 5
- PUSH_IF_USED 6
- LOAD_IF_USED 0, %1
- LOAD_IF_USED 1, %1
- LOAD_IF_USED 2, %1
- LOAD_IF_USED 3, %1
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
DEFINE_ARGS %4
%endmacro
%macro RET 0
- POP_IF_USED 6
- POP_IF_USED 5
- POP_IF_USED 4
- POP_IF_USED 3
+ POP_IF_USED 6, 5, 4, 3
ret
%endmacro
@@ -464,8 +476,6 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endmacro
%endif
-
-
;=============================================================================
; arch-independent part
;=============================================================================
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 68dbf51b02..9b0b01253a 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -62,11 +62,11 @@ SECTION .text
%define cntr_reg fltsizeq
%define movsx mov
%else
-%define cntr_reg r11
+%define cntr_reg r7
%define movsx movsxd
%endif
-cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index f7ed45fcf3..d56e253afa 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -53,7 +53,7 @@ SECTION .text
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
-cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
+cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
@@ -245,10 +245,9 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
- push r12
-%define srcq r11
-%define pos1q r10
-%define srcendq r12
+%define srcq r8
+%define pos1q r7
+%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
@@ -388,16 +387,7 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz
add wq, 2
%endif ; %3 ==/!= X
jl .loop
-%ifnidn %3, X
REP_RET
-%else ; %3 == X
-%if ARCH_X86_64
- pop r12
- RET
-%else ; x86-32
- REP_RET
-%endif ; x86-32/64
-%endif ; %3 ==/!= X
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm