summary | refs | log | tree | commit | diff
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com>, 2014-12-15 20:40:48 -0500
committer: Michael Niedermayer <michaelni@gmx.at>, 2014-12-16 02:51:26 +0100
commit: 0a7964dca5e52536c05a72987c3d7dbb12add942 (patch)
tree: 1eaadb04dc2a232aa372b7ca0475437d43ce87af /libavcodec/x86
parent: 1a86e29ff0bcd5ecbd97057418f7f7dd814c5d9c (diff)
vp9/x86: save one register on 32bit idct32x32.
Fixes build on win32. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/vp9itxfm.asm | 51
1 file changed, 48 insertions, 3 deletions
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 908040c054..64859a061d 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -2526,7 +2526,8 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
-cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride, block, eob
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
+ movifnidn eobd, dword eobm
%if cpuflag(ssse3)
cmp eobd, 135
jg .idctfull
@@ -2540,6 +2541,9 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
%endif
; dc-only case
+ movifnidn blockq, blockmp
+ movifnidn dstq, dstmp
+ movifnidn strideq, stridemp
%if cpuflag(ssse3)
movd m0, [blockq]
mova m1, [pw_11585x2]
@@ -2572,15 +2576,22 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
%if ARCH_X86_64
DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
%else
- DEFINE_ARGS dst, stride, block, stride30, dst_end, stride2, tmp
-%define cntd dword r4m
%define dst_bakq r0mp
%endif
%if cpuflag(ssse3)
.idct8x8:
+%if ARCH_X86_32
+ DEFINE_ARGS block, u1, u2, u3, u4, tmp
+ mov blockq, r2mp
+%endif
mov tmpq, rsp
VP9_IDCT32_1D blockq, 1, 8
+%if ARCH_X86_32
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
shl stride30q, 5 ; stride*32
@@ -2597,10 +2608,18 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
ZERO_BLOCK blockq, 64, 8, m1
RET
.idct16x16:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
mov cntd, 2
mov tmpq, rsp
.loop1_16x16:
@@ -2609,7 +2628,14 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
add tmpq, 512
dec cntd
jg .loop1_16x16
+
+%if ARCH_X86_64
sub blockq, 32
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
@@ -2628,11 +2654,19 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
ZERO_BLOCK blockq, 64, 16, m1
RET
%endif
.idctfull:
+%if ARCH_X86_32
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
mov cntd, 4
mov tmpq, rsp
.loop1_full:
@@ -2641,7 +2675,14 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
add tmpq, 512
dec cntd
jg .loop1_full
+
+%if ARCH_X86_64
sub blockq, 64
+%else
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
@@ -2660,6 +2701,10 @@ cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride,
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
ZERO_BLOCK blockq, 64, 32, m1
RET
%endmacro