summaryrefslogtreecommitdiff
path: root/libavcodec/x86/h264_idct_10bit.asm
diff options
context:
space:
mode:
author Ronald S. Bultje <rsbultje@gmail.com> 2013-02-18 21:03:02 -0800
committer Martin Storsjö <martin@martin.st> 2013-04-10 11:03:06 +0300
commit 62844c3fd66940c7747e9b2bb7804e265319f43f (patch)
tree b0e5a05644457aa5d7598d1fefa1a41f83550753 /libavcodec/x86/h264_idct_10bit.asm
parent e8cafd2773bc56455c8816593cbd9368f2d69a80 (diff)
h264: Integrate clear_blocks calls with IDCT
The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700 to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb (in the decode_slice loop) goes from 1759 to 1733 cycles on the clip tested (cathedral), i.e. almost 30 cycles per mb faster.

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/x86/h264_idct_10bit.asm')
-rw-r--r-- libavcodec/x86/h264_idct_10bit.asm | 53
1 file changed, 48 insertions, 5 deletions
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index 51965f0f9f..4e51d2b5d0 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -66,6 +66,10 @@ SECTION .text
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
+ mova [%2+ 0], m5
+ mova [%2+16], m5
+ mova [%2+32], m5
+ mova [%2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3
@@ -98,6 +102,10 @@ add4x4_idct %+ SUFFIX:
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
+ mova [r2+ 0], m5
+ mova [r2+16], m5
+ mova [r2+32], m5
+ mova [r2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3
@@ -181,6 +189,7 @@ IDCT_ADD16_10
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
movd m0, [r1]
+ mov dword [r1], 0
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
@@ -193,11 +202,11 @@ cglobal h264_idct_dc_add_10,3,3
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
-cglobal h264_idct8_dc_add_10,3,3,7
- mov r1d, [r1]
- add r1, 32
- sar r1, 6
- movd m0, r1d
+cglobal h264_idct8_dc_add_10,3,4,7
+ movd m0, [r1]
+ mov dword[r1], 0
+ paddd m0, [pd_32]
+ psrad m0, 6
lea r1, [r2*3]
SPLATW m0, m0, 0
mova m6, [pw_pixel_max]
@@ -247,6 +256,8 @@ idct_dc_add %+ SUFFIX:
add r5, r0
movq m0, [r2+ 0]
movhps m0, [r2+64]
+ mov dword [r2+ 0], 0
+ mov dword [r2+64], 0
paddd m0, [pd_32]
psrad m0, 6
pshufhw m0, m0, 0
@@ -461,6 +472,22 @@ h264_idct8_add1_10 %+ SUFFIX:
packssdw m8, m0
paddsw m8, [r0]
pxor m0, m0
+ mova [r1+ 0], m0
+ mova [r1+ 16], m0
+ mova [r1+ 32], m0
+ mova [r1+ 48], m0
+ mova [r1+ 64], m0
+ mova [r1+ 80], m0
+ mova [r1+ 96], m0
+ mova [r1+112], m0
+ mova [r1+128], m0
+ mova [r1+144], m0
+ mova [r1+160], m0
+ mova [r1+176], m0
+ mova [r1+192], m0
+ mova [r1+208], m0
+ mova [r1+224], m0
+ mova [r1+240], m0
CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8
mova m8, [pw_pixel_max]
@@ -480,6 +507,22 @@ h264_idct8_add1_10 %+ SUFFIX:
lea r3, [r0+8]
IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2
+ mova [r1+ 0], m7
+ mova [r1+ 16], m7
+ mova [r1+ 32], m7
+ mova [r1+ 48], m7
+ mova [r1+ 64], m7
+ mova [r1+ 80], m7
+ mova [r1+ 96], m7
+ mova [r1+112], m7
+ mova [r1+128], m7
+ mova [r1+144], m7
+ mova [r1+160], m7
+ mova [r1+176], m7
+ mova [r1+192], m7
+ mova [r1+208], m7
+ mova [r1+224], m7
+ mova [r1+240], m7
%endif ; ARCH_X86_64
add rsp, pad