summary | refs | log | tree | commit | diff
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: Jason Garrett-Glaser <darkshikari@gmail.com>, 2011-01-14 21:34:25 +0000
committer: Jason Garrett-Glaser <darkshikari@gmail.com>, 2011-01-14 21:34:25 +0000
commit: 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b (patch)
tree: 220be84d79d9c771c1afeab43fdd2aaa82fea01d /libavcodec/x86
parent: 6c18f1cda2e2b2471ebf75d30d552cb0cb61b6ad (diff)
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed. NOTE: the way the asm code handles large qmuls is a bit suboptimal. If x264-style dequant were used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/dsputil_mmx.c 1
-rw-r--r-- libavcodec/x86/h264_idct.asm 154
-rw-r--r-- libavcodec/x86/h264dsp_mmx.c 4
3 files changed, 159 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 909ec414e7..375a4c5e09 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; /* four 16-bit words of 1; presumably the symbol behind `cextern pw_1` in h264_idct.asm */
DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 9c154f80b3..fdb35003a8 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
%endif
cextern pw_32
+cextern pw_1 ; constant loaded by DEQUANT_MMX / DEQUANT_STORE_SSE2 and interleaved with the coefficients before pmaddwd
SECTION .text
@@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
add8_sse2_cycle 2, 0x21
add8_sse2_cycle 3, 0x29
RET
+
+;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
+
+%macro WALSH4_1D 5 ; one 1-D pass over four registers. %1-%4: register indices (used as m%N), %5: register index forwarded as the last SUMSUB_BADC operand
+ SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 ; SUMSUB_BADC is defined outside this section; presumably two sum/difference butterflies - TODO confirm
+ SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 ; second stage: same helper with the two middle operands exchanged
+ SWAP %1, %4, %3 ; permute the register names %1/%4/%3 (SWAP is defined elsewhere; presumably a rename with no data movement - confirm)
+%endmacro
+
+%macro DEQUANT_MMX 3 ; %1, %2: mm registers holding four 16-bit coefficients each, rewritten in place; %3: right-shift count; reads t3d; clobbers m4, m5, m7
+ mova m7, [pw_1] ; m7 = the pw_1 constant
+ mova m4, %1 ; copy %1 so its upper two words survive the unpack below
+ punpcklwd %1, m7 ; %1 = lower two coefficients, each followed by a word taken from pw_1
+ punpckhwd m4, m7 ; m4 = upper two coefficients, each followed by a word taken from pw_1
+ mova m5, %2 ; same split for %2, with m5 receiving the upper half
+ punpcklwd %2, m7
+ punpckhwd m5, m7
+ movd m7, t3d ; m7 low dword = t3d (the caller packs the multiplier in the low word and a rounding term in the high word)
+ punpckldq m7, m7 ; replicate that dword into both halves of m7
+ pmaddwd %1, m7 ; per dword: coefficient * low word of t3d + pw_1 word * high word of t3d (operands are signed 16-bit)
+ pmaddwd %2, m7
+ pmaddwd m4, m7
+ pmaddwd m5, m7
+ psrad %1, %3 ; arithmetic right shift of every 32-bit result by %3
+ psrad %2, %3
+ psrad m4, %3
+ psrad m5, %3
+ packssdw %1, m4 ; narrow back to four words per register with signed saturation
+ packssdw %2, m5
+%endmacro
+
+%macro STORE_WORDS_MMX 5 ; %1: mm register (destroyed); %2-%5: slot numbers for its words 0-3, each stored at byte offset slot*32 from t2; clobbers t0, t1
+ movd t0d, %1 ; t0d = words 0 and 1
+ psrlq %1, 32 ; bring words 2 and 3 down to the low dword
+ movd t1d, %1 ; t1d = words 2 and 3
+ mov [t2+%2*32], t0w ; word 0
+ mov [t2+%4*32], t1w ; word 2
+ shr t0d, 16 ; expose the upper word of each pair
+ shr t1d, 16
+ mov [t2+%3*32], t0w ; word 1
+ mov [t2+%5*32], t1w ; word 3
+%endmacro
+
+%macro DEQUANT_STORE_MMX 1 ; %1: shift count handed to DEQUANT_MMX; dequantizes m0-m3 and scatters their 16 words to slots 0-15 (slot*32 bytes from t2)
+ DEQUANT_MMX m0, m1, %1
+ STORE_WORDS_MMX m0, 0, 1, 4, 5 ; words of m0 go to slots 0, 1, 4, 5
+ STORE_WORDS_MMX m1, 2, 3, 6, 7 ; words of m1 go to slots 2, 3, 6, 7
+
+ DEQUANT_MMX m2, m3, %1
+ STORE_WORDS_MMX m2, 8, 9, 12, 13 ; words of m2 go to slots 8, 9, 12, 13
+ STORE_WORDS_MMX m3, 10, 11, 14, 15 ; words of m3 go to slots 10, 11, 14, 15
+%endmacro
+
+%macro STORE_WORDS_SSE 9 ; %1: xmm register (destroyed); %2-%9: slot numbers for its words 0-7, each stored at byte offset slot*32 from t2; clobbers t0, t1
+ movd t0d, %1 ; t0d = words 0 and 1
+ psrldq %1, 4 ; shift the register right by one dword
+ movd t1d, %1 ; t1d = words 2 and 3
+ psrldq %1, 4 ; words 4 and 5 are now in the low dword, ready for the second half
+ mov [t2+%2*32], t0w ; word 0
+ mov [t2+%4*32], t1w ; word 2
+ shr t0d, 16 ; expose the upper word of each pair
+ shr t1d, 16
+ mov [t2+%3*32], t0w ; word 1
+ mov [t2+%5*32], t1w ; word 3
+ movd t0d, %1 ; t0d = words 4 and 5
+ psrldq %1, 4
+ movd t1d, %1 ; t1d = words 6 and 7
+ mov [t2+%6*32], t0w ; word 4
+ mov [t2+%8*32], t1w ; word 6
+ shr t0d, 16
+ shr t1d, 16
+ mov [t2+%7*32], t0w ; word 5
+ mov [t2+%9*32], t1w ; word 7
+%endmacro
+
+%macro DEQUANT_STORE_SSE2 1 ; %1: right-shift count; dequantizes the 16 words held in m0-m3 and scatters them to slots 0-15 from t2; reads t3d; clobbers xmm0-xmm5, t0, t1
+ movd xmm4, t3d ; xmm4 low dword = t3d (multiplier in the low word, rounding term in the high word)
+ movq xmm5, [pw_1] ; xmm5 low qword = the pw_1 constant
+ pshufd xmm4, xmm4, 0 ; broadcast t3d to all four dwords
+ movq2dq xmm0, m0 ; copy each mm register into the low half of an xmm register
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+ punpcklwd xmm0, xmm5 ; pair each of the four coefficients with a word from pw_1, giving four dwords per register
+ punpcklwd xmm1, xmm5
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm5
+ pmaddwd xmm0, xmm4 ; per dword: coefficient * low word of t3d + pw_1 word * high word of t3d (operands are signed 16-bit)
+ pmaddwd xmm1, xmm4
+ pmaddwd xmm2, xmm4
+ pmaddwd xmm3, xmm4
+ psrad xmm0, %1 ; arithmetic right shift of every 32-bit result by %1
+ psrad xmm1, %1
+ psrad xmm2, %1
+ psrad xmm3, %1
+ packssdw xmm0, xmm1 ; xmm0 = results from m0 in words 0-3 and from m1 in words 4-7, with signed saturation
+ packssdw xmm2, xmm3 ; xmm2 = results from m2 in words 0-3 and from m3 in words 4-7
+ STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7 ; same slot layout as DEQUANT_STORE_MMX uses for m0 and m1
+ STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15 ; same slot layout as DEQUANT_STORE_MMX uses for m2 and m3
+%endmacro
+
+%macro IDCT_DC_DEQUANT 2 ; emits h264_luma_dc_dequant_idct_<%1>. %1: name suffix, mmx or sse2 (selects the dequant/store macro); %2: third cglobal operand (XMM register count)
+cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2 ; r0 = output, r1 = input, r2 = qmul, following the prototype comment above these macros
+ movq m3, [r1+24] ; load the 16 input words as four qwords
+ movq m2, [r1+16]
+ movq m1, [r1+ 8]
+ movq m0, [r1+ 0]
+ WALSH4_1D 0,1,2,3,4 ; first 1-D pass over m0-m3, with m4 passed as the fifth operand
+ TRANSPOSE4x4W 0,1,2,3,4 ; TRANSPOSE4x4W is defined outside this section; presumably a 4x4 word transpose of m0-m3 - confirm
+ WALSH4_1D 0,1,2,3,4 ; second 1-D pass
+
+; register roles after DECLARE_REG_TMP: t0 = shift, t1 = tmp, t2 = output, t3 = qmul
+%ifdef WIN64
+ DECLARE_REG_TMP 0,3,1,2
+ ; we can't avoid this, because r0 is the shift register (ecx) on win64
+ xchg r0, t2 ; output pointer moves into t2 (r1); the input pointer that was in r1 has already been consumed by the loads
+%elifdef ARCH_X86_64
+ DECLARE_REG_TMP 3,1,0,2
+%else
+ DECLARE_REG_TMP 1,3,0,2
+%endif
+
+ cmp t3d, 32767 ; pmaddwd treats the multiplier as a signed word, so larger qmul values take the .big_qmul path
+ jg .big_qmul
+ add t3d, 128 << 16 ; put 128 in the high word; pmaddwd adds it to each product as the rounding term for the shift by 8
+%ifidn %1,mmx
+ DEQUANT_STORE_MMX 8 ; fixed shift of 8
+%else
+ DEQUANT_STORE_SSE2 8 ; fixed shift of 8
+%endif
+ RET
+.big_qmul:
+ bsr t0d, t3d ; t0d = index of the highest set bit of qmul
+ add t3d, 128 << 16 ; same packed rounding term as the fast path
+ mov t1d, 7
+ cmp t0d, t1d
+ cmovg t0d, t1d ; t0d = min(t0d, 7). NOTE(review): t3d > 32767 on this path, so bsr >= 15 and this always yields 7 - confirm intent. NOTE(review): cmov is not implied by MMX - confirm the mmx variant is only selected on CPUs with CMOV
+ inc t1d ; t1d = 8
+ shr t3d, t0b ; pre-shift the multiplier and the rounding term together by t0 bits
+ sub t1d, t0d ; remaining shift = 8 - t0
+%ifidn %1,mmx
+ movd m6, t1d ; the shift count goes in a register because it is no longer a constant
+ DEQUANT_STORE_MMX m6
+%else
+ movd xmm6, t1d ; the shift count goes in a register because it is no longer a constant
+ DEQUANT_STORE_SSE2 xmm6
+%endif
+ RET
+%endmacro
+
+INIT_MMX ; both variants are assembled with mN naming MMX registers; the sse2 variant names its xmm registers explicitly
+IDCT_DC_DEQUANT mmx, 0
+IDCT_DC_DEQUANT sse2, 7 ; NOTE(review): xmm6 is written in .big_qmul - confirm cglobal still preserves it on Win64 while INIT_MMX is active
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 401a488cb5..d9e45f8b03 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
int stride, const uint8_t nnzc[6*8]);
+void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); /* asm, h264_idct.asm: IDCT_DC_DEQUANT mmx */
+void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); /* asm, h264_idct.asm: IDCT_DC_DEQUANT sse2 */
/***********************************/
/* deblocking */
@@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
c->h264_idct_add8 = ff_h264_idct_add8_mmx;
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+ c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
@@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
if (mm_flags&AV_CPU_FLAG_SSE2) {
c->h264_idct8_add = ff_h264_idct8_add_sse2;
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+ c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;