From 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Fri, 14 Jan 2011 21:34:25 +0000
Subject: H.264: split luma dc idct out and implement MMX/SSE2 versions. About
 2.5x the speed.

NOTE: the way that the asm code handles large qmuls is a bit suboptimal.
If x264-style dequant were used (separate shift and qmul values), it might
be possible to get some extra speed.

Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/x86/h264dsp_mmx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'libavcodec/x86/h264dsp_mmx.c')

diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 401a488cb5..d9e45f8b03 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
                                   int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
                                   int stride, const uint8_t nnzc[6*8]);
+void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
 
 /***********************************/
 /* deblocking */
@@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
         c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
         c->h264_idct_add8      = ff_h264_idct_add8_mmx;
         c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+        c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
@@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
             if (mm_flags&AV_CPU_FLAG_SSE2) {
                 c->h264_idct8_add = ff_h264_idct8_add_sse2;
                 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
 
                 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
-- 
cgit v1.2.3