diff options
author | Jason Garrett-Glaser <darkshikari@gmail.com> | 2011-01-14 21:34:25 +0000 |
---|---|---|
committer | Jason Garrett-Glaser <darkshikari@gmail.com> | 2011-01-14 21:34:25 +0000 |
commit | 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b (patch) | |
tree | 220be84d79d9c771c1afeab43fdd2aaa82fea01d /libavcodec/x86/h264dsp_mmx.c | |
parent | 6c18f1cda2e2b2471ebf75d30d552cb0cb61b6ad (diff) |
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed.
NOTE: the way that the asm code handles large qmuls is a bit suboptimal.
If x264-style dequant was used (separate shift and qmul values), it might
be possible to get some extra speed.
Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/h264dsp_mmx.c')
-rw-r--r-- | libavcodec/x86/h264dsp_mmx.c | 4 |
1 files changed, 4 insertions, 0 deletions
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 401a488cb5..d9e45f8b03 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM int stride, const uint8_t nnzc[6*8]); void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); +void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); /***********************************/ /* deblocking */ @@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; c->h264_idct_add8 = ff_h264_idct_add8_mmx; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; if (mm_flags & AV_CPU_FLAG_MMX2) { c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; @@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) if (mm_flags&AV_CPU_FLAG_SSE2) { c->h264_idct8_add = ff_h264_idct8_add_sse2; c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; |