From ef9d1d15751c6a2e4c570727c198854ce8b44603 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Fri, 10 Feb 2006 06:55:25 +0000 Subject: h264: special case dc-only idct. ~1% faster overall Originally committed as revision 4971 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dsputil.c | 2 + libavcodec/dsputil.h | 4 ++ libavcodec/h264.c | 97 ++++++++++++++++++++++++++----------------- libavcodec/h264idct.c | 25 +++++++++++ libavcodec/i386/dsputil_mmx.c | 2 + libavcodec/i386/h264dsp_mmx.c | 81 ++++++++++++++++++++++++++++++++++++ 6 files changed, 173 insertions(+), 38 deletions(-) (limited to 'libavcodec') diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index e7435a20b9..5a518f4c8c 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3851,6 +3851,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) c->h264_idct_add= ff_h264_idct_add_c; c->h264_idct8_add= ff_h264_idct8_add_c; + c->h264_idct_dc_add= ff_h264_idct_dc_add_c; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; c->get_pixels = get_pixels_c; c->diff_pixels = diff_pixels_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 5cfde2f5ef..5188bf8ee9 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -52,6 +52,8 @@ void ff_fdct_sse2(DCTELEM *block); void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); @@ -330,6 +332,8 @@ typedef struct DSPContext { void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride); void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride); } DSPContext; void dsputil_static_init(void); diff --git a/libavcodec/h264.c b/libavcodec/h264.c index cd21c26037..014d635f6b 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -3314,6 +3314,7 @@ static void hl_decode_mb(H264Context *h){ const unsigned int bottom = mb_y & 1; const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); if(!s->decode) return; @@ -3337,9 +3338,16 @@ static void hl_decode_mb(H264Context *h){ // dct_offset = s->linesize * 16; } - idct_add = transform_bypass - ? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4 - : IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add; + if(transform_bypass){ + idct_dc_add = + idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4; + }else if(IS_8x8DCT(mb_type)){ + idct_dc_add = s->dsp.h264_idct8_dc_add; + idct_add = s->dsp.h264_idct8_add; + }else{ + idct_dc_add = s->dsp.h264_idct_dc_add; + idct_add = s->dsp.h264_idct_add; + } if (IS_INTRA_PCM(mb_type)) { unsigned int x, y; @@ -3389,17 +3397,22 @@ static void hl_decode_mb(H264Context *h){ for(i=0; i<16; i+=4){ uint8_t * const ptr= dest_y + block_offset[i]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; + const int nnz = h->non_zero_count_cache[ scan8[i] ]; h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<topright_samples_available<<(i+1))&0x8000, linesize); - if(h->non_zero_count_cache[ scan8[i] ]) - idct_add(ptr, h->mb + i*16, linesize); + if(nnz){ + if(nnz == 1 && h->mb[i*16]) + idct_dc_add(ptr, h->mb + i*16, linesize); + else + idct_add(ptr, h->mb + i*16, linesize); + } } }else for(i=0; i<16; i++){ uint8_t * const ptr= dest_y + block_offset[i]; uint8_t *topright; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; - int tr; + int nnz, tr; if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ const int topright_avail= (h->topright_samples_available<pred4x4[ dir ](ptr, topright, linesize); - if(h->non_zero_count_cache[ scan8[i] ]){ - if(s->codec_id == CODEC_ID_H264) - idct_add(ptr, h->mb + i*16, linesize); - else + nnz = h->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(s->codec_id == CODEC_ID_H264){ + if(nnz == 1 && h->mb[i*16]) + idct_dc_add(ptr, h->mb + i*16, linesize); + else + idct_add(ptr, h->mb + i*16, linesize); + }else svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); } } @@ -3453,11 +3470,23 @@ static void hl_decode_mb(H264Context *h){ if(!IS_INTRA4x4(mb_type)){ if(s->codec_id == CODEC_ID_H264){ - const int di = IS_8x8DCT(mb_type) ? 4 : 1; - for(i=0; i<16; i+=di){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below - uint8_t * const ptr= dest_y + block_offset[i]; - idct_add(ptr, h->mb + i*16, linesize); + if(IS_INTRA16x16(mb_type)){ + for(i=0; i<16; i++){ + if(h->non_zero_count_cache[ scan8[i] ]) + idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); + else if(h->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); + } + }else{ + const int di = IS_8x8DCT(mb_type) ? 4 : 1; + for(i=0; i<16; i+=di){ + int nnz = h->non_zero_count_cache[ scan8[i] ]; + if(nnz){ + if(nnz==1 && h->mb[i*16]) + idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize); + else + idct_add(dest_y + block_offset[i], h->mb + i*16, linesize); + } } } }else{ @@ -3471,34 +3500,26 @@ static void hl_decode_mb(H264Context *h){ } if(!(s->flags&CODEC_FLAG_GRAY)){ - idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add; - if(!transform_bypass){ + uint8_t *dest[2] = {dest_cb, dest_cr}; + if(transform_bypass){ + idct_add = idct_dc_add = s->dsp.add_pixels4; + }else{ + idct_add = s->dsp.h264_idct_add; + idct_dc_add = s->dsp.h264_idct_dc_add; chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); } if(s->codec_id == CODEC_ID_H264){ - for(i=16; i<16+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cb + block_offset[i]; - idct_add(ptr, h->mb + i*16, uvlinesize); - } - } - for(i=20; i<20+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cr + block_offset[i]; - idct_add(ptr, h->mb + i*16, uvlinesize); - } + for(i=16; i<16+8; i++){ + if(h->non_zero_count_cache[ scan8[i] ]) + idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); + else if(h->mb[i*16]) + idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize); } }else{ - for(i=16; i<16+4; i++){ - if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cb + block_offset[i]; - svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); - } - } - for(i=20; i<20+4; i++){ + for(i=16; i<16+8; i++){ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ - uint8_t * const ptr= dest_cr + block_offset[i]; + uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); } } @@ -5131,7 +5152,7 @@ decode_intra_mb: return -1; } nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; - nnz[0] |= nnz[1] | nnz[8] | nnz[9]; + nnz[0] += nnz[1] + nnz[8] + nnz[9]; }else{ for(i4x4=0; i4x4<4; i4x4++){ const int index= i4x4 + 4*i8x8; @@ -5690,7 +5711,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n h->non_zero_count_cache[scan8[16+n]] = coeff_count; else { assert( cat == 5 ); - fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1); + fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); } for( i = coeff_count - 1; i >= 0; i-- ) { diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c index a4ddf1d51d..3e44385d5e 100644 --- a/libavcodec/h264idct.c +++ b/libavcodec/h264idct.c @@ -139,3 +139,28 @@ void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; } } + +// assumes all AC coefs are 0 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i, j; + uint8_t *cm = cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 4; j++ ) + { + for( i = 0; i < 4; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} + +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ + int i, j; + uint8_t *cm = cropTbl + MAX_NEG_CROP; + int dc = (block[0] + 32) >> 6; + for( j = 0; j < 8; j++ ) + { + for( i = 0; i < 8; i++ ) + dst[i] = cm[ dst[i] + dc ]; + dst += stride; + } +} diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 7d69859a67..54c1d2d635 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -2754,6 +2754,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #endif //CONFIG_ENCODERS c->h264_idct_add= ff_h264_idct_add_mmx2; + c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c index 44a4718e9f..6debfd9fc8 100644 --- a/libavcodec/i386/h264dsp_mmx.c +++ b/libavcodec/i386/h264dsp_mmx.c @@ -104,6 +104,87 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride) ); } +void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + asm volatile( + "movd %0, %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "pmaxsw %%mm7, %%mm0 \n\t" + "pmaxsw %%mm7, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + asm volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dst+0*stride)), + "+m"(*(uint32_t*)(dst+1*stride)), + "+m"(*(uint32_t*)(dst+2*stride)), + "+m"(*(uint32_t*)(dst+3*stride)) + ); +} + +void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + int y; + asm volatile( + "movd %0, %%mm0 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "pmaxsw %%mm7, %%mm0 \n\t" + "pmaxsw %%mm7, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + for(y=2; y--; dst += 4*stride){ + asm volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint64_t*)(dst+0*stride)), + "+m"(*(uint64_t*)(dst+1*stride)), + "+m"(*(uint64_t*)(dst+2*stride)), + "+m"(*(uint64_t*)(dst+3*stride)) + ); + } +} + /***********************************/ /* deblocking */ -- cgit v1.2.3