From bb32fded3623a20ff8999c2924315841c08c985c Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Thu, 21 Jul 2011 19:04:37 +0100 Subject: dnxhddec: optimise dnxhd_decode_dct_block() Template the function for 8/10-bit and use lowlevel bitstream macros. 6% faster overall on i7 gcc 4.5. Signed-off-by: Mans Rullgard --- libavcodec/dnxhddec.c | 104 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 37 deletions(-) (limited to 'libavcodec/dnxhddec.c') diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c index 7b32cd3ed4..426be2e985 100644 --- a/libavcodec/dnxhddec.c +++ b/libavcodec/dnxhddec.c @@ -31,7 +31,7 @@ #include "dnxhddata.h" #include "dsputil.h" -typedef struct { +typedef struct DNXHDContext { AVCodecContext *avctx; AVFrame picture; GetBitContext gb; @@ -47,11 +47,16 @@ typedef struct { ScanTable scantable; const CIDEntry *cid_table; int bit_depth; // 8, 10 or 0 if not initialized at all. + void (*decode_dct_block)(struct DNXHDContext *ctx, DCTELEM *block, + int n, int qscale); } DNXHDContext; #define DNXHD_VLC_BITS 9 #define DNXHD_DC_VLC_BITS 7 +static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, DCTELEM *block, int n, int qscale); +static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, DCTELEM *block, int n, int qscale); + static av_cold int dnxhd_decode_init(AVCodecContext *avctx) { DNXHDContext *ctx = avctx->priv_data; @@ -118,6 +123,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, const uint8_t *buf, int buf_si if (ctx->bit_depth != 10) { dsputil_init(&ctx->dsp, ctx->avctx); ctx->bit_depth = 10; + ctx->decode_dct_block = dnxhd_decode_dct_block_10; } } else { ctx->avctx->pix_fmt = PIX_FMT_YUV422P; @@ -125,6 +131,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, const uint8_t *buf, int buf_si if (ctx->bit_depth != 8) { dsputil_init(&ctx->dsp, ctx->avctx); ctx->bit_depth = 8; + ctx->decode_dct_block = dnxhd_decode_dct_block_8; } } @@ -165,71 +172,94 @@ static int dnxhd_decode_header(DNXHDContext *ctx, const uint8_t *buf, int buf_si return 0; } -static int dnxhd_decode_dc(DNXHDContext *ctx) -{ - int len; - - len = get_vlc2(&ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1); - return len ? get_xbits(&ctx->gb, len) : 0; -} - -static void dnxhd_decode_dct_block(DNXHDContext *ctx, DCTELEM *block, int n, int qscale) +static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx, + DCTELEM *block, int n, + int qscale, + int index_bits, + int level_bias, + int level_shift) { - int i, j, index, index2; + int i, j, index1, index2, len; int level, component, sign; - const uint8_t *weigth_matrix; + const uint8_t *weight_matrix; + OPEN_READER(bs, &ctx->gb); if (n&2) { component = 1 + (n&1); - weigth_matrix = ctx->cid_table->chroma_weight; + weight_matrix = ctx->cid_table->chroma_weight; } else { component = 0; - weigth_matrix = ctx->cid_table->luma_weight; + weight_matrix = ctx->cid_table->luma_weight; } - ctx->last_dc[component] += dnxhd_decode_dc(ctx); + UPDATE_CACHE(bs, &ctx->gb); + GET_VLC(len, bs, &ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1); + if (len) { + level = GET_CACHE(bs, &ctx->gb); + LAST_SKIP_BITS(bs, &ctx->gb, len); + sign = ~level >> 31; + level = (NEG_USR32(sign ^ level, len) ^ sign) - sign; + ctx->last_dc[component] += level; + } block[0] = ctx->last_dc[component]; //av_log(ctx->avctx, AV_LOG_DEBUG, "dc %d\n", block[0]); + for (i = 1; ; i++) { - index = get_vlc2(&ctx->gb, ctx->ac_vlc.table, DNXHD_VLC_BITS, 2); - //av_log(ctx->avctx, AV_LOG_DEBUG, "index %d\n", index); - level = ctx->cid_table->ac_level[index]; + UPDATE_CACHE(bs, &ctx->gb); + GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table, + DNXHD_VLC_BITS, 2); + //av_log(ctx->avctx, AV_LOG_DEBUG, "index %d\n", index1); + level = ctx->cid_table->ac_level[index1]; if (!level) { /* EOB */ //av_log(ctx->avctx, AV_LOG_DEBUG, "EOB\n"); - return; + break; } - sign = get_sbits(&ctx->gb, 1); - if (ctx->cid_table->ac_index_flag[index]) { - level += get_bits(&ctx->gb, ctx->cid_table->index_bits)<<6; + sign = SHOW_SBITS(bs, &ctx->gb, 1); + SKIP_BITS(bs, &ctx->gb, 1); + + if (ctx->cid_table->ac_index_flag[index1]) { + level += SHOW_UBITS(bs, &ctx->gb, index_bits) << 6; + SKIP_BITS(bs, &ctx->gb, index_bits); } - if (ctx->cid_table->ac_run_flag[index]) { - index2 = get_vlc2(&ctx->gb, ctx->run_vlc.table, DNXHD_VLC_BITS, 2); + if (ctx->cid_table->ac_run_flag[index1]) { + UPDATE_CACHE(bs, &ctx->gb); + GET_VLC(index2, bs, &ctx->gb, ctx->run_vlc.table, + DNXHD_VLC_BITS, 2); i += ctx->cid_table->run[index2]; } if (i > 63) { av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i); - return; + break; } j = ctx->scantable.permutated[i]; //av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j); - //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weigth %d\n", level, weigth_matrix[i]); - level = (2*level+1) * qscale * weigth_matrix[i]; - if (ctx->bit_depth == 10) { - if (weigth_matrix[i] != 8) - level += 8; - level >>= 4; - } else { - if (weigth_matrix[i] != 32) - level += 32; - level >>= 6; - } + //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weight %d\n", level, weight_matrix[i]); + level = (2*level+1) * qscale * weight_matrix[i]; + if (weight_matrix[i] != level_bias) + level += level_bias; + level >>= level_shift; + //av_log(NULL, AV_LOG_DEBUG, "i %d, j %d, end level %d\n", i, j, level); block[j] = (level^sign) - sign; } + + CLOSE_READER(bs, &ctx->gb); +} + +static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, DCTELEM *block, + int n, int qscale) +{ + dnxhd_decode_dct_block(ctx, block, n, qscale, 4, 32, 6); +} + +static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, DCTELEM *block, + int n, int qscale) +{ + dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 8, 4); } static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y) @@ -247,7 +277,7 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y) for (i = 0; i < 8; i++) { ctx->dsp.clear_block(ctx->blocks[i]); - dnxhd_decode_dct_block(ctx, ctx->blocks[i], i, qscale); + ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale); } if (ctx->picture.interlaced_frame) { -- cgit v1.2.3