From 951455c1c18d54177f281dba174078e54a835361 Mon Sep 17 00:00:00 2001
From: Daniel Kang
Date: Thu, 12 Jul 2012 13:07:06 -0700
Subject: vp8: implement sliced threading

Testing gives 25-30% gain on HD clips with two threads and up to 50% gain
with eight threads.

Sliced threading uses more memory than single or frame threading.
Frame threading and single threading keep the previous memory layout.

Signed-off-by: Luca Barbato
---
 libavcodec/vp8.h | 63 +++++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

(limited to 'libavcodec/vp8.h')

diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 2f2cb80a0a..1355da4d68 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -88,10 +89,40 @@ typedef struct {
 } VP8Macroblock;
 
 typedef struct {
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+    int thread_nr;
+    int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF)
+    int wait_mb_pos; // What the current thread is waiting on.
+    uint8_t *edge_emu_buffer;
+    /**
+     * For coeff decode, we need to know whether the above block had non-zero
+     * coefficients. This means for each macroblock, we need data for 4 luma
+     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
+     * per macroblock. We keep the last row in top_nnz.
+     */
+    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
+    /**
+     * This is the index plus one of the last non-zero coeff
+     * for each of the blocks in the current macroblock.
+     *   So, 0 -> no coeffs
+     *       1 -> dc-only (special transform)
+     *       2+-> full transform
+     */
+    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
+    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
+    VP8FilterStrength *filter_strength;
+} VP8ThreadData;
+
+#define MAX_THREADS 8
+typedef struct {
+    VP8ThreadData *thread_data;
     AVCodecContext *avctx;
     AVFrame *framep[4];
     AVFrame *next_framep[4];
-    uint8_t *edge_emu_buffer;
+    AVFrame *curframe;
+    AVFrame *prev_frame;
 
     uint16_t mb_width;   /* number of horizontal MB */
     uint16_t mb_height;  /* number of vertical MB */
@@ -128,7 +159,6 @@ typedef struct {
     } filter;
 
     VP8Macroblock *macroblocks;
-    VP8FilterStrength *filter_strength;
 
     uint8_t *intra4x4_pred_mode_top;
     uint8_t intra4x4_pred_mode_left[4];
@@ -169,32 +199,10 @@ typedef struct {
         int8_t ref[4];
     } lf_delta;
 
-    /**
-     * Cache of the top row needed for intra prediction
-     * 16 for luma, 8 for each chroma plane
-     */
     uint8_t (*top_border)[16+8+8];
-
-    /**
-     * For coeff decode, we need to know whether the above block had non-zero
-     * coefficients. This means for each macroblock, we need data for 4 luma
-     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
-     * per macroblock. We keep the last row in top_nnz.
-     */
     uint8_t (*top_nnz)[9];
-    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
 
-    /**
-     * This is the index plus one of the last non-zero coeff
-     * for each of the blocks in the current macroblock.
-     *   So, 0 -> no coeffs
-     *       1 -> dc-only (special transform)
-     *       2+-> full transform
-     */
-    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
     VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
-    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
-    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
 
     /**
      * These are all of the updatable probabilities for binary decisions.
@@ -247,6 +255,13 @@ typedef struct {
     uint8_t *segmentation_maps[5];
     int num_maps_to_be_freed;
     int maps_are_invalid;
+    int num_jobs;
+    /**
+     * This describes the macroblock memory layout.
+     * 0 -> Only width+height*2+1 macroblocks allocated (frame/single thread).
+     * 1 -> Macroblocks for entire frame alloced (sliced thread).
+     */
+    int mb_layout;
 } VP8Context;
 
 #endif /* AVCODEC_VP8_H */
-- 
cgit v1.2.3