summary | refs | log | tree | commit | diff
path: root/libavcodec/vp9.c
diff options
context:
space:
mode:
author Ilia Valiakhmetov <zakne0ne@gmail.com> 2017-09-08 04:02:49 +0700
committer Ronald S. Bultje <rsbultje@gmail.com> 2017-09-08 10:25:40 -0400
commit e59da0f7ff129d570adb72c6479f7ce07cf5a0f9 (patch)
tree cbcc6765a25e8ef9076e1fd00d147e7a886d885e /libavcodec/vp9.c
parent 83c12fefd22fc2326a000019e5c1a33e90a874e8 (diff)
avcodec/vp9: Add tile threading support
Signed-off-by: Ilia Valiakhmetov <zakne0ne@gmail.com> Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'libavcodec/vp9.c')
-rw-r--r-- libavcodec/vp9.c 665
1 files changed, 475 insertions, 190 deletions
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 94430db9a3..a71045e081 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -36,6 +36,64 @@
#define VP9_SYNCCODE 0x498342
+#if HAVE_THREADS
+/**
+ * Release the per-superblock-row progress counters used for tile threading,
+ * together with the mutex and condition variable that guard them.
+ *
+ * A no-op unless slice threading is active and the state was actually set up
+ * by a previous vp9_alloc_entries() call.
+ */
+static void vp9_free_entries(AVCodecContext *avctx) {
+    VP9Context *s = avctx->priv_data;
+
+    if (!(avctx->active_thread_type & FF_THREAD_SLICE))
+        return;
+
+    // progress_mutex/progress_cond are only initialized by
+    // vp9_alloc_entries(), and only after s->entries has been allocated.
+    // This function is also reached before any allocation happened (from the
+    // header parser and from close), so use s->entries as the "initialized"
+    // marker instead of destroying objects that were never set up.
+    if (!s->entries)
+        return;
+
+    pthread_mutex_destroy(&s->progress_mutex);
+    pthread_cond_destroy(&s->progress_cond);
+    av_freep(&s->entries);
+}
+
+/**
+ * Allocate and zero n progress counters (one per superblock row) and set up
+ * the mutex/condition variable used to wait on them.
+ *
+ * Only does work when slice threading is active; otherwise returns 0 without
+ * touching any state.
+ *
+ * @param avctx codec context whose priv_data is the VP9Context
+ * @param n     number of counters to allocate
+ * @return 0 on success, AVERROR(ENOMEM) if the counters cannot be allocated,
+ *         or the negated pthread error code if the mutex or condition
+ *         variable cannot be initialized (s->entries is NULL in every
+ *         failure case)
+ */
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) {
+    VP9Context *s = avctx->priv_data;
+    int i, ret;
+
+    if (!(avctx->active_thread_type & FF_THREAD_SLICE))
+        return 0;
+
+    // av_freep() accepts a NULL pointer, so no guard is needed here
+    av_freep(&s->entries);
+
+    s->entries = av_malloc_array(n, sizeof(atomic_int));
+    if (!s->entries)
+        return AVERROR(ENOMEM);
+
+    for (i = 0; i < n; i++)
+        atomic_init(&s->entries[i], 0);
+
+    // pthread init functions return a positive error number on failure;
+    // unwind so that a non-NULL s->entries always implies usable sync objects
+    ret = pthread_mutex_init(&s->progress_mutex, NULL);
+    if (ret) {
+        av_freep(&s->entries);
+        return AVERROR(ret);
+    }
+
+    ret = pthread_cond_init(&s->progress_cond, NULL);
+    if (ret) {
+        pthread_mutex_destroy(&s->progress_mutex);
+        av_freep(&s->entries);
+        return AVERROR(ret);
+    }
+
+    return 0;
+}
+
+/**
+ * Add n to the progress counter of superblock row `field` and signal the
+ * progress condition variable.
+ *
+ * @param s     decoder context owning the counters
+ * @param field index into s->entries (superblock row)
+ * @param n     amount to add to the counter
+ */
+static void vp9_report_tile_progress(VP9Context *s, int field, int n) {
+    pthread_mutex_lock(&s->progress_mutex);
+    // Release ordering is required: vp9_await_tile_progress() has a fast path
+    // that performs only an acquire load without taking the mutex. A relaxed
+    // increment would not order this thread's earlier writes before the
+    // counter update as seen by that reader.
+    atomic_fetch_add_explicit(&s->entries[field], n, memory_order_release);
+    pthread_cond_signal(&s->progress_cond);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+
+/**
+ * Block until the progress counter of superblock row `field` has reached n.
+ *
+ * @param s     decoder context owning the counters
+ * @param field index into s->entries (superblock row)
+ * @param n     counter value to wait for
+ */
+static void vp9_await_tile_progress(VP9Context *s, int field, int n) {
+    // fast path: target already reached, no need to take the mutex
+    if (atomic_load_explicit(&s->entries[field], memory_order_acquire) >= n)
+        return;
+
+    pthread_mutex_lock(&s->progress_mutex);
+    // Use the same "reached n" predicate as the fast path above. Waiting on
+    // "!= n" would block forever if the counter ever moved past n. The
+    // relaxed load is sufficient here because the counter is only modified
+    // with progress_mutex held.
+    while (atomic_load_explicit(&s->entries[field], memory_order_relaxed) < n)
+        pthread_cond_wait(&s->progress_cond, &s->progress_mutex);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+#else
+// No-op fallbacks for builds without thread support. The signatures must
+// match the HAVE_THREADS variants above, since the callers are shared:
+// vp9_free_entries() is invoked with the AVCodecContext, not the VP9Context.
+static void vp9_free_entries(AVCodecContext *avctx) {}
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) { return 0; }
+static void vp9_report_tile_progress(VP9Context *s, int field, int n) {}
+static void vp9_await_tile_progress(VP9Context *s, int field, int n) {}
+#endif
+
static void vp9_frame_unref(AVCodecContext *avctx, VP9Frame *f)
{
ff_thread_release_buffer(avctx, &f->tf);
@@ -118,6 +176,7 @@ static int update_size(AVCodecContext *avctx, int w, int h)
VP9Context *s = avctx->priv_data;
uint8_t *p;
int bytesperpixel = s->bytesperpixel, ret, cols, rows;
+ int lflvl_len, i;
av_assert0(w > 0 && h > 0);
@@ -170,13 +229,14 @@ static int update_size(AVCodecContext *avctx, int w, int h)
s->sb_rows = (h + 63) >> 6;
s->cols = (w + 7) >> 3;
s->rows = (h + 7) >> 3;
+ lflvl_len = avctx->active_thread_type == FF_THREAD_SLICE ? s->sb_rows : 1;
#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
av_freep(&s->intra_pred_data[0]);
// FIXME we slightly over-allocate here for subsampled chroma, but a little
// bit of padding shouldn't affect performance...
p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
- sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+ lflvl_len * sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
if (!p)
return AVERROR(ENOMEM);
assign(s->intra_pred_data[0], uint8_t *, 64 * bytesperpixel);
@@ -195,12 +255,15 @@ static int update_size(AVCodecContext *avctx, int w, int h)
assign(s->above_comp_ctx, uint8_t *, 8);
assign(s->above_ref_ctx, uint8_t *, 8);
assign(s->above_filter_ctx, uint8_t *, 8);
- assign(s->lflvl, VP9Filter *, 1);
+ assign(s->lflvl, VP9Filter *, lflvl_len);
#undef assign
- // these will be re-allocated a little later
- av_freep(&s->b_base);
- av_freep(&s->block_base);
+ if (s->td) {
+ for (i = 0; i < s->active_tile_cols; i++) {
+ av_freep(&s->td[i].b_base);
+ av_freep(&s->td[i].block_base);
+ }
+ }
if (s->s.h.bpp != s->last_bpp) {
ff_vp9dsp_init(&s->dsp, s->s.h.bpp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
@@ -213,40 +276,50 @@ static int update_size(AVCodecContext *avctx, int w, int h)
static int update_block_buffers(AVCodecContext *avctx)
{
+ int i;
VP9Context *s = avctx->priv_data;
int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
+ VP9TileData *td = &s->td[0];
- if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
+ if (td->b_base && td->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
return 0;
- av_free(s->b_base);
- av_free(s->block_base);
+ av_free(td->b_base);
+ av_free(td->block_base);
chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
chroma_eobs = 16 * 16 >> (s->ss_h + s->ss_v);
if (s->s.frames[CUR_FRAME].uses_2pass) {
int sbs = s->sb_cols * s->sb_rows;
- s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
- s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+ td->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
+ td->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
16 * 16 + 2 * chroma_eobs) * sbs);
- if (!s->b_base || !s->block_base)
+ if (!td->b_base || !td->block_base)
return AVERROR(ENOMEM);
- s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
- s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
- s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
- s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
- s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
+ td->uvblock_base[0] = td->block_base + sbs * 64 * 64 * bytesperpixel;
+ td->uvblock_base[1] = td->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
+ td->eob_base = (uint8_t *) (td->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
+ td->uveob_base[0] = td->eob_base + 16 * 16 * sbs;
+ td->uveob_base[1] = td->uveob_base[0] + chroma_eobs * sbs;
} else {
- s->b_base = av_malloc(sizeof(VP9Block));
- s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
- 16 * 16 + 2 * chroma_eobs);
- if (!s->b_base || !s->block_base)
- return AVERROR(ENOMEM);
- s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
- s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
- s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
- s->uveob_base[0] = s->eob_base + 16 * 16;
- s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
+ for (i = 1; i < s->active_tile_cols; i++) {
+ if (s->td[i].b_base && s->td[i].block_base) {
+ av_free(s->td[i].b_base);
+ av_free(s->td[i].block_base);
+ }
+ }
+ for (i = 0; i < s->active_tile_cols; i++) {
+ s->td[i].b_base = av_malloc(sizeof(VP9Block));
+ s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+ 16 * 16 + 2 * chroma_eobs);
+ if (!s->td[i].b_base || !s->td[i].block_base)
+ return AVERROR(ENOMEM);
+ s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel;
+ s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel;
+ s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel);
+ s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16;
+ s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs;
+ }
}
s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
@@ -543,8 +616,20 @@ static int decode_frame_header(AVCodecContext *avctx,
sharp = get_bits(&s->gb, 3);
// if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
// the old cache values since they are still valid
- if (s->s.h.filter.sharpness != sharp)
- memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
+ if (s->s.h.filter.sharpness != sharp) {
+ for (i = 1; i <= 63; i++) {
+ int limit = i;
+
+ if (sharp > 0) {
+ limit >>= (sharp + 3) >> 2;
+ limit = FFMIN(limit, 9 - sharp);
+ }
+ limit = FFMAX(limit, 1);
+
+ s->filter_lut.lim_lut[i] = limit;
+ s->filter_lut.mblim_lut[i] = 2 * (i + 2) + limit;
+ }
+ }
s->s.h.filter.sharpness = sharp;
if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
@@ -662,12 +747,36 @@ static int decode_frame_header(AVCodecContext *avctx,
s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
+ int n_range_coders;
+ VP56RangeCoder *rc;
+
+ if (s->td) {
+ for (i = 0; i < s->active_tile_cols; i++) {
+ av_free(s->td[i].b_base);
+ av_free(s->td[i].block_base);
+ }
+ av_free(s->td);
+ }
+
s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
- s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
- sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
- if (!s->c_b) {
- av_log(avctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
+ vp9_free_entries(avctx);
+ s->active_tile_cols = avctx->active_thread_type == FF_THREAD_SLICE ?
+ s->s.h.tiling.tile_cols : 1;
+ vp9_alloc_entries(avctx, s->sb_rows);
+ if (avctx->active_thread_type == FF_THREAD_SLICE) {
+ n_range_coders = 4; // max_tile_rows
+ } else {
+ n_range_coders = s->s.h.tiling.tile_cols;
+ }
+ s->td = av_mallocz_array(s->active_tile_cols, sizeof(VP9TileData) +
+ n_range_coders * sizeof(VP56RangeCoder));
+ if (!s->td)
return AVERROR(ENOMEM);
+ rc = (VP56RangeCoder *) &s->td[s->active_tile_cols];
+ for (i = 0; i < s->active_tile_cols; i++) {
+ s->td[i].s = s;
+ s->td[i].c_b = rc;
+ rc += n_range_coders;
}
}
@@ -735,12 +844,15 @@ static int decode_frame_header(AVCodecContext *avctx,
return AVERROR_INVALIDDATA;
}
- if (s->s.h.keyframe || s->s.h.intraonly) {
- memset(s->counts.coef, 0, sizeof(s->counts.coef));
- memset(s->counts.eob, 0, sizeof(s->counts.eob));
- } else {
- memset(&s->counts, 0, sizeof(s->counts));
+ for (i = 0; i < s->active_tile_cols; i++) {
+ if (s->s.h.keyframe || s->s.h.intraonly) {
+ memset(s->td[i].counts.coef, 0, sizeof(s->td[0].counts.coef));
+ memset(s->td[i].counts.eob, 0, sizeof(s->td[0].counts.eob));
+ } else {
+ memset(&s->td[i].counts, 0, sizeof(s->td[0].counts));
+ }
}
+
/* FIXME is it faster to not copy here, but do it down in the fw updates
* as explicit copies if the fw update is missing (and skip the copy upon
* fw update)? */
@@ -789,7 +901,7 @@ static int decode_frame_header(AVCodecContext *avctx,
else
p[n] = r[n];
}
- p[3] = 0;
+ memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
}
} else {
for (j = 0; j < 2; j++)
@@ -801,7 +913,7 @@ static int decode_frame_header(AVCodecContext *avctx,
if (m > 3 && l == 0) // dc only has 3 pt
break;
memcpy(p, r, 3);
- p[3] = 0;
+ memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
}
}
if (s->s.h.txfmmode == i)
@@ -929,12 +1041,12 @@ static int decode_frame_header(AVCodecContext *avctx,
return (data2 - data) + size2;
}
-static void decode_sb(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
+static void decode_sb(VP9TileData *td, int row, int col, VP9Filter *lflvl,
ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
- VP9Context *s = avctx->priv_data;
+ const VP9Context *s = td->s;
int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
- (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
+ (((td->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? ff_vp9_default_kf_partition_probs[bl][c] :
s->prob.p.partition[bl][c];
enum BlockPartition bp;
@@ -944,75 +1056,75 @@ static void decode_sb(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
int bytesperpixel = s->bytesperpixel;
if (bl == BL_8X8) {
- bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+ bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
} else if (col + hbs < s->cols) { // FIXME why not <=?
if (row + hbs < s->rows) { // FIXME why not <=?
- bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+ bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
switch (bp) {
case PARTITION_NONE:
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
break;
case PARTITION_H:
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
yoff += hbs * 8 * y_stride;
uvoff += hbs * 8 * uv_stride >> s->ss_v;
- ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
break;
case PARTITION_V:
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
yoff += hbs * 8 * bytesperpixel;
uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
- ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
break;
case PARTITION_SPLIT:
- decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
- decode_sb(avctx, row, col + hbs, lflvl,
+ decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb(td, row, col + hbs, lflvl,
yoff + 8 * hbs * bytesperpixel,
uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
yoff += hbs * 8 * y_stride;
uvoff += hbs * 8 * uv_stride >> s->ss_v;
- decode_sb(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
- decode_sb(avctx, row + hbs, col + hbs, lflvl,
+ decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb(td, row + hbs, col + hbs, lflvl,
yoff + 8 * hbs * bytesperpixel,
uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
break;
default:
av_assert0(0);
}
- } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
+ } else if (vp56_rac_get_prob_branchy(td->c, p[1])) {
bp = PARTITION_SPLIT;
- decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
- decode_sb(avctx, row, col + hbs, lflvl,
+ decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb(td, row, col + hbs, lflvl,
yoff + 8 * hbs * bytesperpixel,
uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
} else {
bp = PARTITION_H;
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
}
} else if (row + hbs < s->rows) { // FIXME why not <=?
- if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
+ if (vp56_rac_get_prob_branchy(td->c, p[2])) {
bp = PARTITION_SPLIT;
- decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
yoff += hbs * 8 * y_stride;
uvoff += hbs * 8 * uv_stride >> s->ss_v;
- decode_sb(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
} else {
bp = PARTITION_V;
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
}
} else {
bp = PARTITION_SPLIT;
- decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
}
- s->counts.partition[bl][c][bp]++;
+ td->counts.partition[bl][c][bp]++;
}
-static void decode_sb_mem(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
+static void decode_sb_mem(VP9TileData *td, int row, int col, VP9Filter *lflvl,
ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
{
- VP9Context *s = avctx->priv_data;
- VP9Block *b = s->b;
+ const VP9Context *s = td->s;
+ VP9Block *b = td->b;
ptrdiff_t hbs = 4 >> bl;
AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
@@ -1020,39 +1132,39 @@ static void decode_sb_mem(AVCodecContext *avctx, int row, int col, VP9Filter *lf
if (bl == BL_8X8) {
av_assert2(b->bl == BL_8X8);
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
- } else if (s->b->bl == bl) {
- ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+ } else if (td->b->bl == bl) {
+ ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
if (b->bp == PARTITION_H && row + hbs < s->rows) {
yoff += hbs * 8 * y_stride;
uvoff += hbs * 8 * uv_stride >> s->ss_v;
- ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
+ ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
} else if (b->bp == PARTITION_V && col + hbs < s->cols) {
yoff += hbs * 8 * bytesperpixel;
uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
- ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
+ ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
}
} else {
- decode_sb_mem(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb_mem(td, row, col, lflvl, yoff, uvoff, bl + 1);
if (col + hbs < s->cols) { // FIXME why not <=?
if (row + hbs < s->rows) {
- decode_sb_mem(avctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
+ decode_sb_mem(td, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
yoff += hbs * 8 * y_stride;
uvoff += hbs * 8 * uv_stride >> s->ss_v;
- decode_sb_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
- decode_sb_mem(avctx, row + hbs, col + hbs, lflvl,
+ decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb_mem(td, row + hbs, col + hbs, lflvl,
yoff + 8 * hbs * bytesperpixel,
uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
} else {
yoff += hbs * 8 * bytesperpixel;
uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
- decode_sb_mem(avctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
+ decode_sb_mem(td, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
}
} else if (row + hbs < s->rows) {
yoff += hbs * 8 * y_stride;
uvoff += hbs * 8 * uv_stride >> s->ss_v;
- decode_sb_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+ decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
}
}
}
@@ -1067,9 +1179,13 @@ static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+/**
+ * Free the buffers that depend on frame size: the allocation starting at
+ * intra_pred_data[0] and the b_base/block_base buffers of every active tile
+ * context.
+ */
static void free_buffers(VP9Context *s)
{
+    int i;
+
av_freep(&s->intra_pred_data[0]);
- av_freep(&s->b_base);
- av_freep(&s->block_base);
+    // s->td can be NULL while active_tile_cols is already non-zero:
+    // decode_frame_header() updates active_tile_cols before it (re)allocates
+    // s->td and returns early if that allocation fails. Guard the loop the
+    // same way update_size() does.
+    if (s->td) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            av_freep(&s->td[i].b_base);
+            av_freep(&s->td[i].block_base);
+        }
+    }
}
static av_cold int vp9_decode_free(AVCodecContext *avctx)
@@ -1090,10 +1206,249 @@ static av_cold int vp9_decode_free(AVCodecContext *avctx)
ff_thread_release_buffer(avctx, &s->next_refs[i]);
av_frame_free(&s->next_refs[i].f);
}
+
free_buffers(s);
- av_freep(&s->c_b);
- s->c_b_size = 0;
+ vp9_free_entries(avctx);
+ av_freep(&s->td);
+ return 0;
+}
+
+/**
+ * Decode all tiles of the current frame on the calling thread, using the
+ * single tile context s->td[0].
+ *
+ * For every tile row, one range decoder per tile column is initialized from
+ * the bitstream; the superblock rows of that tile row are then decoded,
+ * loop-filtered (unless s->pass == 1) and reported as frame progress.
+ *
+ * @param avctx codec context whose priv_data is the VP9Context
+ * @param data  start of the tile data; every tile except the last one of the
+ *              frame is preceded by a 4-byte size field read with AV_RB32
+ * @param size  number of bytes available at data
+ * @return 0 on success, AVERROR_INVALIDDATA if a tile size exceeds the
+ *         remaining data or a tile's marker bit is set, or the error returned
+ *         by ff_vp56_init_range_decoder()
+ */
+static int decode_tiles(AVCodecContext *avctx,
+ const uint8_t *data, int size)
+{
+ VP9Context *s = avctx->priv_data;
+ VP9TileData *td = &s->td[0];
+ int row, col, tile_row, tile_col, ret;
+ int bytesperpixel;
+ int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+ AVFrame *f;
+ ptrdiff_t yoff, uvoff, ls_y, ls_uv;
+
+ f = s->s.frames[CUR_FRAME].tf.f;
+ ls_y = f->linesize[0];
+ ls_uv =f->linesize[1];
+ bytesperpixel = s->bytesperpixel;
+
+ yoff = uvoff = 0;
+ for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+ set_tile_offset(&tile_row_start, &tile_row_end,
+ tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+
+ // set up the range decoder of every tile column in this tile row
+ // before any superblock of the row is decoded
+ for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+ int64_t tile_size;
+ // the very last tile of the frame has no size field and takes
+ // all remaining bytes
+ if (tile_col == s->s.h.tiling.tile_cols - 1 &&
+ tile_row == s->s.h.tiling.tile_rows - 1) {
+ tile_size = size;
+ } else {
+ tile_size = AV_RB32(data);
+ data += 4;
+ size -= 4;
+ }
+ // tile claims more bytes than are left: mark the frame as finished
+ // (progress INT_MAX) before bailing out
+ if (tile_size > size) {
+ ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+ return AVERROR_INVALIDDATA;
+ }
+ ret = ff_vp56_init_range_decoder(&td->c_b[tile_col], data, tile_size);
+ if (ret < 0)
+ return ret;
+ if (vp56_rac_get_prob_branchy(&td->c_b[tile_col], 128)) { // marker bit
+ ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+ return AVERROR_INVALIDDATA;
+ }
+ data += tile_size;
+ size -= tile_size;
+ }
+
+ // each iteration handles one superblock row: row advances by 8 and the
+ // plane offsets by 64 lines (chroma scaled by ss_v)
+ for (row = tile_row_start; row < tile_row_end;
+ row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+ VP9Filter *lflvl_ptr = s->lflvl;
+ ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+ for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+ set_tile_offset(&tile_col_start, &tile_col_end,
+ tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+ td->tile_col_start = tile_col_start;
+ // except in pass 2: reset the left-edge contexts at the start of
+ // each tile column and switch to that column's range decoder
+ if (s->pass != 2) {
+ memset(td->left_partition_ctx, 0, 8);
+ memset(td->left_skip_ctx, 0, 8);
+ if (s->s.h.keyframe || s->s.h.intraonly) {
+ memset(td->left_mode_ctx, DC_PRED, 16);
+ } else {
+ memset(td->left_mode_ctx, NEARESTMV, 8);
+ }
+ memset(td->left_y_nnz_ctx, 0, 16);
+ memset(td->left_uv_nnz_ctx, 0, 32);
+ memset(td->left_segpred_ctx, 0, 8);
+
+ td->c = &td->c_b[tile_col];
+ }
+
+ for (col = tile_col_start;
+ col < tile_col_end;
+ col += 8, yoff2 += 64 * bytesperpixel,
+ uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+ // FIXME integrate with lf code (i.e. zero after each
+ // use, similar to invtxfm coefficients, or similar)
+ if (s->pass != 1) {
+ memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+ }
+
+ if (s->pass == 2) {
+ decode_sb_mem(td, row, col, lflvl_ptr,
+ yoff2, uvoff2, BL_64X64);
+ } else {
+ decode_sb(td, row, col, lflvl_ptr,
+ yoff2, uvoff2, BL_64X64);
+ }
+ }
+ }
+
+ // pass 1 stops here: no intra backup, loop filter or progress report
+ if (s->pass == 1)
+ continue;
+
+ // backup pre-loopfilter reconstruction data for intra
+ // prediction of next row of sb64s
+ if (row + 8 < s->rows) {
+ memcpy(s->intra_pred_data[0],
+ f->data[0] + yoff + 63 * ls_y,
+ 8 * s->cols * bytesperpixel);
+ memcpy(s->intra_pred_data[1],
+ f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+ 8 * s->cols * bytesperpixel >> s->ss_h);
+ memcpy(s->intra_pred_data[2],
+ f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+ 8 * s->cols * bytesperpixel >> s->ss_h);
+ }
+
+ // loopfilter one row
+ if (s->s.h.filter.level) {
+ yoff2 = yoff;
+ uvoff2 = uvoff;
+ lflvl_ptr = s->lflvl;
+ for (col = 0; col < s->cols;
+ col += 8, yoff2 += 64 * bytesperpixel,
+ uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+ ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
+ yoff2, uvoff2);
+ }
+ }
+
+ // FIXME maybe we can make this more finegrained by running the
+ // loopfilter per-block instead of after each sbrow
+ // In fact that would also make intra pred left preparation easier?
+ ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * Slice-threading job: decode tile column `jobnr` through all tile rows of
+ * the current frame, using the per-column tile context s->td[jobnr].
+ *
+ * After each superblock row of the column is decoded, the bottom
+ * pre-loopfilter pixel row is saved for intra prediction and the row's
+ * progress counter is incremented by one via vp9_report_tile_progress().
+ *
+ * @param avctx    codec context whose priv_data is the VP9Context
+ * @param tdata    not referenced in this function
+ * @param jobnr    tile column index handled by this job
+ * @param threadnr not referenced in this function
+ * @return always 0
+ */
+static av_always_inline
+int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
+ int threadnr)
+{
+ VP9Context *s = avctx->priv_data;
+ VP9TileData *td = &s->td[jobnr];
+ ptrdiff_t uvoff, yoff, ls_y, ls_uv;
+ int bytesperpixel = s->bytesperpixel, row, col, tile_row;
+ unsigned tile_cols_len;
+ int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+ VP9Filter *lflvl_ptr_base;
+ AVFrame *f;
+
+ f = s->s.frames[CUR_FRAME].tf.f;
+ ls_y = f->linesize[0];
+ ls_uv =f->linesize[1];
+
+ // horizontal start of this tile column: plane byte offsets and the first
+ // loop filter entry, both derived from tile_col_start >> 3
+ set_tile_offset(&tile_col_start, &tile_col_end,
+ jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+ td->tile_col_start = tile_col_start;
+ uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
+ yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
+ lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);
+
+ for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+ set_tile_offset(&tile_row_start, &tile_row_end,
+ tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+
+ // in this path the tile context's range decoders are indexed by tile row
+ td->c = &td->c_b[tile_row];
+ for (row = tile_row_start; row < tile_row_end;
+ row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+ ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+ // s->lflvl is indexed as sb_cols entries per superblock row here
+ VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3);
+
+ // reset the left-edge contexts at the start of every superblock row
+ memset(td->left_partition_ctx, 0, 8);
+ memset(td->left_skip_ctx, 0, 8);
+ if (s->s.h.keyframe || s->s.h.intraonly) {
+ memset(td->left_mode_ctx, DC_PRED, 16);
+ } else {
+ memset(td->left_mode_ctx, NEARESTMV, 8);
+ }
+ memset(td->left_y_nnz_ctx, 0, 16);
+ memset(td->left_uv_nnz_ctx, 0, 32);
+ memset(td->left_segpred_ctx, 0, 8);
+
+ for (col = tile_col_start;
+ col < tile_col_end;
+ col += 8, yoff2 += 64 * bytesperpixel,
+ uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+ // FIXME integrate with lf code (i.e. zero after each
+ // use, similar to invtxfm coefficients, or similar)
+ memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+ decode_sb(td, row, col, lflvl_ptr,
+ yoff2, uvoff2, BL_64X64);
+ }
+
+ // backup pre-loopfilter reconstruction data for intra
+ // prediction of next row of sb64s
+ // (only the span belonging to this tile column is copied)
+ tile_cols_len = tile_col_end - tile_col_start;
+ if (row + 8 < s->rows) {
+ memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * bytesperpixel),
+ f->data[0] + yoff + 63 * ls_y,
+ 8 * tile_cols_len * bytesperpixel);
+ memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+ f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+ 8 * tile_cols_len * bytesperpixel >> s->ss_h);
+ memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+ f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+ 8 * tile_cols_len * bytesperpixel >> s->ss_h);
+ }
+
+ // one increment per tile column; loopfilter_proc() waits until the
+ // counter of a row reaches tile_cols before filtering it
+ vp9_report_tile_progress(s, row >> 3, 1);
+ }
+ }
+ return 0;
+}
+
+/**
+ * Main-thread companion of decode_tiles_mt(): walks the superblock rows in
+ * order, waits until every tile column has reported the row as decoded and
+ * then, if the frame's filter level is non-zero, loop-filters that row.
+ *
+ * @param avctx codec context whose priv_data is the VP9Context
+ * @return always 0
+ */
+static av_always_inline
+int loopfilter_proc(AVCodecContext *avctx)
+{
+    VP9Context *s = avctx->priv_data;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    ptrdiff_t ls_y = f->linesize[0];
+    ptrdiff_t ls_uv = f->linesize[1];
+    int bytesperpixel = s->bytesperpixel;
+    int sb_row;
+
+    for (sb_row = 0; sb_row < s->sb_rows; sb_row++) {
+        ptrdiff_t yoff, uvoff;
+        VP9Filter *lflvl_ptr;
+        int col;
+
+        // the wait happens even when filtering is disabled
+        vp9_await_tile_progress(s, sb_row, s->s.h.tiling.tile_cols);
+
+        if (!s->s.h.filter.level)
+            continue;
+
+        // start of this superblock row in the luma/chroma planes and in the
+        // per-superblock filter array
+        yoff      = (ls_y * 64) * sb_row;
+        uvoff     = (ls_uv * 64 >> s->ss_v) * sb_row;
+        lflvl_ptr = s->lflvl + s->sb_cols * sb_row;
+
+        col = 0;
+        while (col < s->cols) {
+            ff_vp9_loopfilter_sb(avctx, lflvl_ptr, sb_row << 3, col,
+                                 yoff, uvoff);
+            col   += 8;
+            yoff  += 64 * bytesperpixel;
+            uvoff += 64 * bytesperpixel >> s->ss_h;
+            lflvl_ptr++;
+        }
+    }
return 0;
}
@@ -1104,12 +1459,10 @@ static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
const uint8_t *data = pkt->data;
int size = pkt->size;
VP9Context *s = avctx->priv_data;
- int ret, tile_row, tile_col, i, ref, row, col;
+ int ret, i, j, ref;
int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
(!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
- ptrdiff_t yoff, uvoff, ls_y, ls_uv;
AVFrame *f;
- int bytesperpixel;
if ((ret = decode_frame_header(avctx, data, size, &ref)) < 0) {
return ret;
@@ -1159,8 +1512,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
f = s->s.frames[CUR_FRAME].tf.f;
f->key_frame = s->s.h.keyframe;
f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
- ls_y = f->linesize[0];
- ls_uv =f->linesize[1];
if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
(s->s.frames[REF_FRAME_MVPAIR].tf.f->width != s->s.frames[CUR_FRAME].tf.f->width ||
@@ -1195,7 +1546,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
}
// main tile decode loop
- bytesperpixel = s->bytesperpixel;
memset(s->above_partition_ctx, 0, s->cols);
memset(s->above_skip_ctx, 0, s->cols);
if (s->s.h.keyframe || s->s.h.intraonly) {
@@ -1233,20 +1583,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
ff_thread_finish_setup(avctx);
}
+ if (avctx->active_thread_type & FF_THREAD_SLICE) {
+ for (i = 0; i < s->sb_rows; i++)
+ atomic_store(&s->entries[i], 0);
+ }
+
do {
- yoff = uvoff = 0;
- s->b = s->b_base;
- s->block = s->block_base;
- s->uvblock[0] = s->uvblock_base[0];
- s->uvblock[1] = s->uvblock_base[1];
- s->eob = s->eob_base;
- s->uveob[0] = s->uveob_base[0];
- s->uveob[1] = s->uveob_base[1];
-
- for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
- set_tile_offset(&s->tile_row_start, &s->tile_row_end,
- tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
- if (s->pass != 2) {
+ for (i = 0; i < s->active_tile_cols; i++) {
+ s->td[i].b = s->td[i].b_base;
+ s->td[i].block = s->td[i].block_base;
+ s->td[i].uvblock[0] = s->td[i].uvblock_base[0];
+ s->td[i].uvblock[1] = s->td[i].uvblock_base[1];
+ s->td[i].eob = s->td[i].eob_base;
+ s->td[i].uveob[0] = s->td[i].uveob_base[0];
+ s->td[i].uveob[1] = s->td[i].uveob_base[1];
+ }
+
+ if (avctx->active_thread_type == FF_THREAD_SLICE) {
+ int tile_row, tile_col;
+
+ assert(!pass);
+
+ for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
int64_t tile_size;
@@ -1258,105 +1616,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
data += 4;
size -= 4;
}
- if (tile_size > size) {
- ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+ if (tile_size > size)
return AVERROR_INVALIDDATA;
- }
- ret = ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
+ ret = ff_vp56_init_range_decoder(&s->td[tile_col].c_b[tile_row], data, tile_size);
if (ret < 0)
return ret;
- if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
- ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+ if (vp56_rac_get_prob_branchy(&s->td[tile_col].c_b[tile_row], 128)) // marker bit
return AVERROR_INVALIDDATA;
- }
data += tile_size;
size -= tile_size;
}
}
- for (row = s->tile_row_start; row < s->tile_row_end;
- row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
- VP9Filter *lflvl_ptr = s->lflvl;
- ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
-
- for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
- set_tile_offset(&s->tile_col_start, &s->tile_col_end,
- tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
-
- if (s->pass != 2) {
- memset(s->left_partition_ctx, 0, 8);
- memset(s->left_skip_ctx, 0, 8);
- if (s->s.h.keyframe || s->s.h.intraonly) {
- memset(s->left_mode_ctx, DC_PRED, 16);
- } else {
- memset(s->left_mode_ctx, NEARESTMV, 8);
- }
- memset(s->left_y_nnz_ctx, 0, 16);
- memset(s->left_uv_nnz_ctx, 0, 32);
- memset(s->left_segpred_ctx, 0, 8);
-
- memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
- }
-
- for (col = s->tile_col_start;
- col < s->tile_col_end;
- col += 8, yoff2 += 64 * bytesperpixel,
- uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
- // FIXME integrate with lf code (i.e. zero after each
- // use, similar to invtxfm coefficients, or similar)
- if (s->pass != 1) {
- memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
- }
-
- if (s->pass == 2) {
- decode_sb_mem(avctx, row, col, lflvl_ptr,
- yoff2, uvoff2, BL_64X64);
- } else {
- decode_sb(avctx, row, col, lflvl_ptr,
- yoff2, uvoff2, BL_64X64);
- }
- }
- if (s->pass != 2)
- memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
- }
-
- if (s->pass == 1)
- continue;
-
- // backup pre-loopfilter reconstruction data for intra
- // prediction of next row of sb64s
- if (row + 8 < s->rows) {
- memcpy(s->intra_pred_data[0],
- f->data[0] + yoff + 63 * ls_y,
- 8 * s->cols * bytesperpixel);
- memcpy(s->intra_pred_data[1],
- f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
- 8 * s->cols * bytesperpixel >> s->ss_h);
- memcpy(s->intra_pred_data[2],
- f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
- 8 * s->cols * bytesperpixel >> s->ss_h);
- }
-
- // loopfilter one row
- if (s->s.h.filter.level) {
- yoff2 = yoff;
- uvoff2 = uvoff;
- lflvl_ptr = s->lflvl;
- for (col = 0; col < s->cols;
- col += 8, yoff2 += 64 * bytesperpixel,
- uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
- ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
- yoff2, uvoff2);
- }
- }
-
- // FIXME maybe we can make this more finegrained by running the
- // loopfilter per-block instead of after each sbrow
- // In fact that would also make intra pred left preparation easier?
- ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
- }
+ ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols);
+ } else {
+ ret = decode_tiles(avctx, data, size);
+ if (ret < 0)
+ return ret;
}
+ // Sum all counts fields into td[0].counts for tile threading
+ if (avctx->active_thread_type == FF_THREAD_SLICE)
+ for (i = 1; i < s->s.h.tiling.tile_cols; i++)
+ for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
+ ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];
+
if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
ff_vp9_adapt_probs(s);
ff_thread_finish_setup(avctx);
@@ -1492,7 +1776,8 @@ AVCodec ff_vp9_decoder = {
.init = vp9_decode_init,
.close = vp9_decode_free,
.decode = vp9_decode_frame,
- .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+ .caps_internal = FF_CODEC_CAP_SLICE_THREAD_HAS_MF,
.flush = vp9_decode_flush,
.init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
.update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),