From 951455c1c18d54177f281dba174078e54a835361 Mon Sep 17 00:00:00 2001
From: Daniel Kang <daniel.d.kang@gmail.com>
Date: Thu, 12 Jul 2012 13:07:06 -0700
Subject: vp8: implement sliced threading

Testing gives 25-30% gain on HD clips with two threads and
up to 50% gain with eight threads.

Sliced threading uses more memory than single or frame threading,
since it allocates macroblocks for the entire frame.

Frame threading and single threading keep the previous memory
layout.

Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
---
 libavcodec/vp8.h | 63 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

(limited to 'libavcodec/vp8.h')

diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 2f2cb80a0a..1355da4d68 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -4,6 +4,7 @@
  * Copyright (C) 2010 David Conrad
  * Copyright (C) 2010 Ronald S. Bultje
  * Copyright (C) 2010 Jason Garrett-Glaser
+ * Copyright (C) 2012 Daniel Kang
  *
  * This file is part of Libav.
  *
@@ -88,10 +89,40 @@ typedef struct {
 } VP8Macroblock;
 
 typedef struct {
+    pthread_mutex_t lock;
+    pthread_cond_t  cond;
+    int thread_nr;
+    int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF)
+    int wait_mb_pos; // What the current thread is waiting on.
+    uint8_t *edge_emu_buffer;
+    /**
+     * For coeff decode, we need to know whether the above block had non-zero
+     * coefficients. This means for each macroblock, we need data for 4 luma
+     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
+     * per macroblock. We keep the last row in top_nnz (in VP8Context).
+     */
+    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
+    /**
+     * This is the index plus one of the last non-zero coeff
+     * for each of the blocks in the current macroblock.
+     * So, 0 -> no coeffs
+     *     1 -> dc-only (special transform)
+     *     2+-> full transform
+     */
+    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
+    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
+    VP8FilterStrength *filter_strength;
+} VP8ThreadData;
+
+#define MAX_THREADS 8
+typedef struct {
+    VP8ThreadData *thread_data;
     AVCodecContext *avctx;
     AVFrame *framep[4];
     AVFrame *next_framep[4];
-    uint8_t *edge_emu_buffer;
+    AVFrame *curframe;
+    AVFrame *prev_frame;
 
     uint16_t mb_width;   /* number of horizontal MB */
     uint16_t mb_height;  /* number of vertical MB */
@@ -128,7 +159,6 @@ typedef struct {
     } filter;
 
     VP8Macroblock *macroblocks;
-    VP8FilterStrength *filter_strength;
 
     uint8_t *intra4x4_pred_mode_top;
     uint8_t intra4x4_pred_mode_left[4];
@@ -169,32 +199,10 @@ typedef struct {
         int8_t ref[4];
     } lf_delta;
 
-    /**
-     * Cache of the top row needed for intra prediction
-     * 16 for luma, 8 for each chroma plane
-     */
     uint8_t (*top_border)[16+8+8];
-
-    /**
-     * For coeff decode, we need to know whether the above block had non-zero
-     * coefficients. This means for each macroblock, we need data for 4 luma
-     * blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
-     * per macroblock. We keep the last row in top_nnz.
-     */
     uint8_t (*top_nnz)[9];
-    DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
 
-    /**
-     * This is the index plus one of the last non-zero coeff
-     * for each of the blocks in the current macroblock.
-     * So, 0 -> no coeffs
-     *     1 -> dc-only (special transform)
-     *     2+-> full transform
-     */
-    DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
     VP56RangeCoder c;   ///< header context, includes mb modes and motion vectors
-    DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
-    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
 
     /**
      * These are all of the updatable probabilities for binary decisions.
@@ -247,6 +255,13 @@ typedef struct {
     uint8_t *segmentation_maps[5];
     int num_maps_to_be_freed;
     int maps_are_invalid;
+    int num_jobs;
+    /**
+     * This describes the macroblock memory layout.
+     * 0 -> Only width+height*2+1 macroblocks allocated (frame/single thread).
+     * 1 -> Macroblocks for entire frame allocated (sliced thread).
+     */
+    int mb_layout;
 } VP8Context;
 
 #endif /* AVCODEC_VP8_H */
-- 
cgit v1.2.3