From 3eeb7557637e8e48fbc64e844a94775edb496906 Mon Sep 17 00:00:00 2001
From: Christophe GISQUET <christophe.gisquet@gmail.com>
Date: Mon, 2 Jan 2012 20:53:54 +0100
Subject: rv34: Inter/intra MB code split

Split inter/intra macroblock handling code. This will allow further
optimizations, such as performing inverse transform and block reconstruction
in a single pass, as well as specializing the code for each macroblock kind.

Signed-off-by: Janne Grunau <janne-libav@jannau.net>
---
 libavcodec/rv34.c | 244 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 191 insertions(+), 53 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 48b5193f38..48f34b93e0 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -351,44 +351,70 @@ static inline RV34VLC* choose_vlc_set(int quant, int mod, int type)
 }
 
 /**
- * Decode macroblock header and return CBP in case of success, -1 otherwise.
+ * Decode intra macroblock header and return CBP in case of success, -1 otherwise.
  */
-static int rv34_decode_mb_header(RV34DecContext *r, int8_t *intra_types)
+static int rv34_decode_intra_mb_header(RV34DecContext *r, int8_t *intra_types)
 {
     MpegEncContext *s = &r->s;
     GetBitContext *gb = &s->gb;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
-    int i, t;
+    int t;
 
-    if(!r->si.type){
-        r->is16 = get_bits1(gb);
-        if(!r->is16 && !r->rv30){
+    r->is16 = get_bits1(gb);
+    if(r->is16){
+        s->current_picture_ptr->f.mb_type[mb_pos] = MB_TYPE_INTRA16x16;
+        r->block_type = RV34_MB_TYPE_INTRA16x16;
+        t = get_bits(gb, 2);
+        fill_rectangle(intra_types, 4, 4, r->intra_types_stride, t, sizeof(intra_types[0]));
+        r->luma_vlc   = 2;
+    }else{
+        if(!r->rv30){
             if(!get_bits1(gb))
                 av_log(s->avctx, AV_LOG_ERROR, "Need DQUANT\n");
         }
-        s->current_picture_ptr->f.mb_type[mb_pos] = r->is16 ? MB_TYPE_INTRA16x16 : MB_TYPE_INTRA;
-        r->block_type = r->is16 ? RV34_MB_TYPE_INTRA16x16 : RV34_MB_TYPE_INTRA;
-    }else{
-        r->block_type = r->decode_mb_info(r);
-        if(r->block_type == -1)
+        s->current_picture_ptr->f.mb_type[mb_pos] = MB_TYPE_INTRA;
+        r->block_type = RV34_MB_TYPE_INTRA;
+        if(r->decode_intra_types(r, gb, intra_types) < 0)
             return -1;
-        s->current_picture_ptr->f.mb_type[mb_pos] = rv34_mb_type_to_lavc[r->block_type];
-        r->mb_type[mb_pos] = r->block_type;
-        if(r->block_type == RV34_MB_SKIP){
-            if(s->pict_type == AV_PICTURE_TYPE_P)
-                r->mb_type[mb_pos] = RV34_MB_P_16x16;
-            if(s->pict_type == AV_PICTURE_TYPE_B)
-                r->mb_type[mb_pos] = RV34_MB_B_DIRECT;
-        }
-        r->is16 = !!IS_INTRA16x16(s->current_picture_ptr->f.mb_type[mb_pos]);
-        rv34_decode_mv(r, r->block_type);
-        if(r->block_type == RV34_MB_SKIP){
-            fill_rectangle(intra_types, 4, 4, r->intra_types_stride, 0, sizeof(intra_types[0]));
-            return 0;
-        }
-        r->chroma_vlc = 1;
-        r->luma_vlc   = 0;
+        r->luma_vlc   = 1;
     }
+
+    r->chroma_vlc = 0;
+    r->cur_vlcs   = choose_vlc_set(r->si.quant, r->si.vlc_set, 0);
+
+    return rv34_decode_cbp(gb, r->cur_vlcs, r->is16);
+}
+
+/**
+ * Decode inter macroblock header and return CBP in case of success, -1 otherwise.
+ */
+static int rv34_decode_inter_mb_header(RV34DecContext *r, int8_t *intra_types)
+{
+    MpegEncContext *s = &r->s;
+    GetBitContext *gb = &s->gb;
+    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+    int i, t;
+
+    r->block_type = r->decode_mb_info(r);
+    if(r->block_type == -1)
+        return -1;
+    s->current_picture_ptr->f.mb_type[mb_pos] = rv34_mb_type_to_lavc[r->block_type];
+    r->mb_type[mb_pos] = r->block_type;
+    if(r->block_type == RV34_MB_SKIP){
+        if(s->pict_type == AV_PICTURE_TYPE_P)
+            r->mb_type[mb_pos] = RV34_MB_P_16x16;
+        if(s->pict_type == AV_PICTURE_TYPE_B)
+            r->mb_type[mb_pos] = RV34_MB_B_DIRECT;
+    }
+    r->is16 = !!IS_INTRA16x16(s->current_picture_ptr->f.mb_type[mb_pos]);
+    rv34_decode_mv(r, r->block_type);
+    if(r->block_type == RV34_MB_SKIP){
+        fill_rectangle(intra_types, 4, 4, r->intra_types_stride, 0, sizeof(intra_types[0]));
+        return 0;
+    }
+    r->chroma_vlc = 1;
+    r->luma_vlc   = 0;
+
     if(IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos])){
         if(r->is16){
             t = get_bits(gb, 2);
@@ -1123,7 +1149,7 @@ static int rv34_set_deblock_coef(RV34DecContext *r)
     return hmvmask | vmvmask;
 }
 
-static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
+static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
 {
     MpegEncContext *s = &r->s;
     GetBitContext *gb = &s->gb;
@@ -1131,7 +1157,6 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
     int q_dc, q_ac, has_ac;
     int i, blknum, blkoff;
     LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
-    int luma_dc_quant;
     int dist;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
@@ -1151,20 +1176,19 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
         r->avail_cache[1] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride - 1];
 
     s->qscale = r->si.quant;
-    cbp = cbp2 = rv34_decode_mb_header(r, intra_types);
+    cbp = cbp2 = rv34_decode_inter_mb_header(r, intra_types);
     r->cbp_luma  [mb_pos] = cbp;
     r->cbp_chroma[mb_pos] = cbp >> 16;
-    if(s->pict_type == AV_PICTURE_TYPE_I)
-        r->deblock_coefs[mb_pos] = 0xFFFF;
-    else
-        r->deblock_coefs[mb_pos] = rv34_set_deblock_coef(r) | r->cbp_luma[mb_pos];
+    r->deblock_coefs[mb_pos] = rv34_set_deblock_coef(r) | r->cbp_luma[mb_pos];
     s->current_picture_ptr->f.qscale_table[mb_pos] = s->qscale;
 
     if(cbp == -1)
         return -1;
 
-    luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16 ? r->luma_dc_quant_p[s->qscale] : r->luma_dc_quant_i[s->qscale];
     if(r->is16){
+        int luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16
+                          ? r->luma_dc_quant_p[s->qscale]
+                          : r->luma_dc_quant_i[s->qscale];
         q_dc = rv34_qscale_tab[luma_dc_quant];
         q_ac = rv34_qscale_tab[s->qscale];
         s->dsp.clear_block(block16);
@@ -1172,25 +1196,37 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
             r->rdsp.rv34_inv_transform_tab[1](block16);
         else
             r->rdsp.rv34_inv_transform_dc_tab[1](block16);
-    }
 
-    q_ac = rv34_qscale_tab[s->qscale];
-    for(i = 0; i < 16; i++, cbp >>= 1){
-        DCTELEM *ptr;
-        if(!r->is16 && !(cbp & 1)) continue;
-        blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
-        blkoff = ((i & 1) << 2) + ((i & 4) << 3);
-        ptr    = s->block[blknum] + blkoff;
-        if(cbp & 1)
-            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
-        else
-            has_ac = 0;
-        if(r->is16) //FIXME: optimize
+        q_ac = rv34_qscale_tab[s->qscale];
+        for(i = 0; i < 16; i++, cbp >>= 1){
+            DCTELEM *ptr;
+            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
+            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
+            ptr    = s->block[blknum] + blkoff;
+            if(cbp & 1)
+                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+            else
+                has_ac = 0;
             ptr[0] = block16[(i & 3) | ((i & 0xC) << 1)];
-        if(has_ac)
-            r->rdsp.rv34_inv_transform_tab[0](ptr);
-        else
-            r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+            if(has_ac)
+                r->rdsp.rv34_inv_transform_tab[0](ptr);
+            else
+                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+        }
+    }else{
+        q_ac = rv34_qscale_tab[s->qscale];
+        for(i = 0; i < 16; i++, cbp >>= 1){
+            DCTELEM *ptr;
+            if(!(cbp & 1)) continue;
+            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
+            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
+            ptr    = s->block[blknum] + blkoff;
+            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+            if(has_ac)
+                r->rdsp.rv34_inv_transform_tab[0](ptr);
+            else
+                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+        }
     }
     if(r->block_type == RV34_MB_P_MIX16x16)
         r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1);
@@ -1215,6 +1251,104 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
     return 0;
 }
 
+static int rv34_decode_intra_macroblock(RV34DecContext *r, int8_t *intra_types)
+{
+    MpegEncContext *s = &r->s;
+    GetBitContext *gb = &s->gb;
+    int cbp, cbp2;
+    int q_dc, q_ac, has_ac;
+    int i, blknum, blkoff;
+    LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
+    int dist;
+    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+
+    // Calculate which neighbours are available. Maybe it's worth optimizing too.
+    memset(r->avail_cache, 0, sizeof(r->avail_cache)); // start with every entry cleared
+    fill_rectangle(r->avail_cache + 6, 2, 2, 4, 1, 4); // NOTE(review): presumably sets the current MB's own entries to 1 - confirm against fill_rectangle()
+    dist = (s->mb_x - s->resync_mb_x) + (s->mb_y - s->resync_mb_y) * s->mb_width; // offset of this MB from the resync position, counted in MBs
+    if(s->mb_x && dist) // left neighbour (mb_pos - 1): needs a column to the left and at least one MB since resync
+        r->avail_cache[5] =
+        r->avail_cache[9] = s->current_picture_ptr->f.mb_type[mb_pos - 1];
+    if(dist >= s->mb_width) // top neighbour (mb_pos - mb_stride): needs a full MB row since resync
+        r->avail_cache[2] =
+        r->avail_cache[3] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride];
+    if(((s->mb_x+1) < s->mb_width) && dist >= s->mb_width - 1) // top-right neighbour: not on the last column
+        r->avail_cache[4] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride + 1];
+    if(s->mb_x && dist > s->mb_width) // top-left neighbour: not on the first column
+        r->avail_cache[1] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride - 1];
+
+    s->qscale = r->si.quant;
+    cbp = cbp2 = rv34_decode_intra_mb_header(r, intra_types);
+    r->cbp_luma  [mb_pos] = cbp;
+    r->cbp_chroma[mb_pos] = cbp >> 16;
+    r->deblock_coefs[mb_pos] = 0xFFFF;
+     s->current_picture_ptr->f.qscale_table[mb_pos] = s->qscale;
+
+    if(cbp == -1)
+        return -1;
+
+    if(r->is16){
+        int luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16
+                          ? r->luma_dc_quant_p[s->qscale]
+                          : r->luma_dc_quant_i[s->qscale];
+        q_dc = rv34_qscale_tab[luma_dc_quant];
+        q_ac = rv34_qscale_tab[s->qscale];
+        s->dsp.clear_block(block16);
+        if (rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac))
+            r->rdsp.rv34_inv_transform_tab[1](block16);
+        else
+            r->rdsp.rv34_inv_transform_dc_tab[1](block16);
+
+        q_ac = rv34_qscale_tab[s->qscale];
+        for(i = 0; i < 16; i++, cbp >>= 1){
+            DCTELEM *ptr;
+            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
+            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
+            ptr    = s->block[blknum] + blkoff;
+            if(cbp & 1)
+                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+            else
+                has_ac = 0;
+            ptr[0] = block16[(i & 3) | ((i & 0xC) << 1)];
+            if(has_ac)
+                r->rdsp.rv34_inv_transform_tab[0](ptr);
+            else
+                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+        }
+    }else{
+        q_ac = rv34_qscale_tab[s->qscale];
+        for(i = 0; i < 16; i++, cbp >>= 1){
+            DCTELEM *ptr;
+            if(!(cbp & 1)) continue;
+            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
+            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
+            ptr    = s->block[blknum] + blkoff;
+            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+            if(has_ac)
+                r->rdsp.rv34_inv_transform_tab[0](ptr);
+            else
+                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+        }
+    }
+
+    q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
+    q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
+    for(; i < 24; i++, cbp >>= 1){
+        DCTELEM *ptr;
+        if(!(cbp & 1)) continue;
+        blknum = ((i & 4) >> 2) + 4;
+        blkoff = ((i & 1) << 2) + ((i & 2) << 4);
+        ptr    = s->block[blknum] + blkoff;
+        if (rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac))
+            r->rdsp.rv34_inv_transform_tab[0](ptr);
+        else
+            r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+    }
+    rv34_output_macroblock(r, intra_types, cbp2, r->is16);
+
+    return 0;
+}
+
 static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
 {
     int bits;
@@ -1324,7 +1458,11 @@ static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int
         ff_update_block_index(s);
         s->dsp.clear_blocks(s->block[0]);
 
-        if(rv34_decode_macroblock(r, r->intra_types + s->mb_x * 4 + 4) < 0){
+        if(r->si.type)
+            res = rv34_decode_inter_macroblock(r, r->intra_types + s->mb_x * 4 + 4);
+        else
+            res = rv34_decode_intra_macroblock(r, r->intra_types + s->mb_x * 4 + 4);
+        if(res < 0){
             ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, ER_MB_ERROR);
             return -1;
         }
-- 
cgit v1.2.3


From d78062386e425deafe9a08d109cff70b7a2de22c Mon Sep 17 00:00:00 2001
From: Christophe GISQUET <christophe.gisquet@gmail.com>
Date: Tue, 3 Jan 2012 00:22:11 +0100
Subject: rv34: Intra 16x16 handling

Extract processing of intra 16x16 blocks from intra macroblock
processing.
Also implement a function performing inverse transform and block
reconstruction for DC-only blocks in 1 pass instead of 2.
---
 libavcodec/rv34.c             | 281 +++++++++++++++++++++++-------------------
 libavcodec/rv34dsp.c          |  47 ++++++-
 libavcodec/rv34dsp.h          |   7 ++
 libavcodec/x86/rv34dsp.asm    |  83 +++++++++++--
 libavcodec/x86/rv34dsp_init.c |  14 ++-
 5 files changed, 292 insertions(+), 140 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 48f34b93e0..478c93ada0 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -982,15 +982,6 @@ static void rv34_pred_4x4_block(RV34DecContext *r, uint8_t *dst, int stride, int
     r->h.pred4x4[itype](dst, prev, stride);
 }
 
-/** add_pixels_clamped for 4x4 block */
-static void rv34_add_4x4_block(uint8_t *dst, int stride, DCTELEM block[64], int off)
-{
-    int x, y;
-    for(y = 0; y < 4; y++)
-        for(x = 0; x < 4; x++)
-            dst[x + y*stride] = av_clip_uint8(dst[x + y*stride] + block[off + x+y*8]);
-}
-
 static inline int adjust_pred16(int itype, int up, int left)
 {
     if(!up && !left)
@@ -1007,15 +998,20 @@ static inline int adjust_pred16(int itype, int up, int left)
     return itype;
 }
 
-static void rv34_output_macroblock(RV34DecContext *r, int8_t *intra_types, int cbp, int is16)
+static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
 {
-    MpegEncContext *s = &r->s;
-    DSPContext *dsp = &s->dsp;
-    int i, j;
-    uint8_t *Y, *U, *V;
-    int itype;
-    int avail[6*8] = {0};
-    int idx;
+    LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
+    MpegEncContext *s    = &r->s;
+    DSPContext     *dsp  = &s->dsp;
+    GetBitContext  *gb   = &s->gb;
+    int             q_dc = rv34_qscale_tab[ r->luma_dc_quant_i[s->qscale] ],
+                    q_ac = rv34_qscale_tab[s->qscale];
+    uint8_t        *dst  = s->dest[0];
+    DCTELEM        *ptr  = s->block[0];
+    int       avail[6*8] = {0};
+    int i, j, itype, has_ac;
+
+    memset(block16, 0, 64 * sizeof(*block16));
 
     // Set neighbour information.
     if(r->avail_cache[1])
@@ -1031,52 +1027,142 @@ static void rv34_output_macroblock(RV34DecContext *r, int8_t *intra_types, int c
     if(r->avail_cache[9])
         avail[24] = avail[32] = 1;
 
-    Y = s->dest[0];
-    U = s->dest[1];
-    V = s->dest[2];
-    if(!is16){
-        for(j = 0; j < 4; j++){
-            idx = 9 + j*8;
-            for(i = 0; i < 4; i++, cbp >>= 1, Y += 4, idx++){
-                rv34_pred_4x4_block(r, Y, s->linesize, ittrans[intra_types[i]], avail[idx-8], avail[idx-1], avail[idx+7], avail[idx-7]);
-                avail[idx] = 1;
-                if(cbp & 1)
-                    rv34_add_4x4_block(Y, s->linesize, s->block[(i>>1)+(j&2)], (i&1)*4+(j&1)*32);
+    has_ac = rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac);
+    if(has_ac)
+        r->rdsp.rv34_inv_transform_tab[1](block16);
+    else
+        r->rdsp.rv34_inv_transform_dc_tab[1](block16);
+
+    itype = ittrans16[intra_types[0]];
+    itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
+    r->h.pred16x16[itype](dst, s->linesize);
+
+    dsp->clear_block(ptr);
+    for(j = 0; j < 4; j++){
+        for(i = 0; i < 4; i++, cbp >>= 1){
+            int dc = block16[i + j*8];
+
+            if(cbp & 1){
+                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+            }else
+                has_ac = 0;
+
+            if(has_ac){
+                ptr[0] = dc;
+                r->rdsp.rv34_idct_add(dst+4*i, s->linesize, ptr);
+                dsp->clear_block(ptr);
+            }else
+                r->rdsp.rv34_idct_dc_add(dst+4*i, s->linesize, dc);
+        }
+
+        dst += 4*s->linesize;
+    }
+
+    itype = ittrans16[intra_types[0]];
+    if(itype == PLANE_PRED8x8) itype = DC_PRED8x8;
+    itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
+
+    q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
+    q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
+
+    for(j = 1; j < 3; j++){
+        dst = s->dest[j];
+        r->h.pred8x8[itype](dst, s->uvlinesize);
+        for(i = 0; i < 4; i++, cbp >>= 1){
+            uint8_t *pdst;
+            if(!(cbp & 1)) continue;
+            pdst   = dst + (i&1)*4 + (i&2)*2*s->uvlinesize;
+
+            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac);
+            if(has_ac){
+                r->rdsp.rv34_idct_add(pdst, s->uvlinesize, ptr);
+                dsp->clear_block(ptr);
+            }else{
+                r->rdsp.rv34_idct_dc_add(pdst, s->uvlinesize, ptr[0]);
+                ptr[0] = 0;
             }
-            Y += s->linesize * 4 - 4*4;
-            intra_types += r->intra_types_stride;
         }
-        intra_types -= r->intra_types_stride * 4;
+    }
+}
+
+static void rv34_output_intra(RV34DecContext *r, int8_t *intra_types, int cbp)
+{
+    MpegEncContext *s   = &r->s;
+    DSPContext     *dsp = &s->dsp;
+    GetBitContext  *gb  = &s->gb;
+    DCTELEM        *ptr = s->block[0];
+    uint8_t        *dst = s->dest[0];
+    int      avail[6*8] = {0};
+    int i, j, k;
+    int idx, has_ac;
+    int q_ac, q_dc;
+
+    // Set neighbour information.
+    if(r->avail_cache[1])
+        avail[0] = 1;
+    if(r->avail_cache[2])
+        avail[1] = avail[2] = 1;
+    if(r->avail_cache[3])
+        avail[3] = avail[4] = 1;
+    if(r->avail_cache[4])
+        avail[5] = 1;
+    if(r->avail_cache[5])
+        avail[8] = avail[16] = 1;
+    if(r->avail_cache[9])
+        avail[24] = avail[32] = 1;
+
+    q_ac = rv34_qscale_tab[s->qscale];
+    for(j = 0; j < 4; j++){
+        idx = 9 + j*8;
+        for(i = 0; i < 4; i++, cbp >>= 1, dst += 4, idx++){
+            rv34_pred_4x4_block(r, dst, s->linesize, ittrans[intra_types[i]], avail[idx-8], avail[idx-1], avail[idx+7], avail[idx-7]);
+            avail[idx] = 1;
+            if(!(cbp & 1)) continue;
+
+            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+            if(has_ac){
+                r->rdsp.rv34_idct_add(dst, s->linesize, ptr);
+                dsp->clear_block(ptr);
+            }else{
+                r->rdsp.rv34_idct_dc_add(dst, s->linesize, ptr[0]);
+                ptr[0] = 0;
+            }
+        }
+        dst += s->linesize * 4 - 4*4;
+        intra_types += r->intra_types_stride;
+    }
+
+    intra_types -= r->intra_types_stride * 4;
+
+    q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
+    q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
+
+    for(k = 0; k < 2; k++){
+        dst = s->dest[1+k];
         fill_rectangle(r->avail_cache + 6, 2, 2, 4, 0, 4);
+
         for(j = 0; j < 2; j++){
-            idx = 6 + j*4;
-            for(i = 0; i < 2; i++, cbp >>= 1, idx++){
-                rv34_pred_4x4_block(r, U + i*4 + j*4*s->uvlinesize, s->uvlinesize, ittrans[intra_types[i*2+j*2*r->intra_types_stride]], r->avail_cache[idx-4], r->avail_cache[idx-1], !i && !j, r->avail_cache[idx-3]);
-                rv34_pred_4x4_block(r, V + i*4 + j*4*s->uvlinesize, s->uvlinesize, ittrans[intra_types[i*2+j*2*r->intra_types_stride]], r->avail_cache[idx-4], r->avail_cache[idx-1], !i && !j, r->avail_cache[idx-3]);
-                r->avail_cache[idx] = 1;
-                if(cbp & 0x01)
-                    rv34_add_4x4_block(U + i*4 + j*4*s->uvlinesize, s->uvlinesize, s->block[4], i*4+j*32);
-                if(cbp & 0x10)
-                    rv34_add_4x4_block(V + i*4 + j*4*s->uvlinesize, s->uvlinesize, s->block[5], i*4+j*32);
+            int* acache = r->avail_cache + 6 + j*4;
+            for(i = 0; i < 2; i++, cbp >>= 1, acache++){
+                int itype = ittrans[intra_types[i*2+j*2*r->intra_types_stride]];
+                rv34_pred_4x4_block(r, dst+4*i, s->uvlinesize, itype, acache[-4], acache[-1], !i && !j, acache[-3]);
+                acache[0] = 1;
+
+                if(!(cbp&1)) continue;
+
+                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac);
+                if(has_ac){
+                    r->rdsp.rv34_idct_add(dst + 4*i, s->uvlinesize, ptr);
+                    dsp->clear_block(ptr);
+                }
+                else {
+                    r->rdsp.rv34_idct_dc_add(dst + 4*i, s->uvlinesize, ptr[0]);
+                    ptr[0] = 0;
+                }
             }
+
+            dst += 4*s->uvlinesize;
         }
-    }else{
-        itype = ittrans16[intra_types[0]];
-        itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
-        r->h.pred16x16[itype](Y, s->linesize);
-        dsp->add_pixels_clamped(s->block[0], Y,     s->linesize);
-        dsp->add_pixels_clamped(s->block[1], Y + 8, s->linesize);
-        Y += s->linesize * 8;
-        dsp->add_pixels_clamped(s->block[2], Y,     s->linesize);
-        dsp->add_pixels_clamped(s->block[3], Y + 8, s->linesize);
-
-        itype = ittrans16[intra_types[0]];
-        if(itype == PLANE_PRED8x8) itype = DC_PRED8x8;
-        itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
-        r->h.pred8x8[itype](U, s->uvlinesize);
-        dsp->add_pixels_clamped(s->block[4], U, s->uvlinesize);
-        r->h.pred8x8[itype](V, s->uvlinesize);
-        dsp->add_pixels_clamped(s->block[5], V, s->uvlinesize);
     }
 }
 
@@ -1185,6 +1271,12 @@ static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
     if(cbp == -1)
         return -1;
 
+    if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos])){
+        if(r->is16) rv34_output_i16x16(r, intra_types, cbp);
+        else        rv34_output_intra(r, intra_types, cbp);
+        return 0;
+    }
+
     if(r->is16){
         int luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16
                           ? r->luma_dc_quant_p[s->qscale]
@@ -1243,10 +1335,7 @@ static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
         else
             r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
     }
-    if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos]))
-        rv34_output_macroblock(r, intra_types, cbp2, r->is16);
-    else
-        rv34_apply_differences(r, cbp2);
+    rv34_apply_differences(r, cbp2);
 
     return 0;
 }
@@ -1254,12 +1343,7 @@ static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
 static int rv34_decode_intra_macroblock(RV34DecContext *r, int8_t *intra_types)
 {
     MpegEncContext *s = &r->s;
-    GetBitContext *gb = &s->gb;
-    int cbp, cbp2;
-    int q_dc, q_ac, has_ac;
-    int i, blknum, blkoff;
-    LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
-    int dist;
+    int cbp, dist;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
     // Calculate which neighbours are available. Maybe it's worth optimizing too.
@@ -1278,74 +1362,21 @@ static int rv34_decode_intra_macroblock(RV34DecContext *r, int8_t *intra_types)
         r->avail_cache[1] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride - 1];
 
     s->qscale = r->si.quant;
-    cbp = cbp2 = rv34_decode_intra_mb_header(r, intra_types);
+    cbp = rv34_decode_intra_mb_header(r, intra_types);
     r->cbp_luma  [mb_pos] = cbp;
     r->cbp_chroma[mb_pos] = cbp >> 16;
     r->deblock_coefs[mb_pos] = 0xFFFF;
-     s->current_picture_ptr->f.qscale_table[mb_pos] = s->qscale;
+    s->current_picture_ptr->f.qscale_table[mb_pos] = s->qscale;
 
     if(cbp == -1)
         return -1;
 
     if(r->is16){
-        int luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16
-                          ? r->luma_dc_quant_p[s->qscale]
-                          : r->luma_dc_quant_i[s->qscale];
-        q_dc = rv34_qscale_tab[luma_dc_quant];
-        q_ac = rv34_qscale_tab[s->qscale];
-        s->dsp.clear_block(block16);
-        if (rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac))
-            r->rdsp.rv34_inv_transform_tab[1](block16);
-        else
-            r->rdsp.rv34_inv_transform_dc_tab[1](block16);
-
-        q_ac = rv34_qscale_tab[s->qscale];
-        for(i = 0; i < 16; i++, cbp >>= 1){
-            DCTELEM *ptr;
-            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
-            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
-            ptr    = s->block[blknum] + blkoff;
-            if(cbp & 1)
-                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
-            else
-                has_ac = 0;
-            ptr[0] = block16[(i & 3) | ((i & 0xC) << 1)];
-            if(has_ac)
-                r->rdsp.rv34_inv_transform_tab[0](ptr);
-            else
-                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
-        }
-    }else{
-        q_ac = rv34_qscale_tab[s->qscale];
-        for(i = 0; i < 16; i++, cbp >>= 1){
-            DCTELEM *ptr;
-            if(!(cbp & 1)) continue;
-            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
-            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
-            ptr    = s->block[blknum] + blkoff;
-            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
-            if(has_ac)
-                r->rdsp.rv34_inv_transform_tab[0](ptr);
-            else
-                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
-        }
-    }
-
-    q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
-    q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
-    for(; i < 24; i++, cbp >>= 1){
-        DCTELEM *ptr;
-        if(!(cbp & 1)) continue;
-        blknum = ((i & 4) >> 2) + 4;
-        blkoff = ((i & 1) << 2) + ((i & 2) << 4);
-        ptr    = s->block[blknum] + blkoff;
-        if (rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac))
-            r->rdsp.rv34_inv_transform_tab[0](ptr);
-        else
-            r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+        rv34_output_i16x16(r, intra_types, cbp);
+        return 0;
     }
-    rv34_output_macroblock(r, intra_types, cbp2, r->is16);
 
+    rv34_output_intra(r, intra_types, cbp);
     return 0;
 }
 
diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c
index 1767be4173..91c455a024 100644
--- a/libavcodec/rv34dsp.c
+++ b/libavcodec/rv34dsp.c
@@ -32,7 +32,7 @@
  * @{
  */
 
-static av_always_inline void rv34_row_transform(int temp[16], DCTELEM *block)
+static av_always_inline void rv34_row_transform(int temp[16], const DCTELEM *block)
 {
     int i;
 
@@ -72,6 +72,32 @@ static void rv34_inv_transform_c(DCTELEM *block){
     }
 }
 
+/**
+ * RealVideo 3.0/4.0 inverse transform + sample reconstruction.
+ * Code is almost the same as in SVQ3, only scaling is different.
+ * Each residual is clamped to [-255, 255] before it is added to dst, so the
+ * crop-table lookup stays in bounds even for out-of-range coefficients.
+ */
+static void rv34_idct_add_c(uint8_t *dst, int stride, const DCTELEM *block){
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+    int temp[16], res[4], i, j;
+
+    rv34_row_transform(temp, block);
+    for(i = 0; i < 4; i++){
+        const int z0 = 13*(temp[4*0+i] +    temp[4*2+i]) + 0x200;
+        const int z1 = 13*(temp[4*0+i] -    temp[4*2+i]) + 0x200;
+        const int z2 =  7* temp[4*1+i] - 17*temp[4*3+i];
+        const int z3 = 17* temp[4*1+i] +  7*temp[4*3+i];
+        res[0] = (z0 + z3) >> 10;
+        res[1] = (z1 + z2) >> 10;
+        res[2] = (z1 - z2) >> 10;
+        res[3] = (z0 - z3) >> 10;
+        for(j = 0; j < 4; j++) /* an 8-bit sample cannot move by more than 255 */
+            dst[j] = cm[ dst[j] + (res[j] < -255 ? -255 : res[j] > 255 ? 255 : res[j]) ];
+        dst  += stride;
+    }
+}
+
 /**
  * RealVideo 3.0/4.0 inverse transform for DC block
  *
@@ -97,6 +123,22 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){
     }
 }
 
+/* Add the scaled DC value to every sample of a 4x4 block, saturating to 8 bits. */
+static void rv34_idct_dc_add_c(uint8_t *dst, int stride, int dc)
+{
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+    int i, j;
+
+    /* Clamp the scaled DC to [-255, 255]: keeps the crop-table offset in bounds. */
+    dc  = (13*13*dc + 0x200) >> 10;
+    cm += dc < -255 ? -255 : dc > 255 ? 255 : dc;
+    for (i = 0; i < 4; i++) {
+        for (j = 0; j < 4; j++)
+            dst[j] = cm[ dst[j] ];
+        dst += stride;
+    }
+}
+
 static void rv34_inv_transform_dc_c(DCTELEM *block)
 {
     DCTELEM dc = (13 * 13 * block[0] + 0x200) >> 10;
@@ -126,6 +168,9 @@ av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) {
     c->rv34_inv_transform_dc_tab[0]  = rv34_inv_transform_dc_c;
     c->rv34_inv_transform_dc_tab[1]  = rv34_inv_transform_dc_noround_c;
 
+    c->rv34_idct_add    = rv34_idct_add_c;
+    c->rv34_idct_dc_add = rv34_idct_dc_add_c;
+
     if (HAVE_NEON)
         ff_rv34dsp_init_neon(c, dsp);
     if (HAVE_MMX)
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 6f53a09928..2e9e58e64a 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -36,6 +36,11 @@ typedef void (*rv40_weight_func)(uint8_t *dst/*align width (8 or 16)*/,
 
 typedef void (*rv34_inv_transform_func)(DCTELEM *block);
 
+typedef void (*rv34_idct_add_func)(uint8_t *dst, int stride,
+                                   const DCTELEM *block);
+typedef void (*rv34_idct_dc_add_func)(uint8_t *dst, int stride,
+                                      int   dc);
+
 typedef void (*rv40_weak_loop_filter_func)(uint8_t *src, int stride,
                                            int filter_p1, int filter_q1,
                                            int alpha, int beta,
@@ -57,6 +62,8 @@ typedef struct RV34DSPContext {
     rv40_weight_func rv40_weight_pixels_tab[2];
     rv34_inv_transform_func rv34_inv_transform_tab[2];
     void (*rv34_inv_transform_dc_tab[2])(DCTELEM *block);
+    rv34_idct_add_func rv34_idct_add;
+    rv34_idct_dc_add_func rv34_idct_dc_add;
     rv40_weak_loop_filter_func rv40_weak_loop_filter[2];
     rv40_strong_loop_filter_func rv40_strong_loop_filter[2];
     rv40_loop_filter_strength_func rv40_loop_filter_strength[2];
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 58f1af0495..c8eeebbfeb 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -35,21 +35,84 @@ SECTION .text
     sar    %1, 10
 %endmacro
 
-%macro rv34_idct_dequant4x4_dc 1
-cglobal rv34_idct_dequant4x4_%1_mmx2, 1, 2, 0
+%macro rv34_idct 1
+cglobal rv34_idct_%1_mmx2, 1, 2, 0
     movsx   r1, word [r0]
     IDCT_DC r1
-    movd    mm0, r1
-    pshufw  mm0, mm0, 0
-    movq    [r0+ 0], mm0
-    movq    [r0+16], mm0
-    movq    [r0+32], mm0
-    movq    [r0+48], mm0
+    movd    m0, r1
+    pshufw  m0, m0, 0
+    movq    [r0+ 0], m0
+    movq    [r0+16], m0
+    movq    [r0+32], m0
+    movq    [r0+48], m0
     REP_RET
 %endmacro
 
 INIT_MMX
 %define IDCT_DC IDCT_DC_ROUND
-rv34_idct_dequant4x4_dc dc
+rv34_idct dc
 %define IDCT_DC IDCT_DC_NOROUND
-rv34_idct_dequant4x4_dc dc_noround
+rv34_idct dc_noround
+
+; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+cglobal rv34_idct_dc_add_mmx, 3, 3
+    ; calculate DC
+    IDCT_DC_ROUND r2
+    pxor       m1, m1
+    movd       m0, r2
+    psubw      m1, m0
+    packuswb   m0, m0
+    packuswb   m1, m1
+    punpcklbw  m0, m0
+    punpcklbw  m1, m1
+    punpcklwd  m0, m0
+    punpcklwd  m1, m1
+
+    ; add DC
+    lea        r2, [r0+r1*2]
+    movh       m2, [r0]
+    movh       m3, [r0+r1]
+    movh       m4, [r2]
+    movh       m5, [r2+r1]
+    paddusb    m2, m0
+    paddusb    m3, m0
+    paddusb    m4, m0
+    paddusb    m5, m0
+    psubusb    m2, m1
+    psubusb    m3, m1
+    psubusb    m4, m1
+    psubusb    m5, m1
+    movh       [r0], m2
+    movh       [r0+r1], m3
+    movh       [r2], m4
+    movh       [r2+r1], m5
+    RET
+
+; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+INIT_XMM
+cglobal rv34_idct_dc_add_sse4, 3, 3, 6
+    ; load data
+    IDCT_DC_ROUND r2
+    pxor       m1, m1
+
+    ; calculate DC
+    movd       m0, r2
+    lea        r2, [r0+r1*2]
+    movd       m2, [r0]
+    movd       m3, [r0+r1]
+    pshuflw    m0, m0, 0
+    movd       m4, [r2]
+    movd       m5, [r2+r1]
+    punpcklqdq m0, m0
+    punpckldq  m2, m3
+    punpckldq  m4, m5
+    punpcklbw  m2, m1
+    punpcklbw  m4, m1
+    paddw      m2, m0
+    paddw      m4, m0
+    packuswb   m2, m4
+    movd      [r0], m2
+    pextrd [r0+r1], m2, 1
+    pextrd    [r2], m2, 2
+    pextrd [r2+r1], m2, 3
+    RET
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 4317e9b23b..c10ae4ee96 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -24,17 +24,23 @@
 #include "libavcodec/dsputil.h"
 #include "libavcodec/rv34dsp.h"
 
-void ff_rv34_idct_dequant4x4_dc_mmx2(DCTELEM *block);
-void ff_rv34_idct_dequant4x4_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+void ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
 
 av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
 {
 #if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
+    if (mm_flags & AV_CPU_FLAG_MMX)
+        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
     if (mm_flags & AV_CPU_FLAG_MMX2) {
-        c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dequant4x4_dc_mmx2;
-        c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dequant4x4_dc_noround_mmx2;
+        c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dc_mmx2;
+        c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dc_noround_mmx2;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE4)
+        c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
 #endif
 }
-- 
cgit v1.2.3


From 5ee5fa021f32e0506bed6ebd183c807d5162bc72 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Thu, 15 Dec 2011 17:56:06 -0500
Subject: avcodec: add a public function, avcodec_fill_audio_frame().

This is a convenience function for the user to fill audio AVFrame information.
---
 libavcodec/avcodec.h | 20 +++++++++++
 libavcodec/utils.c   | 94 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 75 insertions(+), 39 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 49d2a0f918..a5071aa438 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -4238,6 +4238,26 @@ void avsubtitle_free(AVSubtitle *sub);
 int avcodec_encode_audio(AVCodecContext *avctx, uint8_t *buf, int buf_size,
                          const short *samples);
 
+/**
+ * Fill audio frame data and linesize. If planar audio needs more channel
+ * pointers than frame->data holds, frame->extended_data is allocated with
+ * av_mallocz(); the caller must then release it with av_free().
+ *
+ * @param frame       the AVFrame
+ *                    frame->nb_samples must be set prior to calling the
+ *                    function. This function fills in frame->data,
+ *                    frame->extended_data, frame->linesize[0].
+ * @param nb_channels channel count
+ * @param sample_fmt  sample format
+ * @param buf         buffer to use for frame data
+ * @param buf_size    size of buffer
+ * @param align       plane size sample alignment
+ * @return            0 on success, negative error code on failure
+ */
+int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+                             enum AVSampleFormat sample_fmt, const uint8_t *buf,
+                             int buf_size, int align);
+
 /**
  * Encode a video frame from pict into buf.
  * The input picture should be
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 8473aacb4f..c3faa76169 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -239,11 +239,47 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){
     *width=FFALIGN(*width, align);
 }
 
+int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+                             enum AVSampleFormat sample_fmt, const uint8_t *buf,
+                             int buf_size, int align)
+{
+    int ch, planar, needed_size, ret = 0;
+
+    needed_size = av_samples_get_buffer_size(NULL, nb_channels,
+                                             frame->nb_samples, sample_fmt,
+                                             align);
+    if (buf_size < needed_size) /* caller's buffer cannot hold the samples */
+        return AVERROR(EINVAL);
+
+    planar = av_sample_fmt_is_planar(sample_fmt);
+    if (planar && nb_channels > AV_NUM_DATA_POINTERS) {
+        if (!(frame->extended_data = av_mallocz(nb_channels *
+                                                sizeof(*frame->extended_data))))
+            return AVERROR(ENOMEM);
+    } else {
+        frame->extended_data = frame->data; /* pointers fit in data[] */
+    }
+
+    if ((ret = av_samples_fill_arrays(frame->extended_data, &frame->linesize[0],
+                                      buf, nb_channels, frame->nb_samples,
+                                      sample_fmt, align)) < 0) {
+        if (frame->extended_data != frame->data)
+            av_freep(&frame->extended_data); /* free and NULL: no dangling ptr */
+        return ret;
+    }
+    if (frame->extended_data != frame->data) {
+        for (ch = 0; ch < AV_NUM_DATA_POINTERS; ch++) /* mirror what fits */
+            frame->data[ch] = frame->extended_data[ch];
+    }
+
+    return ret;
+}
+
 static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
 {
     AVCodecInternal *avci = avctx->internal;
     InternalBuffer *buf;
-    int buf_size, ret, i, needs_extended_data;
+    int buf_size, ret;
 
     buf_size = av_samples_get_buffer_size(NULL, avctx->channels,
                                           frame->nb_samples, avctx->sample_fmt,
@@ -251,9 +287,6 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
     if (buf_size < 0)
         return AVERROR(EINVAL);
 
-    needs_extended_data = av_sample_fmt_is_planar(avctx->sample_fmt) &&
-                          avctx->channels > AV_NUM_DATA_POINTERS;
-
     /* allocate InternalBuffer if needed */
     if (!avci->buffer) {
         avci->buffer = av_mallocz(sizeof(InternalBuffer));
@@ -285,48 +318,31 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
     /* if there is no previous buffer or the previous buffer cannot be used
        as-is, allocate a new buffer and/or rearrange the channel pointers */
     if (!buf->extended_data) {
-        /* if the channel pointers will fit, just set extended_data to data,
-           otherwise allocate the extended_data channel pointers */
-        if (needs_extended_data) {
-            buf->extended_data = av_mallocz(avctx->channels *
-                                            sizeof(*buf->extended_data));
-            if (!buf->extended_data)
+        if (!buf->data[0]) {
+            if (!(buf->data[0] = av_mallocz(buf_size)))
                 return AVERROR(ENOMEM);
-        } else {
-            buf->extended_data = buf->data;
+            buf->audio_data_size = buf_size;
         }
-
-        /* if there is a previous buffer and it is large enough, reuse it and
-           just fill-in new channel pointers and linesize, otherwise allocate
-           a new buffer */
-        if (buf->extended_data[0]) {
-            ret = av_samples_fill_arrays(buf->extended_data, &buf->linesize[0],
-                                         buf->extended_data[0], avctx->channels,
-                                         frame->nb_samples, avctx->sample_fmt,
-                                         32);
-        } else {
-            ret = av_samples_alloc(buf->extended_data, &buf->linesize[0],
-                                   avctx->channels, frame->nb_samples,
-                                   avctx->sample_fmt, 32);
-        }
-        if (ret)
+        if ((ret = avcodec_fill_audio_frame(frame, avctx->channels,
+                                            avctx->sample_fmt, buf->data[0],
+                                            buf->audio_data_size, 32)))
             return ret;
 
-        /* if data was not used for extended_data, we need to copy as many of
-           the extended_data channel pointers as will fit */
-        if (needs_extended_data) {
-            for (i = 0; i < AV_NUM_DATA_POINTERS; i++)
-                buf->data[i] = buf->extended_data[i];
-        }
-        buf->audio_data_size = buf_size;
-        buf->nb_channels     = avctx->channels;
+        if (frame->extended_data == frame->data)
+            buf->extended_data = buf->data;
+        else
+            buf->extended_data = frame->extended_data;
+        memcpy(buf->data, frame->data, sizeof(frame->data));
+        buf->linesize[0] = frame->linesize[0];
+        buf->nb_channels = avctx->channels;
+    } else {
+        /* copy InternalBuffer info to the AVFrame */
+        frame->extended_data = buf->extended_data;
+        frame->linesize[0]   = buf->linesize[0];
+        memcpy(frame->data, buf->data, sizeof(frame->data));
     }
 
-    /* copy InternalBuffer info to the AVFrame */
     frame->type          = FF_BUFFER_TYPE_INTERNAL;
-    frame->extended_data = buf->extended_data;
-    frame->linesize[0]   = buf->linesize[0];
-    memcpy(frame->data, buf->data, sizeof(frame->data));
 
     if (avctx->pkt) frame->pkt_pts = avctx->pkt->pts;
     else            frame->pkt_pts = AV_NOPTS_VALUE;
-- 
cgit v1.2.3


From b2c75b6e6320b1a399d76913f9d98c56f386f98b Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 18 Dec 2011 13:20:15 -0500
Subject: avcodec: Add avcodec_encode_audio2() as replacement for
 avcodec_encode_audio()

This allows audio encoders to optionally take an AVFrame as input and write
encoded output to an AVPacket.

This also adds AVCodec.encode2() which will also be usable by video and
subtitle encoders once support is implemented in the public functions.
---
 libavcodec/avcodec.h  |  72 ++++++++++++++-
 libavcodec/internal.h |  25 +++++
 libavcodec/pcm.c      |   1 +
 libavcodec/utils.c    | 246 ++++++++++++++++++++++++++++++++++++++++++++++----
 libavcodec/version.h  |   3 +
 5 files changed, 328 insertions(+), 19 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index a5071aa438..be1b2021bd 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -741,6 +741,11 @@ typedef struct RcOverride{
  * Encoders:
  * The encoder needs to be fed with NULL data at the end of encoding until the
  * encoder no longer returns data.
+ *
+ * NOTE: For encoders implementing the AVCodec.encode2() function, setting this
+ *       flag also means that the encoder must set the pts and duration for
+ *       each output packet. If this flag is not set, the pts and duration will
+ *       be determined by libavcodec from the input frame.
  */
 #define CODEC_CAP_DELAY           0x0020
 /**
@@ -793,6 +798,10 @@ typedef struct RcOverride{
  * Codec supports avctx->thread_count == 0 (auto).
  */
 #define CODEC_CAP_AUTO_THREADS     0x8000
+/**
+ * Audio encoder supports receiving a different number of samples in each call.
+ */
+#define CODEC_CAP_VARIABLE_FRAME_SIZE 0x10000
 
 //The following defines may change, don't expect compatibility if you use them.
 #define MB_TYPE_INTRA4x4   0x0001
@@ -3246,6 +3255,19 @@ typedef struct AVCodec {
      * Initialize codec static data, called from avcodec_register().
      */
     void (*init_static_data)(struct AVCodec *codec);
+
+    /**
+     * Encode data to an AVPacket.
+     *
+     * @param      avctx          codec context
+     * @param      avpkt          output AVPacket (may contain a user-provided buffer)
+     * @param[in]  frame          AVFrame containing the raw data to be encoded
+     * @param[out] got_packet_ptr encoder sets to 0 or 1 to indicate that a
+     *                            non-empty packet was returned in avpkt.
+     * @return 0 on success, negative error code on failure
+     */
+    int (*encode2)(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame,
+                   int *got_packet_ptr);
 } AVCodec;
 
 /**
@@ -4213,9 +4235,12 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
  */
 void avsubtitle_free(AVSubtitle *sub);
 
+#if FF_API_OLD_ENCODE_AUDIO
 /**
  * Encode an audio frame from samples into buf.
  *
+ * @deprecated Use avcodec_encode_audio2 instead.
+ *
  * @note The output buffer should be at least FF_MIN_BUFFER_SIZE bytes large.
  * However, for codecs with avctx->frame_size equal to 0 (e.g. PCM) the user
  * will know how much space is needed because it depends on the value passed
@@ -4235,8 +4260,51 @@ void avsubtitle_free(AVSubtitle *sub);
  * @return On error a negative value is returned, on success zero or the number
  * of bytes used to encode the data read from the input buffer.
  */
-int avcodec_encode_audio(AVCodecContext *avctx, uint8_t *buf, int buf_size,
-                         const short *samples);
+int attribute_deprecated avcodec_encode_audio(AVCodecContext *avctx,
+                                              uint8_t *buf, int buf_size,
+                                              const short *samples);
+#endif
+
+/**
+ * Encode a frame of audio.
+ *
+ * Takes input samples from frame and writes the next output packet, if
+ * available, to avpkt. The output packet does not necessarily contain data for
+ * the most recent frame, as encoders can delay, split, and combine input frames
+ * internally as needed.
+ *
+ * @param avctx     codec context
+ * @param avpkt     output AVPacket.
+ *                  The user can supply an output buffer by setting
+ *                  avpkt->data and avpkt->size prior to calling the
+ *                  function, but if the size of the user-provided data is not
+ *                  large enough, encoding will fail. All other AVPacket fields
+ *                  will be reset by the encoder using av_init_packet(). If
+ *                  avpkt->data is NULL, the encoder will allocate it.
+ *                  The encoder will set avpkt->size to the size of the
+ *                  output packet.
+ * @param[in] frame AVFrame containing the raw audio data to be encoded.
+ *                  May be NULL when flushing an encoder that has the
+ *                  CODEC_CAP_DELAY capability set.
+ *                  There are 2 codec capabilities that affect the allowed
+ *                  values of frame->nb_samples.
+ *                  If CODEC_CAP_SMALL_LAST_FRAME is set, then only the final
+ *                  frame may be smaller than avctx->frame_size, and all other
+ *                  frames must be equal to avctx->frame_size.
+ *                  If CODEC_CAP_VARIABLE_FRAME_SIZE is set, then each frame
+ *                  can have any number of samples.
+ *                  If neither is set, frame->nb_samples must be equal to
+ *                  avctx->frame_size for all frames.
+ * @param[out] got_packet_ptr This field is set to 1 by libavcodec if the
+ *                            output packet is non-empty, and to 0 if it is
+ *                            empty. If the function returns an error, the
+ *                            packet can be assumed to be invalid, and the
+ *                            value of got_packet_ptr is undefined and should
+ *                            not be used.
+ * @return          0 on success, negative error code on failure
+ */
+int avcodec_encode_audio2(AVCodecContext *avctx, AVPacket *avpkt,
+                          const AVFrame *frame, int *got_packet_ptr);
 
 /**
  * Fill audio frame data and linesize.
diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index 1c2d0daaef..441430e41c 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -61,6 +61,14 @@ typedef struct AVCodecInternal {
      * should be freed from the original context only.
      */
     int is_copy;
+
+#if FF_API_OLD_ENCODE_AUDIO
+    /**
+     * Internal sample count used by avcodec_encode_audio() to fabricate pts.
+     * Can be removed along with avcodec_encode_audio().
+     */
+    int sample_count;
+#endif
 } AVCodecInternal;
 
 struct AVCodecDefault {
@@ -101,4 +109,21 @@ int avpriv_unlock_avformat(void);
  */
 #define FF_MAX_EXTRADATA_SIZE ((1 << 28) - FF_INPUT_BUFFER_PADDING_SIZE)
 
+/**
+ * Check AVPacket size and/or allocate data.
+ *
+ * Encoders supporting AVCodec.encode2() can use this as a convenience to
+ * ensure the output packet data is large enough, whether provided by the user
+ * or allocated in this function.
+ *
+ * @param avpkt   the AVPacket
+ *                If avpkt->data is already set, avpkt->size is checked
+ *                to ensure it is large enough.
+ *                If avpkt->data is NULL, a new buffer is allocated.
+ *                All other AVPacket fields will be reset with av_init_packet().
+ * @param size    the minimum required packet size
+ * @return        0 on success, negative error code on failure
+ */
+int ff_alloc_packet(AVPacket *avpkt, int size);
+
 #endif /* AVCODEC_INTERNAL_H */
diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c
index 32231125d2..190f027adc 100644
--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -474,6 +474,7 @@ AVCodec ff_ ## name_ ## _encoder = {            \
     .init        = pcm_encode_init,             \
     .encode      = pcm_encode_frame,            \
     .close       = pcm_encode_close,            \
+    .capabilities = CODEC_CAP_VARIABLE_FRAME_SIZE, \
     .sample_fmts = (const enum AVSampleFormat[]){sample_fmt_,AV_SAMPLE_FMT_NONE}, \
     .long_name = NULL_IF_CONFIG_SMALL(long_name_), \
 }
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index c3faa76169..ff3f065064 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -25,6 +25,7 @@
  * utils.
  */
 
+#include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 #include "libavutil/crc.h"
 #include "libavutil/mathematics.h"
@@ -101,6 +102,16 @@ void avcodec_init(void)
     dsputil_static_init();
 }
 
+static av_always_inline int codec_is_encoder(AVCodec *codec)
+{
+    return codec && (codec->encode || codec->encode2);
+}
+
+static av_always_inline int codec_is_decoder(AVCodec *codec)
+{
+    return codec && codec->decode;
+}
+
 void avcodec_register(AVCodec *codec)
 {
     AVCodec **p;
@@ -690,7 +701,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
 
     /* if the decoder init function was already called previously,
        free the already allocated subtitle_header before overwriting it */
-    if (codec->decode)
+    if (codec_is_decoder(codec))
         av_freep(&avctx->subtitle_header);
 
 #define SANE_NB_CHANNELS 128U
@@ -738,7 +749,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
         ret = AVERROR(EINVAL);
         goto free_and_end;
     }
-    if (avctx->codec->encode) {
+    if (codec_is_encoder(avctx->codec)) {
         int i;
         if (avctx->codec->sample_fmts) {
             for (i = 0; avctx->codec->sample_fmts[i] != AV_SAMPLE_FMT_NONE; i++)
@@ -812,20 +823,221 @@ free_and_end:
     goto end;
 }
 
-int attribute_align_arg avcodec_encode_audio(AVCodecContext *avctx, uint8_t *buf, int buf_size,
-                         const short *samples)
+int ff_alloc_packet(AVPacket *avpkt, int size)
 {
-    if(buf_size < FF_MIN_BUFFER_SIZE && 0){
-        av_log(avctx, AV_LOG_ERROR, "buffer smaller than minimum size\n");
-        return -1;
+    if (size > INT_MAX - FF_INPUT_BUFFER_PADDING_SIZE)
+        return AVERROR(EINVAL);
+
+    if (avpkt->data) {
+        uint8_t *pkt_data;
+        int pkt_size;
+
+        if (avpkt->size < size)
+            return AVERROR(EINVAL);
+
+        pkt_data = avpkt->data;
+        pkt_size = avpkt->size;
+        av_init_packet(avpkt);
+        avpkt->data = pkt_data;
+        avpkt->size = pkt_size;
+        return 0;
+    } else {
+        return av_new_packet(avpkt, size);
     }
-    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || samples){
-        int ret = avctx->codec->encode(avctx, buf, buf_size, samples);
-        avctx->frame_number++;
-        return ret;
-    }else
+}
+
+int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
+                                              AVPacket *avpkt,
+                                              const AVFrame *frame,
+                                              int *got_packet_ptr)
+{
+    int ret, nb_samples;
+    int user_packet = !!avpkt->data;
+
+    *got_packet_ptr = 0; /* defined on every return path, early exits too */
+
+    if (!(avctx->codec->capabilities & CODEC_CAP_DELAY) && !frame) {
+        av_init_packet(avpkt);
+        avpkt->size = 0;
         return 0;
+    }
+
+    /* check for valid frame size */
+    if (frame) {
+        nb_samples = frame->nb_samples;
+        if (avctx->codec->capabilities & CODEC_CAP_SMALL_LAST_FRAME) {
+            if (nb_samples > avctx->frame_size)
+                return AVERROR(EINVAL);
+        } else if (!(avctx->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)) {
+            if (nb_samples != avctx->frame_size)
+                return AVERROR(EINVAL);
+        }
+    } else {
+        nb_samples = avctx->frame_size;
+    }
+
+    if (avctx->codec->encode2) {
+        ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
+        if (!ret && *got_packet_ptr &&
+            !(avctx->codec->capabilities & CODEC_CAP_DELAY)) {
+            avpkt->pts = frame->pts;
+            avpkt->duration = av_rescale_q(frame->nb_samples,
+                                           (AVRational){ 1, avctx->sample_rate },
+                                           avctx->time_base);
+        }
+    } else {
+        /* for compatibility with encoders not supporting encode2(), we need to
+           allocate a packet buffer if the user has not provided one or check
+           the size otherwise */
+        int fs_tmp   = 0;
+        int buf_size = avpkt->size;
+        if (!user_packet) {
+            if (avctx->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE) {
+                av_assert0(av_get_bits_per_sample(avctx->codec_id) != 0);
+                buf_size = nb_samples * avctx->channels *
+                           av_get_bits_per_sample(avctx->codec_id) / 8;
+            } else {
+                /* this is a guess as to the required size.
+                   if an encoder needs more than this, it should probably
+                   implement encode2() */
+                buf_size = 2 * avctx->frame_size * avctx->channels *
+                           av_get_bytes_per_sample(avctx->sample_fmt);
+                buf_size += FF_MIN_BUFFER_SIZE;
+            }
+        }
+        if ((ret = ff_alloc_packet(avpkt, buf_size)))
+            return ret;
+
+        /* Encoders using AVCodec.encode() that support
+           CODEC_CAP_SMALL_LAST_FRAME require avctx->frame_size to be set to
+           the smaller size when encoding the last frame.
+           This code can be removed once all encoders supporting
+           CODEC_CAP_SMALL_LAST_FRAME use encode2() */
+        if ((avctx->codec->capabilities & CODEC_CAP_SMALL_LAST_FRAME) &&
+            nb_samples < avctx->frame_size) {
+            fs_tmp = avctx->frame_size;
+            avctx->frame_size = nb_samples;
+        }
+
+        /* encode the frame */
+        ret = avctx->codec->encode(avctx, avpkt->data, avpkt->size,
+                                   frame ? frame->data[0] : NULL);
+        if (ret >= 0) {
+            if (!ret) {
+                /* no output. if the packet data was allocated by libavcodec,
+                   free it */
+                if (!user_packet)
+                    av_freep(&avpkt->data);
+            } else {
+                if (avctx->coded_frame)
+                    avpkt->pts = avctx->coded_frame->pts;
+                /* Set duration for final small packet. This can be removed
+                   once all encoders supporting CODEC_CAP_SMALL_LAST_FRAME use
+                   encode2() */
+                if (fs_tmp) {
+                    avpkt->duration = av_rescale_q(avctx->frame_size,
+                                                   (AVRational){ 1, avctx->sample_rate },
+                                                   avctx->time_base);
+                }
+            }
+            avpkt->size = ret;
+            *got_packet_ptr = (ret > 0);
+            ret = 0;
+        }
+
+        if (fs_tmp)
+            avctx->frame_size = fs_tmp;
+    }
+    if (!ret)
+        avctx->frame_number++;
+
+    /* NOTE: if we add any audio encoders which output non-keyframe packets,
+             this needs to be moved to the encoders, but for now we can do it
+             here to simplify things */
+    avpkt->flags |= AV_PKT_FLAG_KEY;
+
+    return ret;
+}
+
+#if FF_API_OLD_ENCODE_AUDIO
+int attribute_align_arg avcodec_encode_audio(AVCodecContext *avctx,
+                                             uint8_t *buf, int buf_size,
+                                             const short *samples)
+{
+    AVPacket pkt;
+    AVFrame frame0;
+    AVFrame *frame;
+    int ret, samples_size, got_packet;
+
+    av_init_packet(&pkt);
+    pkt.data = buf;
+    pkt.size = buf_size;
+
+    if (samples) {
+        frame = &frame0;
+        avcodec_get_frame_defaults(frame);
+
+        if (avctx->frame_size) {
+            frame->nb_samples = avctx->frame_size;
+        } else {
+            /* if frame_size is not set, the number of samples must be
+               calculated from the buffer size */
+            int64_t nb_samples;
+            if (!av_get_bits_per_sample(avctx->codec_id)) {
+                av_log(avctx, AV_LOG_ERROR, "avcodec_encode_audio() does not "
+                       "support this codec\n");
+                return AVERROR(EINVAL);
+            }
+            nb_samples = (int64_t)buf_size * 8 /
+                         (av_get_bits_per_sample(avctx->codec_id) *
+                         avctx->channels);
+            if (nb_samples >= INT_MAX)
+                return AVERROR(EINVAL);
+            frame->nb_samples = nb_samples;
+        }
+
+        /* it is assumed that the samples buffer is large enough based on the
+           relevant parameters */
+        samples_size = av_samples_get_buffer_size(NULL, avctx->channels,
+                                                  frame->nb_samples,
+                                                  avctx->sample_fmt, 1);
+        ret = avcodec_fill_audio_frame(frame, avctx->channels, avctx->sample_fmt,
+                                       (const uint8_t *)samples, samples_size, 1);
+        if (ret)
+            return ret;
+
+        /* fabricate frame pts from sample count.
+           this is needed because the avcodec_encode_audio() API does not have
+           a way for the user to provide pts */
+        frame->pts = av_rescale_q(avctx->internal->sample_count,
+                                  (AVRational){ 1, avctx->sample_rate },
+                                  avctx->time_base);
+        avctx->internal->sample_count += frame->nb_samples;
+    } else {
+        frame = NULL;
+    }
+
+    got_packet = 0;
+    ret = avcodec_encode_audio2(avctx, &pkt, frame, &got_packet);
+    if (!ret && got_packet && avctx->coded_frame) {
+        avctx->coded_frame->pts       = pkt.pts;
+        avctx->coded_frame->key_frame = !!(pkt.flags & AV_PKT_FLAG_KEY);
+    }
+    /* free any side data since we cannot return it */
+    if (pkt.side_data_elems > 0) {
+        int i;
+        for (i = 0; i < pkt.side_data_elems; i++)
+            av_free(pkt.side_data[i].data);
+        av_freep(&pkt.side_data);
+        pkt.side_data_elems = 0;
+    }
+
+    if (frame && frame->extended_data != frame->data)
+        av_free(frame->extended_data);
+
+    return ret ? ret : pkt.size;
 }
+#endif
 
 int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf, int buf_size,
                          const AVFrame *pict)
@@ -1077,7 +1289,7 @@ av_cold int avcodec_close(AVCodecContext *avctx)
         av_opt_free(avctx->priv_data);
     av_opt_free(avctx);
     av_freep(&avctx->priv_data);
-    if(avctx->codec && avctx->codec->encode)
+    if (codec_is_encoder(avctx->codec))
         av_freep(&avctx->extradata);
     avctx->codec = NULL;
     avctx->active_thread_type = 0;
@@ -1095,7 +1307,7 @@ AVCodec *avcodec_find_encoder(enum CodecID id)
     AVCodec *p, *experimental=NULL;
     p = first_avcodec;
     while (p) {
-        if (p->encode != NULL && p->id == id) {
+        if (codec_is_encoder(p) && p->id == id) {
             if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) {
                 experimental = p;
             } else
@@ -1113,7 +1325,7 @@ AVCodec *avcodec_find_encoder_by_name(const char *name)
         return NULL;
     p = first_avcodec;
     while (p) {
-        if (p->encode != NULL && strcmp(name,p->name) == 0)
+        if (codec_is_encoder(p) && strcmp(name,p->name) == 0)
             return p;
         p = p->next;
     }
@@ -1125,7 +1337,7 @@ AVCodec *avcodec_find_decoder(enum CodecID id)
     AVCodec *p;
     p = first_avcodec;
     while (p) {
-        if (p->decode != NULL && p->id == id)
+        if (codec_is_decoder(p) && p->id == id)
             return p;
         p = p->next;
     }
@@ -1139,7 +1351,7 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
         return NULL;
     p = first_avcodec;
     while (p) {
-        if (p->decode != NULL && strcmp(name,p->name) == 0)
+        if (codec_is_decoder(p) && strcmp(name,p->name) == 0)
             return p;
         p = p->next;
     }
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 87838c08d4..0b7547f2bb 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -119,5 +119,8 @@
 #ifndef FF_API_AVFRAME_AGE
 #define FF_API_AVFRAME_AGE (LIBAVCODEC_VERSION_MAJOR < 54)
 #endif
+#ifndef FF_API_OLD_ENCODE_AUDIO
+#define FF_API_OLD_ENCODE_AUDIO (LIBAVCODEC_VERSION_MAJOR < 54)
+#endif
 
 #endif /* AVCODEC_VERSION_H */
-- 
cgit v1.2.3


From a6ccae3f4c17831d1e158c6cfeecbcf92f750b03 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 18 Dec 2011 19:47:38 -0500
Subject: avcodec: bump minor version and add APIChanges for the new audio
 encoding API

---
 doc/APIchanges       | 8 ++++++++
 libavcodec/version.h | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'libavcodec')

diff --git a/doc/APIchanges b/doc/APIchanges
index 751566afe2..2d7832430f 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -13,6 +13,14 @@ libavutil:   2011-04-18
 
 API changes, most recent first:
 
+2012-xx-xx - lavc 53.34.0
+  New audio encoding API:
+  xxxxxxx Add CODEC_CAP_VARIABLE_FRAME_SIZE capability for use by audio
+          encoders.
+  xxxxxxx Add avcodec_fill_audio_frame() as a convenience function.
+  xxxxxxx Add avcodec_encode_audio2() and deprecate avcodec_encode_audio().
+          Add AVCodec.encode2().
+
 2012-01-xx - xxxxxxx - lavfi 2.15.0
   Add a new installed header -- libavfilter/version.h -- with version macros.
 
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 0b7547f2bb..c7b4c15b7a 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -21,7 +21,7 @@
 #define AVCODEC_VERSION_H
 
 #define LIBAVCODEC_VERSION_MAJOR 53
-#define LIBAVCODEC_VERSION_MINOR 33
+#define LIBAVCODEC_VERSION_MINOR 34
 #define LIBAVCODEC_VERSION_MICRO  0
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
-- 
cgit v1.2.3


From 05f95443cac79eba959d7a1a919d77e8f401ea6f Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 8 Jan 2012 17:37:16 -0500
Subject: pcmenc: use AVCodec.encode2()

---
 libavcodec/pcm.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c
index 190f027adc..1adaf70318 100644
--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -27,6 +27,7 @@
 #include "avcodec.h"
 #include "libavutil/common.h" /* for av_reverse */
 #include "bytestream.h"
+#include "internal.h"
 #include "pcm_tablegen.h"
 
 #define MAX_CHANNELS 64
@@ -77,10 +78,10 @@ static av_cold int pcm_encode_close(AVCodecContext *avctx)
         bytestream_put_##endian(&dst, v); \
     }
 
-static int pcm_encode_frame(AVCodecContext *avctx,
-                            unsigned char *frame, int buf_size, void *data)
+static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+                            const AVFrame *frame, int *got_packet_ptr)
 {
-    int n, sample_size, v;
+    int n, sample_size, v, ret;
     const short *samples;
     unsigned char *dst;
     const uint8_t *srcu8;
@@ -91,9 +92,14 @@ static int pcm_encode_frame(AVCodecContext *avctx,
     const uint32_t *samples_uint32_t;
 
     sample_size = av_get_bits_per_sample(avctx->codec->id)/8;
-    n = buf_size / sample_size;
-    samples = data;
-    dst = frame;
+    n           = frame->nb_samples * avctx->channels;
+    samples     = (const short *)frame->data[0];
+
+    if ((ret = ff_alloc_packet(avpkt, n * sample_size))) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+        return ret;
+    }
+    dst = avpkt->data;
 
     switch(avctx->codec->id) {
     case CODEC_ID_PCM_U32LE:
@@ -130,7 +136,7 @@ static int pcm_encode_frame(AVCodecContext *avctx,
         ENCODE(uint16_t, be16, samples, dst, n, 0, 0x8000)
         break;
     case CODEC_ID_PCM_S8:
-        srcu8= data;
+        srcu8 = frame->data[0];
         for(;n>0;n--) {
             v = *srcu8++;
             *dst++ = v - 128;
@@ -186,9 +192,10 @@ static int pcm_encode_frame(AVCodecContext *avctx,
     default:
         return -1;
     }
-    //avctx->frame_size = (dst - frame) / (sample_size * avctx->channels);
 
-    return dst - frame;
+    avpkt->size = frame->nb_samples * avctx->channels * sample_size;
+    *got_packet_ptr = 1;
+    return 0;
 }
 
 typedef struct PCMDecode {
@@ -472,7 +479,7 @@ AVCodec ff_ ## name_ ## _encoder = {            \
     .type        = AVMEDIA_TYPE_AUDIO,          \
     .id          = id_,                         \
     .init        = pcm_encode_init,             \
-    .encode      = pcm_encode_frame,            \
+    .encode2     = pcm_encode_frame,            \
     .close       = pcm_encode_close,            \
     .capabilities = CODEC_CAP_VARIABLE_FRAME_SIZE, \
     .sample_fmts = (const enum AVSampleFormat[]){sample_fmt_,AV_SAMPLE_FMT_NONE}, \
-- 
cgit v1.2.3


From d859191322d8596b6d4219269db96456267f0d04 Mon Sep 17 00:00:00 2001
From: Laurentiu Ion <ionlaurentiucristian@gmail.com>
Date: Mon, 16 Jan 2012 04:47:07 +0200
Subject: pictordec: Use bytestream2 functions

Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
---
 libavcodec/pictordec.c | 99 ++++++++++++++++++++++++++++----------------------
 1 file changed, 56 insertions(+), 43 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/pictordec.c b/libavcodec/pictordec.c
index 732583ec46..e0bc899946 100644
--- a/libavcodec/pictordec.c
+++ b/libavcodec/pictordec.c
@@ -33,6 +33,7 @@ typedef struct PicContext {
     AVFrame frame;
     int width, height;
     int nb_planes;
+    GetByteContext g;
 } PicContext;
 
 static void picmemset_8bpp(PicContext *s, int value, int run, int *x, int *y)
@@ -55,7 +56,8 @@ static void picmemset_8bpp(PicContext *s, int value, int run, int *x, int *y)
     }
 }
 
-static void picmemset(PicContext *s, int value, int run, int *x, int *y, int *plane, int bits_per_plane)
+static void picmemset(PicContext *s, int value, int run,
+                      int *x, int *y, int *plane, int bits_per_plane)
 {
     uint8_t *d;
     int shift = *plane * bits_per_plane;
@@ -99,34 +101,35 @@ static int decode_frame(AVCodecContext *avctx,
                         AVPacket *avpkt)
 {
     PicContext *s = avctx->priv_data;
-    int buf_size = avpkt->size;
-    const uint8_t *buf = avpkt->data;
-    const uint8_t *buf_end = avpkt->data + buf_size;
     uint32_t *palette;
-    int bits_per_plane, bpp, etype, esize, npal;
-    int i, x, y, plane;
+    int bits_per_plane, bpp, etype, esize, npal, pos_after_pal;
+    int i, x, y, plane, tmp;
 
-    if (buf_size < 11)
+    bytestream2_init(&s->g, avpkt->data, avpkt->size);
+
+    if (bytestream2_get_bytes_left(&s->g) < 11)
         return AVERROR_INVALIDDATA;
 
-    if (bytestream_get_le16(&buf) != 0x1234)
+    if (bytestream2_get_le16u(&s->g) != 0x1234)
         return AVERROR_INVALIDDATA;
-    s->width  = bytestream_get_le16(&buf);
-    s->height = bytestream_get_le16(&buf);
-    buf += 4;
-    bits_per_plane    = *buf & 0xF;
-    s->nb_planes      = (*buf++ >> 4) + 1;
-    bpp               = s->nb_planes ? bits_per_plane*s->nb_planes : bits_per_plane;
+
+    s->width       = bytestream2_get_le16u(&s->g);
+    s->height      = bytestream2_get_le16u(&s->g);
+    bytestream2_skip(&s->g, 4);
+    tmp            = bytestream2_get_byteu(&s->g);
+    bits_per_plane = tmp & 0xF;
+    s->nb_planes   = (tmp >> 4) + 1;
+    bpp            = bits_per_plane * s->nb_planes;
     if (bits_per_plane > 8 || bpp < 1 || bpp > 32) {
         av_log_ask_for_sample(s, "unsupported bit depth\n");
         return AVERROR_INVALIDDATA;
     }
 
-    if (*buf == 0xFF) {
-        buf += 2;
-        etype  = bytestream_get_le16(&buf);
-        esize  = bytestream_get_le16(&buf);
-        if (buf_end - buf < esize)
+    if (bytestream2_peek_byte(&s->g) == 0xFF) {
+        bytestream2_skip(&s->g, 2);
+        etype = bytestream2_get_le16(&s->g);
+        esize = bytestream2_get_le16(&s->g);
+        if (bytestream2_get_bytes_left(&s->g) < esize)
             return AVERROR_INVALIDDATA;
     } else {
         etype = -1;
@@ -151,24 +154,29 @@ static int decode_frame(AVCodecContext *avctx,
     s->frame.pict_type           = AV_PICTURE_TYPE_I;
     s->frame.palette_has_changed = 1;
 
+    pos_after_pal = bytestream2_tell(&s->g) + esize;
     palette = (uint32_t*)s->frame.data[1];
-    if (etype == 1 && esize > 1 && *buf < 6) {
-        int idx = *buf;
+    if (etype == 1 && esize > 1 && bytestream2_peek_byte(&s->g) < 6) {
+        int idx = bytestream2_get_byte(&s->g);
         npal = 4;
         for (i = 0; i < npal; i++)
             palette[i] = ff_cga_palette[ cga_mode45_index[idx][i] ];
     } else if (etype == 2) {
         npal = FFMIN(esize, 16);
-        for (i = 0; i < npal; i++)
-            palette[i] = ff_cga_palette[ FFMIN(buf[i], 16)];
+        for (i = 0; i < npal; i++) {
+            int pal_idx = bytestream2_get_byte(&s->g);
+            palette[i]  = ff_cga_palette[FFMIN(pal_idx, 16)];
+        }
     } else if (etype == 3) {
         npal = FFMIN(esize, 16);
-        for (i = 0; i < npal; i++)
-            palette[i] = ff_ega_palette[ FFMIN(buf[i], 63)];
+        for (i = 0; i < npal; i++) {
+            int pal_idx = bytestream2_get_byte(&s->g);
+            palette[i]  = ff_ega_palette[FFMIN(pal_idx, 63)];
+        }
     } else if (etype == 4 || etype == 5) {
         npal = FFMIN(esize / 3, 256);
         for (i = 0; i < npal; i++)
-            palette[i] = AV_RB24(buf + i*3) << 2;
+            palette[i] = bytestream2_get_be24(&s->g) << 2;
     } else {
         if (bpp == 1) {
             npal = 2;
@@ -185,29 +193,34 @@ static int decode_frame(AVCodecContext *avctx,
     }
     // fill remaining palette entries
     memset(palette + npal, 0, AVPALETTE_SIZE - npal * 4);
-    buf += esize;
-
+    // skip remaining palette bytes
+    bytestream2_seek(&s->g, pos_after_pal, SEEK_SET);
 
     x = 0;
     y = s->height - 1;
     plane = 0;
-    if (bytestream_get_le16(&buf)) {
-        while (buf_end - buf >= 6) {
-            const uint8_t *buf_pend = buf + FFMIN(AV_RL16(buf), buf_end - buf);
-            //ignore uncompressed block size reported at buf[2]
-            int marker = buf[4];
-            buf += 5;
-
-            while (plane < s->nb_planes && buf_pend - buf >= 1) {
+    if (bytestream2_get_le16(&s->g)) {
+        while (bytestream2_get_bytes_left(&s->g) >= 6) {
+            int stop_size, marker, t1, t2;
+
+            t1        = bytestream2_get_bytes_left(&s->g);
+            t2        = bytestream2_get_le16(&s->g);
+            stop_size = t1 - FFMIN(t1, t2);
+            // ignore uncompressed block size
+            bytestream2_skip(&s->g, 2);
+            marker    = bytestream2_get_byte(&s->g);
+
+            while (plane < s->nb_planes &&
+                   bytestream2_get_bytes_left(&s->g) > stop_size) {
                 int run = 1;
-                int val = *buf++;
+                int val = bytestream2_get_byte(&s->g);
                 if (val == marker) {
-                    run = *buf++;
+                    run = bytestream2_get_byte(&s->g);
                     if (run == 0)
-                        run = bytestream_get_le16(&buf);
-                    val = *buf++;
+                        run = bytestream2_get_le16(&s->g);
+                    val = bytestream2_get_byte(&s->g);
                 }
-                if (buf > buf_end)
+                if (!bytestream2_get_bytes_left(&s->g))
                     break;
 
                 if (bits_per_plane == 8) {
@@ -221,12 +234,12 @@ static int decode_frame(AVCodecContext *avctx,
         }
     } else {
         av_log_ask_for_sample(s, "uncompressed image\n");
-        return buf_size;
+        return avpkt->size;
     }
 
     *data_size = sizeof(AVFrame);
     *(AVFrame*)data = s->frame;
-    return buf_size;
+    return avpkt->size;
 }
 
 static av_cold int decode_end(AVCodecContext *avctx)
-- 
cgit v1.2.3


From 9ba9c3402499d90e54f8aa111b62c278206d11af Mon Sep 17 00:00:00 2001
From: Christophe GISQUET <christophe.gisquet@gmail.com>
Date: Tue, 3 Jan 2012 20:38:29 +0100
Subject: rv34: 1-pass inter MB reconstruction

Implement 1-pass inverse transform and reconstruction for inter blocks.
---
 libavcodec/arm/rv34dsp_init_neon.c |   8 +-
 libavcodec/arm/rv34dsp_neon.S      |  59 ++--------
 libavcodec/rv34.c                  | 227 ++++++++++++++++---------------------
 libavcodec/rv34dsp.c               |  62 +++-------
 libavcodec/rv34dsp.h               |   7 +-
 libavcodec/x86/rv34dsp.asm         |   4 +-
 libavcodec/x86/rv34dsp_init.c      |   3 +-
 7 files changed, 130 insertions(+), 240 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 16bda46658..3984d43c39 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -23,16 +23,12 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/rv34dsp.h"
 
-void ff_rv34_inv_transform_neon(DCTELEM *block);
 void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
 
-void ff_rv34_inv_transform_dc_neon(DCTELEM *block);
 void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
 
 void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
 {
-    c->rv34_inv_transform_tab[0]    = ff_rv34_inv_transform_neon;
-    c->rv34_inv_transform_tab[1]    = ff_rv34_inv_transform_noround_neon;
-    c->rv34_inv_transform_dc_tab[0] = ff_rv34_inv_transform_dc_neon;
-    c->rv34_inv_transform_dc_tab[1] = ff_rv34_inv_transform_noround_dc_neon;
+    c->rv34_inv_transform    = ff_rv34_inv_transform_noround_neon;
+    c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
 }
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index 1e8d4b49a1..a156412d01 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -21,11 +21,7 @@
 #include "asm.S"
 
 .macro rv34_inv_transform
-        mov             r1,  #16
-        vld1.16         {d28}, [r0,:64], r1     @ block[i+8*0]
-        vld1.16         {d29}, [r0,:64], r1     @ block[i+8*1]
-        vld1.16         {d30}, [r0,:64], r1     @ block[i+8*2]
-        vld1.16         {d31}, [r0,:64], r1     @ block[i+8*3]
+        vld1.16         {q14-q15}, [r0,:128]
         vmov.s16        d0,  #13
         vshll.s16       q12, d29, #3
         vshll.s16       q13, d29, #4
@@ -35,12 +31,12 @@
         vmlal.s16       q10, d30, d0
         vmull.s16       q11, d28, d0
         vmlsl.s16       q11, d30, d0
-        vsubw.s16       q12, q12, d29   @ z2 = block[i+8*1]*7
-        vaddw.s16       q13, q13, d29   @ z3 = block[i+8*1]*17
+        vsubw.s16       q12, q12, d29   @ z2 = block[i+4*1]*7
+        vaddw.s16       q13, q13, d29   @ z3 = block[i+4*1]*17
         vsubw.s16       q9,  q9,  d31
         vaddw.s16       q1,  q1,  d31
-        vadd.s32        q13, q13, q9    @ z3 = 17*block[i+8*1] +  7*block[i+8*3]
-        vsub.s32        q12, q12, q1    @ z2 = 7*block[i+8*1]  - 17*block[i+8*3]
+        vadd.s32        q13, q13, q9    @ z3 = 17*block[i+4*1] +  7*block[i+4*3]
+        vsub.s32        q12, q12, q1    @ z2 = 7*block[i+4*1]  - 17*block[i+4*3]
         vadd.s32        q1,  q10, q13   @ z0 + z3
         vadd.s32        q2,  q11, q12   @ z1 + z2
         vsub.s32        q8,  q10, q13   @ z0 - z3
@@ -70,24 +66,8 @@
         vsub.s32        q15, q14, q9    @ z0 - z3
 .endm
 
-/* void ff_rv34_inv_transform_neon(DCTELEM *block); */
-function ff_rv34_inv_transform_neon, export=1
-        mov             r2,  r0
-        rv34_inv_transform
-        vrshrn.s32      d1,  q2,  #10   @ (z1 + z2) >> 10
-        vrshrn.s32      d0,  q1,  #10   @ (z0 + z3) >> 10
-        vrshrn.s32      d2,  q3,  #10   @ (z1 - z2) >> 10
-        vrshrn.s32      d3,  q15, #10   @ (z0 - z3) >> 10
-        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1
-        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1
-        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1
-        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
-        bx              lr
-endfunc
-
 /* void rv34_inv_transform_noround_neon(DCTELEM *block); */
 function ff_rv34_inv_transform_noround_neon, export=1
-        mov             r2,  r0
         rv34_inv_transform
         vshl.s32        q11, q2,  #1
         vshl.s32        q10, q1,  #1
@@ -101,38 +81,23 @@ function ff_rv34_inv_transform_noround_neon, export=1
         vshrn.s32       d1,  q11, #11   @ (z1 + z2)*3 >> 11
         vshrn.s32       d2,  q12, #11   @ (z1 - z2)*3 >> 11
         vshrn.s32       d3,  q13, #11   @ (z0 - z3)*3 >> 11
-        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1
-        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1
-        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1
-        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
+        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
+        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
+        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
+        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
         bx              lr
 endfunc
 
-/* void rv34_inv_transform_dc_c(DCTELEM *block) */
-function ff_rv34_inv_transform_dc_neon, export=1
-        vld1.16         {d28[]}, [r0,:16]       @ block[0]
-        vmov.i16        d4,  #169
-        mov             r1,  #16
-        vmull.s16       q3,  d28, d4
-        vrshrn.s32      d0,  q3,  #10
-        vst1.16         {d0}, [r0,:64], r1
-        vst1.16         {d0}, [r0,:64], r1
-        vst1.16         {d0}, [r0,:64], r1
-        vst1.16         {d0}, [r0,:64], r1
-        bx              lr
-endfunc
 
 /* void rv34_inv_transform_dc_noround_c(DCTELEM *block) */
 function ff_rv34_inv_transform_noround_dc_neon, export=1
         vld1.16         {d28[]}, [r0,:16]       @ block[0]
         vmov.i16        d4,  #251
         vorr.s16        d4,  #256               @ 13^2 * 3
-        mov             r1,  #16
         vmull.s16       q3,  d28, d4
         vshrn.s32       d0,  q3,  #11
-        vst1.64         {d0}, [r0,:64], r1
-        vst1.64         {d0}, [r0,:64], r1
-        vst1.64         {d0}, [r0,:64], r1
-        vst1.64         {d0}, [r0,:64], r1
+        vmov.i16        d1,  d0
+        vst1.64         {q0}, [r0,:128]!
+        vst1.64         {q0}, [r0,:128]!
         bx              lr
 endfunc
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 478c93ada0..e6af0793d3 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -240,15 +240,15 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2,
 {
     int flags = modulo_three_table[code];
 
-    decode_coeff(    dst+0, (flags >> 6)    , 3, gb, vlc, q);
+    decode_coeff(    dst+0*4+0, (flags >> 6)    , 3, gb, vlc, q);
     if(is_block2){
-        decode_coeff(dst+8, (flags >> 4) & 3, 2, gb, vlc, q);
-        decode_coeff(dst+1, (flags >> 2) & 3, 2, gb, vlc, q);
+        decode_coeff(dst+1*4+0, (flags >> 4) & 3, 2, gb, vlc, q);
+        decode_coeff(dst+0*4+1, (flags >> 2) & 3, 2, gb, vlc, q);
     }else{
-        decode_coeff(dst+1, (flags >> 4) & 3, 2, gb, vlc, q);
-        decode_coeff(dst+8, (flags >> 2) & 3, 2, gb, vlc, q);
+        decode_coeff(dst+0*4+1, (flags >> 4) & 3, 2, gb, vlc, q);
+        decode_coeff(dst+1*4+0, (flags >> 2) & 3, 2, gb, vlc, q);
     }
-    decode_coeff(    dst+9, (flags >> 0) & 3, 2, gb, vlc, q);
+    decode_coeff(    dst+1*4+1, (flags >> 0) & 3, 2, gb, vlc, q);
 }
 
 /**
@@ -265,15 +265,15 @@ static inline void decode_subblock3(DCTELEM *dst, int code, const int is_block2,
 {
     int flags = modulo_three_table[code];
 
-    decode_coeff(    dst+0, (flags >> 6)    , 3, gb, vlc, q_dc);
+    decode_coeff(    dst+0*4+0, (flags >> 6)    , 3, gb, vlc, q_dc);
     if(is_block2){
-        decode_coeff(dst+8, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
-        decode_coeff(dst+1, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
+        decode_coeff(dst+1*4+0, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
+        decode_coeff(dst+0*4+1, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
     }else{
-        decode_coeff(dst+1, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
-        decode_coeff(dst+8, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
+        decode_coeff(dst+0*4+1, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
+        decode_coeff(dst+1*4+0, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
     }
-    decode_coeff(    dst+9, (flags >> 0) & 3, 2, gb, vlc, q_ac2);
+    decode_coeff(    dst+1*4+1, (flags >> 0) & 3, 2, gb, vlc, q_ac2);
 }
 
 /**
@@ -308,15 +308,15 @@ static inline int rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rv
 
     if(pattern & 4){
         code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2);
-        decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient, q_ac2);
+        decode_subblock(dst + 4*0+2, code, 0, gb, &rvlc->coefficient, q_ac2);
     }
     if(pattern & 2){ // Looks like coefficients 1 and 2 are swapped for this block
         code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2);
-        decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient, q_ac2);
+        decode_subblock(dst + 4*2+0, code, 1, gb, &rvlc->coefficient, q_ac2);
     }
     if(pattern & 1){
         code = get_vlc2(gb, rvlc->third_pattern[sc].table, 9, 2);
-        decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient, q_ac2);
+        decode_subblock(dst + 4*2+2, code, 0, gb, &rvlc->coefficient, q_ac2);
     }
     return has_ac || pattern;
 }
@@ -998,11 +998,26 @@ static inline int adjust_pred16(int itype, int up, int left)
     return itype;
 }
 
+static inline void rv34_process_block(RV34DecContext *r,
+                                      uint8_t *pdst, int stride,
+                                      int fc, int sc, int q_dc, int q_ac)
+{
+    MpegEncContext *s = &r->s;
+    DCTELEM *ptr = s->block[0];
+    int has_ac = rv34_decode_block(ptr, &s->gb, r->cur_vlcs,
+                                   fc, sc, q_dc, q_ac, q_ac);
+    if(has_ac){
+        r->rdsp.rv34_idct_add(pdst, stride, ptr);
+    }else{
+        r->rdsp.rv34_idct_dc_add(pdst, stride, ptr[0]);
+        ptr[0] = 0;
+    }
+}
+
 static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
 {
-    LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
+    LOCAL_ALIGNED_16(DCTELEM, block16, [16]);
     MpegEncContext *s    = &r->s;
-    DSPContext     *dsp  = &s->dsp;
     GetBitContext  *gb   = &s->gb;
     int             q_dc = rv34_qscale_tab[ r->luma_dc_quant_i[s->qscale] ],
                     q_ac = rv34_qscale_tab[s->qscale];
@@ -1011,7 +1026,7 @@ static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
     int       avail[6*8] = {0};
     int i, j, itype, has_ac;
 
-    memset(block16, 0, 64 * sizeof(*block16));
+    memset(block16, 0, 16 * sizeof(*block16));
 
     // Set neighbour information.
     if(r->avail_cache[1])
@@ -1029,18 +1044,17 @@ static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
 
     has_ac = rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac);
     if(has_ac)
-        r->rdsp.rv34_inv_transform_tab[1](block16);
+        r->rdsp.rv34_inv_transform(block16);
     else
-        r->rdsp.rv34_inv_transform_dc_tab[1](block16);
+        r->rdsp.rv34_inv_transform_dc(block16);
 
     itype = ittrans16[intra_types[0]];
     itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
     r->h.pred16x16[itype](dst, s->linesize);
 
-    dsp->clear_block(ptr);
     for(j = 0; j < 4; j++){
         for(i = 0; i < 4; i++, cbp >>= 1){
-            int dc = block16[i + j*8];
+            int dc = block16[i + j*4];
 
             if(cbp & 1){
                 has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
@@ -1050,7 +1064,6 @@ static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
             if(has_ac){
                 ptr[0] = dc;
                 r->rdsp.rv34_idct_add(dst+4*i, s->linesize, ptr);
-                dsp->clear_block(ptr);
             }else
                 r->rdsp.rv34_idct_dc_add(dst+4*i, s->linesize, dc);
         }
@@ -1073,14 +1086,8 @@ static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
             if(!(cbp & 1)) continue;
             pdst   = dst + (i&1)*4 + (i&2)*2*s->uvlinesize;
 
-            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac);
-            if(has_ac){
-                r->rdsp.rv34_idct_add(pdst, s->uvlinesize, ptr);
-                dsp->clear_block(ptr);
-            }else{
-                r->rdsp.rv34_idct_dc_add(pdst, s->uvlinesize, ptr[0]);
-                ptr[0] = 0;
-            }
+            rv34_process_block(r, pdst, s->uvlinesize,
+                               r->chroma_vlc, 1, q_dc, q_ac);
         }
     }
 }
@@ -1088,14 +1095,10 @@ static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
 static void rv34_output_intra(RV34DecContext *r, int8_t *intra_types, int cbp)
 {
     MpegEncContext *s   = &r->s;
-    DSPContext     *dsp = &s->dsp;
-    GetBitContext  *gb  = &s->gb;
-    DCTELEM        *ptr = s->block[0];
     uint8_t        *dst = s->dest[0];
     int      avail[6*8] = {0};
     int i, j, k;
-    int idx, has_ac;
-    int q_ac, q_dc;
+    int idx, q_ac, q_dc;
 
     // Set neighbour information.
     if(r->avail_cache[1])
@@ -1119,14 +1122,8 @@ static void rv34_output_intra(RV34DecContext *r, int8_t *intra_types, int cbp)
             avail[idx] = 1;
             if(!(cbp & 1)) continue;
 
-            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
-            if(has_ac){
-                r->rdsp.rv34_idct_add(dst, s->linesize, ptr);
-                dsp->clear_block(ptr);
-            }else{
-                r->rdsp.rv34_idct_dc_add(dst, s->linesize, ptr[0]);
-                ptr[0] = 0;
-            }
+            rv34_process_block(r, dst, s->linesize,
+                               r->luma_vlc, 0, q_ac, q_ac);
         }
         dst += s->linesize * 4 - 4*4;
         intra_types += r->intra_types_stride;
@@ -1150,15 +1147,8 @@ static void rv34_output_intra(RV34DecContext *r, int8_t *intra_types, int cbp)
 
                 if(!(cbp&1)) continue;
 
-                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac);
-                if(has_ac){
-                    r->rdsp.rv34_idct_add(dst + 4*i, s->uvlinesize, ptr);
-                    dsp->clear_block(ptr);
-                }
-                else {
-                    r->rdsp.rv34_idct_dc_add(dst + 4*i, s->uvlinesize, ptr[0]);
-                    ptr[0] = 0;
-                }
+                rv34_process_block(r, dst + 4*i, s->uvlinesize,
+                                   r->chroma_vlc, 1, q_dc, q_ac);
             }
 
             dst += 4*s->uvlinesize;
@@ -1166,33 +1156,6 @@ static void rv34_output_intra(RV34DecContext *r, int8_t *intra_types, int cbp)
     }
 }
 
-/**
- * mask for retrieving all bits in coded block pattern
- * corresponding to one 8x8 block
- */
-#define LUMA_CBP_BLOCK_MASK 0x33
-
-#define U_CBP_MASK 0x0F0000
-#define V_CBP_MASK 0xF00000
-
-/** @} */ // recons group
-
-
-static void rv34_apply_differences(RV34DecContext *r, int cbp)
-{
-    static const int shifts[4] = { 0, 2, 8, 10 };
-    MpegEncContext *s = &r->s;
-    int i;
-
-    for(i = 0; i < 4; i++)
-        if((cbp & (LUMA_CBP_BLOCK_MASK << shifts[i])) || r->block_type == RV34_MB_P_MIX16x16)
-            s->dsp.add_pixels_clamped(s->block[i], s->dest[0] + (i & 1)*8 + (i&2)*4*s->linesize, s->linesize);
-    if(cbp & U_CBP_MASK)
-        s->dsp.add_pixels_clamped(s->block[4], s->dest[1], s->uvlinesize);
-    if(cbp & V_CBP_MASK)
-        s->dsp.add_pixels_clamped(s->block[5], s->dest[2], s->uvlinesize);
-}
-
 static int is_mv_diff_gt_3(int16_t (*motion_val)[2], int step)
 {
     int d;
@@ -1237,14 +1200,15 @@ static int rv34_set_deblock_coef(RV34DecContext *r)
 
 static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
 {
-    MpegEncContext *s = &r->s;
-    GetBitContext *gb = &s->gb;
+    MpegEncContext *s   = &r->s;
+    GetBitContext  *gb  = &s->gb;
+    uint8_t        *dst = s->dest[0];
+    DCTELEM        *ptr = s->block[0];
+    int          mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int cbp, cbp2;
     int q_dc, q_ac, has_ac;
-    int i, blknum, blkoff;
-    LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
+    int i, j;
     int dist;
-    int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
 
     // Calculate which neighbours are available. Maybe it's worth optimizing too.
     memset(r->avail_cache, 0, sizeof(r->avail_cache));
@@ -1278,64 +1242,66 @@ static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
     }
 
     if(r->is16){
-        int luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16
-                          ? r->luma_dc_quant_p[s->qscale]
-                          : r->luma_dc_quant_i[s->qscale];
-        q_dc = rv34_qscale_tab[luma_dc_quant];
+        // Only for RV34_MB_P_MIX16x16
+        LOCAL_ALIGNED_16(DCTELEM, block16, [16]);
+        memset(block16, 0, 16 * sizeof(*block16));
+        q_dc = rv34_qscale_tab[ r->luma_dc_quant_p[s->qscale] ];
         q_ac = rv34_qscale_tab[s->qscale];
-        s->dsp.clear_block(block16);
         if (rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac))
-            r->rdsp.rv34_inv_transform_tab[1](block16);
+            r->rdsp.rv34_inv_transform(block16);
         else
-            r->rdsp.rv34_inv_transform_dc_tab[1](block16);
+            r->rdsp.rv34_inv_transform_dc(block16);
 
         q_ac = rv34_qscale_tab[s->qscale];
-        for(i = 0; i < 16; i++, cbp >>= 1){
-            DCTELEM *ptr;
-            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
-            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
-            ptr    = s->block[blknum] + blkoff;
-            if(cbp & 1)
-                has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
-            else
-                has_ac = 0;
-            ptr[0] = block16[(i & 3) | ((i & 0xC) << 1)];
-            if(has_ac)
-                r->rdsp.rv34_inv_transform_tab[0](ptr);
-            else
-                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+
+        for(j = 0; j < 4; j++){
+            for(i = 0; i < 4; i++, cbp >>= 1){
+                int      dc   = block16[i + j*4];
+
+                if(cbp & 1){
+                    has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+                }else
+                    has_ac = 0;
+
+                if(has_ac){
+                    ptr[0] = dc;
+                    r->rdsp.rv34_idct_add(dst+4*i, s->linesize, ptr);
+                }else
+                    r->rdsp.rv34_idct_dc_add(dst+4*i, s->linesize, dc);
+            }
+
+            dst += 4*s->linesize;
         }
+
+        r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1);
     }else{
         q_ac = rv34_qscale_tab[s->qscale];
-        for(i = 0; i < 16; i++, cbp >>= 1){
-            DCTELEM *ptr;
-            if(!(cbp & 1)) continue;
-            blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
-            blkoff = ((i & 1) << 2) + ((i & 4) << 3);
-            ptr    = s->block[blknum] + blkoff;
-            has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
-            if(has_ac)
-                r->rdsp.rv34_inv_transform_tab[0](ptr);
-            else
-                r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+
+        for(j = 0; j < 4; j++){
+            for(i = 0; i < 4; i++, cbp >>= 1){
+                if(!(cbp & 1)) continue;
+
+                rv34_process_block(r, dst + 4*i, s->linesize,
+                                   r->luma_vlc, 0, q_ac, q_ac);
+            }
+            dst += 4*s->linesize;
         }
     }
-    if(r->block_type == RV34_MB_P_MIX16x16)
-        r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1);
+
     q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
     q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
-    for(; i < 24; i++, cbp >>= 1){
-        DCTELEM *ptr;
-        if(!(cbp & 1)) continue;
-        blknum = ((i & 4) >> 2) + 4;
-        blkoff = ((i & 1) << 2) + ((i & 2) << 4);
-        ptr    = s->block[blknum] + blkoff;
-        if (rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac))
-            r->rdsp.rv34_inv_transform_tab[0](ptr);
-        else
-            r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+
+    for(j = 1; j < 3; j++){
+        dst = s->dest[j];
+        for(i = 0; i < 4; i++, cbp >>= 1){
+            uint8_t *pdst;
+            if(!(cbp & 1)) continue;
+            pdst = dst + (i&1)*4 + (i&2)*2*s->uvlinesize;
+
+            rv34_process_block(r, pdst, s->uvlinesize,
+                               r->chroma_vlc, 1, q_dc, q_ac);
+        }
     }
-    rv34_apply_differences(r, cbp2);
 
     return 0;
 }
@@ -1487,7 +1453,6 @@ static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int
     ff_init_block_index(s);
     while(!check_slice_end(r, s)) {
         ff_update_block_index(s);
-        s->dsp.clear_blocks(s->block[0]);
 
         if(r->si.type)
             res = rv34_decode_inter_macroblock(r, r->intra_types + s->mb_x * 4 + 4);
diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c
index 91c455a024..e2251773af 100644
--- a/libavcodec/rv34dsp.c
+++ b/libavcodec/rv34dsp.c
@@ -32,15 +32,15 @@
  * @{
  */
 
-static av_always_inline void rv34_row_transform(int temp[16], const DCTELEM *block)
+static av_always_inline void rv34_row_transform(int temp[16], DCTELEM *block)
 {
     int i;
 
     for(i = 0; i < 4; i++){
-        const int z0 = 13*(block[i+8*0] +    block[i+8*2]);
-        const int z1 = 13*(block[i+8*0] -    block[i+8*2]);
-        const int z2 =  7* block[i+8*1] - 17*block[i+8*3];
-        const int z3 = 17* block[i+8*1] +  7*block[i+8*3];
+        const int z0 = 13*(block[i+4*0] +    block[i+4*2]);
+        const int z1 = 13*(block[i+4*0] -    block[i+4*2]);
+        const int z2 =  7* block[i+4*1] - 17*block[i+4*3];
+        const int z3 = 17* block[i+4*1] +  7*block[i+4*3];
 
         temp[4*i+0] = z0 + z3;
         temp[4*i+1] = z1 + z2;
@@ -49,39 +49,17 @@ static av_always_inline void rv34_row_transform(int temp[16], const DCTELEM *blo
     }
 }
 
-/**
- * Real Video 3.0/4.0 inverse transform
- * Code is almost the same as in SVQ3, only scaling is different.
- */
-static void rv34_inv_transform_c(DCTELEM *block){
-    int temp[16];
-    int i;
-
-    rv34_row_transform(temp, block);
-
-    for(i = 0; i < 4; i++){
-        const int z0 = 13*(temp[4*0+i] +    temp[4*2+i]) + 0x200;
-        const int z1 = 13*(temp[4*0+i] -    temp[4*2+i]) + 0x200;
-        const int z2 =  7* temp[4*1+i] - 17*temp[4*3+i];
-        const int z3 = 17* temp[4*1+i] +  7*temp[4*3+i];
-
-        block[i*8+0] = (z0 + z3) >> 10;
-        block[i*8+1] = (z1 + z2) >> 10;
-        block[i*8+2] = (z1 - z2) >> 10;
-        block[i*8+3] = (z0 - z3) >> 10;
-    }
-}
-
 /**
  * Real Video 3.0/4.0 inverse transform + sample reconstruction
  * Code is almost the same as in SVQ3, only scaling is different.
  */
-static void rv34_idct_add_c(uint8_t *dst, int stride, const DCTELEM *block){
+static void rv34_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){
     int      temp[16];
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
     int      i;
 
     rv34_row_transform(temp, block);
+    memset(block, 0, 16*sizeof(DCTELEM));
 
     for(i = 0; i < 4; i++){
         const int z0 = 13*(temp[4*0+i] +    temp[4*2+i]) + 0x200;
@@ -116,10 +94,10 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){
         const int z2 =  7* temp[4*1+i] - 17*temp[4*3+i];
         const int z3 = 17* temp[4*1+i] +  7*temp[4*3+i];
 
-        block[i*8+0] = ((z0 + z3) * 3) >> 11;
-        block[i*8+1] = ((z1 + z2) * 3) >> 11;
-        block[i*8+2] = ((z1 - z2) * 3) >> 11;
-        block[i*8+3] = ((z0 - z3) * 3) >> 11;
+        block[i*4+0] = ((z0 + z3) * 3) >> 11;
+        block[i*4+1] = ((z1 + z2) * 3) >> 11;
+        block[i*4+2] = ((z1 - z2) * 3) >> 11;
+        block[i*4+3] = ((z0 - z3) * 3) >> 11;
     }
 }
 
@@ -139,22 +117,12 @@ static void rv34_idct_dc_add_c(uint8_t *dst, int stride, int dc)
     }
 }
 
-static void rv34_inv_transform_dc_c(DCTELEM *block)
-{
-    DCTELEM dc = (13 * 13 * block[0] + 0x200) >> 10;
-    int i, j;
-
-    for (i = 0; i < 4; i++, block += 8)
-        for (j = 0; j < 4; j++)
-            block[j] = dc;
-}
-
 static void rv34_inv_transform_dc_noround_c(DCTELEM *block)
 {
     DCTELEM dc = (13 * 13 * 3 * block[0]) >> 11;
     int i, j;
 
-    for (i = 0; i < 4; i++, block += 8)
+    for (i = 0; i < 4; i++, block += 4)
         for (j = 0; j < 4; j++)
             block[j] = dc;
 }
@@ -163,10 +131,8 @@ static void rv34_inv_transform_dc_noround_c(DCTELEM *block)
 
 
 av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) {
-    c->rv34_inv_transform_tab[0] = rv34_inv_transform_c;
-    c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c;
-    c->rv34_inv_transform_dc_tab[0]  = rv34_inv_transform_dc_c;
-    c->rv34_inv_transform_dc_tab[1]  = rv34_inv_transform_dc_noround_c;
+    c->rv34_inv_transform    = rv34_inv_transform_noround_c;
+    c->rv34_inv_transform_dc = rv34_inv_transform_dc_noround_c;
 
     c->rv34_idct_add    = rv34_idct_add_c;
     c->rv34_idct_dc_add = rv34_idct_dc_add_c;
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 2e9e58e64a..fe8fcaa8dd 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -36,8 +36,7 @@ typedef void (*rv40_weight_func)(uint8_t *dst/*align width (8 or 16)*/,
 
 typedef void (*rv34_inv_transform_func)(DCTELEM *block);
 
-typedef void (*rv34_idct_add_func)(uint8_t *dst, int stride,
-                                   const DCTELEM *block);
+typedef void (*rv34_idct_add_func)(uint8_t *dst, int stride, DCTELEM *block);
 typedef void (*rv34_idct_dc_add_func)(uint8_t *dst, int stride,
                                       int   dc);
 
@@ -60,8 +59,8 @@ typedef struct RV34DSPContext {
     h264_chroma_mc_func put_chroma_pixels_tab[3];
     h264_chroma_mc_func avg_chroma_pixels_tab[3];
     rv40_weight_func rv40_weight_pixels_tab[2];
-    rv34_inv_transform_func rv34_inv_transform_tab[2];
-    void (*rv34_inv_transform_dc_tab[2])(DCTELEM *block);
+    rv34_inv_transform_func rv34_inv_transform;
+    rv34_inv_transform_func rv34_inv_transform_dc;
     rv34_idct_add_func rv34_idct_add;
     rv34_idct_dc_add_func rv34_idct_dc_add;
     rv40_weak_loop_filter_func rv40_weak_loop_filter[2];
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index c8eeebbfeb..2d2f6e19e6 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -42,9 +42,9 @@ cglobal rv34_idct_%1_mmx2, 1, 2, 0
     movd    m0, r1
     pshufw  m0, m0, 0
     movq    [r0+ 0], m0
+    movq    [r0+ 8], m0
     movq    [r0+16], m0
-    movq    [r0+32], m0
-    movq    [r0+48], m0
+    movq    [r0+24], m0
     REP_RET
 %endmacro
 
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index c10ae4ee96..f3d2e172e7 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -37,8 +37,7 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
     if (mm_flags & AV_CPU_FLAG_MMX)
         c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
     if (mm_flags & AV_CPU_FLAG_MMX2) {
-        c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dc_mmx2;
-        c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dc_noround_mmx2;
+        c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmx2;
     }
     if (mm_flags & AV_CPU_FLAG_SSE4)
         c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
-- 
cgit v1.2.3


From 9e12002f114d7e0b0ef69519518cdc0391e5e198 Mon Sep 17 00:00:00 2001
From: Janne Grunau <janne-libav@jannau.net>
Date: Sun, 15 Jan 2012 19:16:02 +0100
Subject: rv34: add NEON rv34_idct_add

Overall almost 4% faster, idct_add down from 350 to 85 cycles, idct_dc_add
down from 83 to 30 cycles.

squash: rv34 idct rearrange partial register loads
---
 libavcodec/arm/rv34dsp_init_neon.c |  6 ++++
 libavcodec/arm/rv34dsp_neon.S      | 59 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 62 insertions(+), 3 deletions(-)

(limited to 'libavcodec')

diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 3984d43c39..744818cee3 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -27,8 +27,14 @@ void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
 
 void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
 
+void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block);
+void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc);
+
 void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
 {
     c->rv34_inv_transform    = ff_rv34_inv_transform_noround_neon;
     c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
+
+    c->rv34_idct_add    = ff_rv34_idct_add_neon;    /* inverse transform of a block, result added to dst */
+    c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon; /* same for a block given only by its dc value */
 }
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a156412d01..15a015deef 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -19,9 +19,10 @@
  */
 
 #include "asm.S"
+#include "neon.S"
 
-.macro rv34_inv_transform
-        vld1.16         {q14-q15}, [r0,:128]
+.macro rv34_inv_transform    r0
+        vld1.16         {q14-q15}, [\r0,:128]
         vmov.s16        d0,  #13
         vshll.s16       q12, d29, #3
         vshll.s16       q13, d29, #4
@@ -66,9 +67,39 @@
         vsub.s32        q15, q14, q9    @ z0 - z3
 .endm
 
+/* void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block) */
+function ff_rv34_idct_add_neon, export=1
+        mov             r3,  r0                 @ r3 = dst for the stores, r0 is advanced by the loads
+        rv34_inv_transform   r2
+        vmov.i16        q12, #0                 @ zero vector used to clear block below
+        vrshrn.s32      d16, q1,  #10   @ (z0 + z3) >> 10
+        vrshrn.s32      d17, q2,  #10   @ (z1 + z2) >> 10
+        vrshrn.s32      d18, q3,  #10   @ (z1 - z2) >> 10
+        vrshrn.s32      d19, q15, #10   @ (z0 - z3) >> 10
+        vld1.32         {d28[]},  [r0,:32], r1  @ dst row 0 (4 pixels), r0 += stride
+        vld1.32         {d29[]},  [r0,:32], r1  @ dst row 1
+        vtrn.32         q8,  q9                 @ 4x4 transpose of d16-d19, completed by the vtrn.16 pairs
+        vld1.32         {d28[1]}, [r0,:32], r1  @ dst row 2 into the upper half of d28
+        vld1.32         {d29[1]}, [r0,:32], r1  @ dst row 3 into the upper half of d29
+        vst1.16         {q12}, [r2,:128]!       @ zero bytes  0..15 of block
+        vst1.16         {q12}, [r2,:128]        @ zero bytes 16..31 of block
+        vtrn.16         d16, d17
+        vtrn.32         d28, d29                @ d28 = dst rows 0,1   d29 = dst rows 2,3
+        vtrn.16         d18, d19
+        vaddw.u8        q0,   q8,  d28          @ transform output + dst pixels widened to 16 bit
+        vaddw.u8        q1,   q9,  d29
+        vqmovun.s16     d28,  q0                @ saturate the sums to unsigned 8 bit
+        vqmovun.s16     d29,  q1
+        vst1.32         {d28[0]}, [r3,:32], r1  @ store row 0
+        vst1.32         {d28[1]}, [r3,:32], r1  @ store row 1
+        vst1.32         {d29[0]}, [r3,:32], r1  @ store row 2
+        vst1.32         {d29[1]}, [r3,:32], r1  @ store row 3
+        bx              lr
+endfunc
+
 /* void rv34_inv_transform_noround_neon(DCTELEM *block); */
 function ff_rv34_inv_transform_noround_neon, export=1
-        rv34_inv_transform
+        rv34_inv_transform   r0
         vshl.s32        q11, q2,  #1
         vshl.s32        q10, q1,  #1
         vshl.s32        q12, q3,  #1
@@ -88,6 +119,28 @@ function ff_rv34_inv_transform_noround_neon, export=1
         bx              lr
 endfunc
 
+/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
+function ff_rv34_idct_dc_add_neon, export=1
+        mov             r3,  r0                 @ r3 = dst for the stores, r0 is advanced by the loads
+        vld1.32         {d28[]},  [r0,:32], r1  @ dst row 0 (4 pixels), r0 += stride
+        vld1.32         {d29[]},  [r0,:32], r1  @ dst row 1
+        vdup.16         d0,  r2                 @ broadcast dc to four 16-bit lanes
+        vmov.s16        d1,  #169               @ 169 = 13 * 13
+        vld1.32         {d28[1]}, [r0,:32], r1  @ dst row 2 into the upper half of d28
+        vmull.s16       q1,  d0,  d1    @ dc * 13 * 13
+        vld1.32         {d29[1]}, [r0,:32], r1  @ dst row 3 into the upper half of d29
+        vrshrn.s32      d0,  q1,  #10   @ (dc * 13 * 13 + 0x200) >> 10
+        vmov            d1,  d0                 @ q0 = scaled dc in all eight 16-bit lanes
+        vaddw.u8        q2,  q0,  d28           @ rows 0 and 2: dc + dst pixels widened to 16 bit
+        vaddw.u8        q3,  q0,  d29           @ rows 1 and 3
+        vqmovun.s16     d28, q2                 @ saturate the sums to unsigned 8 bit
+        vqmovun.s16     d29, q3
+        vst1.32         {d28[0]}, [r3,:32], r1  @ store row 0
+        vst1.32         {d29[0]}, [r3,:32], r1  @ store row 1
+        vst1.32         {d28[1]}, [r3,:32], r1  @ store row 2
+        vst1.32         {d29[1]}, [r3,:32], r1  @ store row 3
+        bx              lr
+endfunc
 
 /* void rv34_inv_transform_dc_noround_c(DCTELEM *block) */
 function ff_rv34_inv_transform_noround_dc_neon, export=1
-- 
cgit v1.2.3