author     Michael Niedermayer <michaelni@gmx.at>    2012-01-17 01:40:45 +0100
committer  Michael Niedermayer <michaelni@gmx.at>    2012-01-17 02:37:30 +0100
commit     67f5650a78de2567c58dbd7545434cc6d3ef9b7e (patch)
tree       34b08ed769cd7a1f071bf9ff4eca1348481c0bf1 /libavcodec
parent     905c4dc2b0d564e1b9b6bc6eeca0b8915b81cd8c (diff)
parent     9e12002f114d7e0b0ef69519518cdc0391e5e198 (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master:
  rv34: add NEON rv34_idct_add
  rv34: 1-pass inter MB reconstruction
  add SMJPEG muxer
  avformat: split out common SMJPEG code
  pictordec: Use bytestream2 functions
  avconv: use avcodec_encode_audio2()
  pcmenc: use AVCodec.encode2()
  avcodec: bump minor version and add APIChanges for the new audio encoding API
  avcodec: Add avcodec_encode_audio2() as replacement for avcodec_encode_audio()
  avcodec: add a public function, avcodec_fill_audio_frame().
  rv34: Intra 16x16 handling
  rv34: Inter/intra MB code split

Conflicts:
  Changelog
  libavcodec/avcodec.h
  libavcodec/pictordec.c
  libavcodec/utils.c
  libavcodec/version.h
  libavcodec/x86/rv34dsp.asm
  libavformat/version.h

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/arm/rv34dsp_init_neon.c | 14
-rw-r--r--  libavcodec/arm/rv34dsp_neon.S | 106
-rw-r--r--  libavcodec/avcodec.h | 92
-rw-r--r--  libavcodec/internal.h | 25
-rw-r--r--  libavcodec/pcm.c | 28
-rw-r--r--  libavcodec/pictordec.c | 103
-rw-r--r--  libavcodec/rv34.c | 484
-rw-r--r--  libavcodec/rv34dsp.c | 61
-rw-r--r--  libavcodec/rv34dsp.h | 10
-rw-r--r--  libavcodec/utils.c | 343
-rw-r--r--  libavcodec/version.h | 5
-rw-r--r--  libavcodec/x86/rv34dsp.asm | 83
-rw-r--r--  libavcodec/x86/rv34dsp_init.c | 13
13 files changed, 987 insertions, 380 deletions
diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 16bda46658..744818cee3 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -23,16 +23,18 @@
#include "libavcodec/avcodec.h"
#include "libavcodec/rv34dsp.h"
-void ff_rv34_inv_transform_neon(DCTELEM *block);
void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
-void ff_rv34_inv_transform_dc_neon(DCTELEM *block);
void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
+void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block);
+void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc);
+
void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
{
- c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
- c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
- c->rv34_inv_transform_dc_tab[0] = ff_rv34_inv_transform_dc_neon;
- c->rv34_inv_transform_dc_tab[1] = ff_rv34_inv_transform_noround_dc_neon;
+ c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon;
+ c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
+
+ c->rv34_idct_add = ff_rv34_idct_add_neon;
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon;
}
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index 1e8d4b49a1..15a015deef 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -19,13 +19,10 @@
*/
#include "asm.S"
+#include "neon.S"
-.macro rv34_inv_transform
- mov r1, #16
- vld1.16 {d28}, [r0,:64], r1 @ block[i+8*0]
- vld1.16 {d29}, [r0,:64], r1 @ block[i+8*1]
- vld1.16 {d30}, [r0,:64], r1 @ block[i+8*2]
- vld1.16 {d31}, [r0,:64], r1 @ block[i+8*3]
+.macro rv34_inv_transform r0
+ vld1.16 {q14-q15}, [\r0,:128]
vmov.s16 d0, #13
vshll.s16 q12, d29, #3
vshll.s16 q13, d29, #4
@@ -35,12 +32,12 @@
vmlal.s16 q10, d30, d0
vmull.s16 q11, d28, d0
vmlsl.s16 q11, d30, d0
- vsubw.s16 q12, q12, d29 @ z2 = block[i+8*1]*7
- vaddw.s16 q13, q13, d29 @ z3 = block[i+8*1]*17
+ vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7
+ vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17
vsubw.s16 q9, q9, d31
vaddw.s16 q1, q1, d31
- vadd.s32 q13, q13, q9 @ z3 = 17*block[i+8*1] + 7*block[i+8*3]
- vsub.s32 q12, q12, q1 @ z2 = 7*block[i+8*1] - 17*block[i+8*3]
+ vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3]
+ vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3]
vadd.s32 q1, q10, q13 @ z0 + z3
vadd.s32 q2, q11, q12 @ z1 + z2
vsub.s32 q8, q10, q13 @ z0 - z3
@@ -70,25 +67,39 @@
vsub.s32 q15, q14, q9 @ z0 - z3
.endm
-/* void ff_rv34_inv_transform_neon(DCTELEM *block); */
-function ff_rv34_inv_transform_neon, export=1
- mov r2, r0
- rv34_inv_transform
- vrshrn.s32 d1, q2, #10 @ (z1 + z2) >> 10
- vrshrn.s32 d0, q1, #10 @ (z0 + z3) >> 10
- vrshrn.s32 d2, q3, #10 @ (z1 - z2) >> 10
- vrshrn.s32 d3, q15, #10 @ (z0 - z3) >> 10
- vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1
- vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1
- vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1
- vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
+/* void rv34_idct_add_c(uint8_t *dst, int stride, DCTELEM *block) */
+function ff_rv34_idct_add_neon, export=1
+ mov r3, r0
+ rv34_inv_transform r2
+ vmov.i16 q12, #0
+ vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10
+ vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10
+ vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10
+ vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10
+ vld1.32 {d28[]}, [r0,:32], r1
+ vld1.32 {d29[]}, [r0,:32], r1
+ vtrn.32 q8, q9
+ vld1.32 {d28[1]}, [r0,:32], r1
+ vld1.32 {d29[1]}, [r0,:32], r1
+ vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16)
+ vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16)
+ vtrn.16 d16, d17
+ vtrn.32 d28, d29
+ vtrn.16 d18, d19
+ vaddw.u8 q0, q8, d28
+ vaddw.u8 q1, q9, d29
+ vqmovun.s16 d28, q0
+ vqmovun.s16 d29, q1
+ vst1.32 {d28[0]}, [r3,:32], r1
+ vst1.32 {d28[1]}, [r3,:32], r1
+ vst1.32 {d29[0]}, [r3,:32], r1
+ vst1.32 {d29[1]}, [r3,:32], r1
bx lr
endfunc
/* void rv34_inv_transform_noround_neon(DCTELEM *block); */
function ff_rv34_inv_transform_noround_neon, export=1
- mov r2, r0
- rv34_inv_transform
+ rv34_inv_transform r0
vshl.s32 q11, q2, #1
vshl.s32 q10, q1, #1
vshl.s32 q12, q3, #1
@@ -101,24 +112,33 @@ function ff_rv34_inv_transform_noround_neon, export=1
vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11
vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11
vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11
- vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1
- vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1
- vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1
- vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
bx lr
endfunc
-/* void rv34_inv_transform_dc_c(DCTELEM *block) */
-function ff_rv34_inv_transform_dc_neon, export=1
- vld1.16 {d28[]}, [r0,:16] @ block[0]
- vmov.i16 d4, #169
- mov r1, #16
- vmull.s16 q3, d28, d4
- vrshrn.s32 d0, q3, #10
- vst1.16 {d0}, [r0,:64], r1
- vst1.16 {d0}, [r0,:64], r1
- vst1.16 {d0}, [r0,:64], r1
- vst1.16 {d0}, [r0,:64], r1
+/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
+function ff_rv34_idct_dc_add_neon, export=1
+ mov r3, r0
+ vld1.32 {d28[]}, [r0,:32], r1
+ vld1.32 {d29[]}, [r0,:32], r1
+ vdup.16 d0, r2
+ vmov.s16 d1, #169
+ vld1.32 {d28[1]}, [r0,:32], r1
+ vmull.s16 q1, d0, d1 @ dc * 13 * 13
+ vld1.32 {d29[1]}, [r0,:32], r1
+ vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10
+ vmov d1, d0
+ vaddw.u8 q2, q0, d28
+ vaddw.u8 q3, q0, d29
+ vqmovun.s16 d28, q2
+ vqmovun.s16 d29, q3
+ vst1.32 {d28[0]}, [r3,:32], r1
+ vst1.32 {d29[0]}, [r3,:32], r1
+ vst1.32 {d28[1]}, [r3,:32], r1
+ vst1.32 {d29[1]}, [r3,:32], r1
bx lr
endfunc
@@ -127,12 +147,10 @@ function ff_rv34_inv_transform_noround_dc_neon, export=1
vld1.16 {d28[]}, [r0,:16] @ block[0]
vmov.i16 d4, #251
vorr.s16 d4, #256 @ 13^2 * 3
- mov r1, #16
vmull.s16 q3, d28, d4
vshrn.s32 d0, q3, #11
- vst1.64 {d0}, [r0,:64], r1
- vst1.64 {d0}, [r0,:64], r1
- vst1.64 {d0}, [r0,:64], r1
- vst1.64 {d0}, [r0,:64], r1
+ vmov.i16 d1, d0
+ vst1.64 {q0}, [r0,:128]!
+ vst1.64 {q0}, [r0,:128]!
bx lr
endfunc
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 4e55e0e12b..e690c81e12 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -761,6 +761,11 @@ typedef struct RcOverride{
* Encoders:
* The encoder needs to be fed with NULL data at the end of encoding until the
* encoder no longer returns data.
+ *
+ * NOTE: For encoders implementing the AVCodec.encode2() function, setting this
+ * flag also means that the encoder must set the pts and duration for
+ * each output packet. If this flag is not set, the pts and duration will
+ * be determined by libavcodec from the input frame.
*/
#define CODEC_CAP_DELAY 0x0020
/**
@@ -816,6 +821,10 @@ typedef struct RcOverride{
*/
#define CODEC_CAP_AUTO_THREADS 0x8000
/**
+ * Audio encoder supports receiving a different number of samples in each call.
+ */
+#define CODEC_CAP_VARIABLE_FRAME_SIZE 0x10000
+/**
* Codec is lossless.
*/
#define CODEC_CAP_LOSSLESS 0x80000000
@@ -3314,6 +3323,19 @@ typedef struct AVCodec {
* Initialize codec static data, called from avcodec_register().
*/
void (*init_static_data)(struct AVCodec *codec);
+
+ /**
+ * Encode data to an AVPacket.
+ *
+ * @param avctx codec context
+ * @param avpkt output AVPacket (may contain a user-provided buffer)
+ * @param[in] frame AVFrame containing the raw data to be encoded
+ * @param[out] got_packet_ptr encoder sets to 0 or 1 to indicate that a
+ * non-empty packet was returned in avpkt.
+ * @return 0 on success, negative error code on failure
+ */
+ int (*encode2)(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame,
+ int *got_packet_ptr);
} AVCodec;
/**
@@ -4331,9 +4353,12 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
*/
void avsubtitle_free(AVSubtitle *sub);
+#if FF_API_OLD_ENCODE_AUDIO
/**
* Encode an audio frame from samples into buf.
*
+ * @deprecated Use avcodec_encode_audio2 instead.
+ *
* @note The output buffer should be at least FF_MIN_BUFFER_SIZE bytes large.
* However, for codecs with avctx->frame_size equal to 0 (e.g. PCM) the user
* will know how much space is needed because it depends on the value passed
@@ -4353,8 +4378,71 @@ void avsubtitle_free(AVSubtitle *sub);
* @return On error a negative value is returned, on success zero or the number
* of bytes used to encode the data read from the input buffer.
*/
-int avcodec_encode_audio(AVCodecContext *avctx, uint8_t *buf, int buf_size,
- const short *samples);
+int attribute_deprecated avcodec_encode_audio(AVCodecContext *avctx,
+ uint8_t *buf, int buf_size,
+ const short *samples);
+#endif
+
+/**
+ * Encode a frame of audio.
+ *
+ * Takes input samples from frame and writes the next output packet, if
+ * available, to avpkt. The output packet does not necessarily contain data for
+ * the most recent frame, as encoders can delay, split, and combine input frames
+ * internally as needed.
+ *
+ * @param avctx codec context
+ * @param avpkt output AVPacket.
+ * The user can supply an output buffer by setting
+ * avpkt->data and avpkt->size prior to calling the
+ * function, but if the size of the user-provided data is not
+ * large enough, encoding will fail. All other AVPacket fields
+ * will be reset by the encoder using av_init_packet(). If
+ * avpkt->data is NULL, the encoder will allocate it.
+ * The encoder will set avpkt->size to the size of the
+ * output packet.
+ * @param[in] frame AVFrame containing the raw audio data to be encoded.
+ * May be NULL when flushing an encoder that has the
+ * CODEC_CAP_DELAY capability set.
+ * There are 2 codec capabilities that affect the allowed
+ * values of frame->nb_samples.
+ * If CODEC_CAP_SMALL_LAST_FRAME is set, then only the final
+ * frame may be smaller than avctx->frame_size, and all other
+ * frames must be equal to avctx->frame_size.
+ * If CODEC_CAP_VARIABLE_FRAME_SIZE is set, then each frame
+ * can have any number of samples.
+ * If neither is set, frame->nb_samples must be equal to
+ * avctx->frame_size for all frames.
+ * @param[out] got_packet_ptr This field is set to 1 by libavcodec if the
+ * output packet is non-empty, and to 0 if it is
+ * empty. If the function returns an error, the
+ * packet can be assumed to be invalid, and the
+ * value of got_packet_ptr is undefined and should
+ * not be used.
+ * @return 0 on success, negative error code on failure
+ */
+int avcodec_encode_audio2(AVCodecContext *avctx, AVPacket *avpkt,
+ const AVFrame *frame, int *got_packet_ptr);
+
+/**
+ * Fill audio frame data and linesize.
+ * AVFrame extended_data channel pointers are allocated if necessary for
+ * planar audio.
+ *
+ * @param frame the AVFrame
+ * frame->nb_samples must be set prior to calling the
+ * function. This function fills in frame->data,
+ * frame->extended_data, frame->linesize[0].
+ * @param nb_channels channel count
+ * @param sample_fmt sample format
+ * @param buf buffer to use for frame data
+ * @param buf_size size of buffer
+ * @param align plane size sample alignment
+ * @return 0 on success, negative error code on failure
+ */
+int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+ enum AVSampleFormat sample_fmt, const uint8_t *buf,
+ int buf_size, int align);
/**
* Encode a video frame from pict into buf.
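
The following is a minimal caller-side sketch of the two new public functions documented above, avcodec_fill_audio_frame() and avcodec_encode_audio2(); it is not part of this diff, and it assumes avctx is an already-opened audio encoder while "samples"/"samples_size" hold one frame of interleaved input:

    #include <stdio.h>
    #include "libavcodec/avcodec.h"
    #include "libavutil/mem.h"

    /* Sketch only: encode one frame of interleaved samples with the new API. */
    static int encode_one(AVCodecContext *avctx, const uint8_t *samples,
                          int samples_size, FILE *out)
    {
        AVFrame frame;
        AVPacket pkt;
        int got_packet = 0, ret;

        avcodec_get_frame_defaults(&frame);
        frame.nb_samples = avctx->frame_size;      /* see the nb_samples rules above */
        ret = avcodec_fill_audio_frame(&frame, avctx->channels, avctx->sample_fmt,
                                       samples, samples_size, 1);
        if (ret < 0)
            return ret;

        av_init_packet(&pkt);
        pkt.data = NULL;                           /* let the encoder allocate the buffer */
        pkt.size = 0;
        ret = avcodec_encode_audio2(avctx, &pkt, &frame, &got_packet);

        if (frame.extended_data != frame.data)     /* allocated for planar, many channels */
            av_free(frame.extended_data);
        if (ret < 0)
            return ret;

        if (got_packet) {                          /* pts/duration were set by libavcodec
                                                      unless CODEC_CAP_DELAY is set */
            fwrite(pkt.data, 1, pkt.size, out);
            av_free_packet(&pkt);
        }
        /* draining a CODEC_CAP_DELAY encoder: keep calling with frame == NULL
           until got_packet stays 0 */
        return 0;
    }
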
diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index e6270f81bf..72a89441c2 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -61,6 +61,14 @@ typedef struct AVCodecInternal {
* should be freed from the original context only.
*/
int is_copy;
+
+#if FF_API_OLD_DECODE_AUDIO
+ /**
+ * Internal sample count used by avcodec_encode_audio() to fabricate pts.
+ * Can be removed along with avcodec_encode_audio().
+ */
+ int sample_count;
+#endif
} AVCodecInternal;
struct AVCodecDefault {
@@ -111,4 +119,21 @@ int avpriv_unlock_avformat(void);
*/
#define FF_MAX_EXTRADATA_SIZE ((1 << 28) - FF_INPUT_BUFFER_PADDING_SIZE)
+/**
+ * Check AVPacket size and/or allocate data.
+ *
+ * Encoders supporting AVCodec.encode2() can use this as a convenience to
+ * ensure the output packet data is large enough, whether provided by the user
+ * or allocated in this function.
+ *
+ * @param avpkt the AVPacket
+ * If avpkt->data is already set, avpkt->size is checked
+ * to ensure it is large enough.
+ * If avpkt->data is NULL, a new buffer is allocated.
+ * All other AVPacket fields will be reset with av_init_packet().
+ * @param size the minimum required packet size
+ * @return 0 on success, negative error code on failure
+ */
+int ff_alloc_packet(AVPacket *avpkt, int size);
+
#endif /* AVCODEC_INTERNAL_H */
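
As a rough sketch of how an AVCodec.encode2() implementation is expected to use the ff_alloc_packet() helper declared above (hypothetical codec and names; the pcm.c conversion in the next hunk is the real in-tree example):

    #include "avcodec.h"
    #include "internal.h"

    static int foo_encode2(AVCodecContext *avctx, AVPacket *avpkt,
                           const AVFrame *frame, int *got_packet_ptr)
    {
        /* placeholder worst-case output size for this made-up codec */
        int ret, max_size = frame->nb_samples * avctx->channels * 2;

        if ((ret = ff_alloc_packet(avpkt, max_size)) < 0)
            return ret;                /* user buffer too small or allocation failed */

        /* ... compress frame->data[] into avpkt->data here ... */
        avpkt->size = max_size;        /* a real encoder sets the bytes actually written */
        *got_packet_ptr = 1;
        return 0;
    }
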
diff --git a/libavcodec/pcm.c b/libavcodec/pcm.c
index 3609c3b0d9..650003793c 100644
--- a/libavcodec/pcm.c
+++ b/libavcodec/pcm.c
@@ -27,6 +27,7 @@
#include "avcodec.h"
#include "libavutil/common.h" /* for av_reverse */
#include "bytestream.h"
+#include "internal.h"
#include "pcm_tablegen.h"
#define MAX_CHANNELS 64
@@ -77,10 +78,10 @@ static av_cold int pcm_encode_close(AVCodecContext *avctx)
bytestream_put_##endian(&dst, v); \
}
-static int pcm_encode_frame(AVCodecContext *avctx,
- unsigned char *frame, int buf_size, void *data)
+static int pcm_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
+ const AVFrame *frame, int *got_packet_ptr)
{
- int n, sample_size, v;
+ int n, sample_size, v, ret;
const short *samples;
unsigned char *dst;
const uint8_t *srcu8;
@@ -91,9 +92,14 @@ static int pcm_encode_frame(AVCodecContext *avctx,
const uint32_t *samples_uint32_t;
sample_size = av_get_bits_per_sample(avctx->codec->id)/8;
- n = buf_size / sample_size;
- samples = data;
- dst = frame;
+ n = frame->nb_samples * avctx->channels;
+ samples = (const short *)frame->data[0];
+
+ if ((ret = ff_alloc_packet(avpkt, n * sample_size))) {
+ av_log(avctx, AV_LOG_ERROR, "Error getting output packet\n");
+ return ret;
+ }
+ dst = avpkt->data;
switch(avctx->codec->id) {
case CODEC_ID_PCM_U32LE:
@@ -130,7 +136,7 @@ static int pcm_encode_frame(AVCodecContext *avctx,
ENCODE(uint16_t, be16, samples, dst, n, 0, 0x8000)
break;
case CODEC_ID_PCM_S8:
- srcu8= data;
+ srcu8 = frame->data[0];
for(;n>0;n--) {
v = *srcu8++;
*dst++ = v - 128;
@@ -186,9 +192,10 @@ static int pcm_encode_frame(AVCodecContext *avctx,
default:
return -1;
}
- //avctx->frame_size = (dst - frame) / (sample_size * avctx->channels);
- return dst - frame;
+ avpkt->size = frame->nb_samples * avctx->channels * sample_size;
+ *got_packet_ptr = 1;
+ return 0;
}
typedef struct PCMDecode {
@@ -474,8 +481,9 @@ AVCodec ff_ ## name_ ## _encoder = { \
.type = AVMEDIA_TYPE_AUDIO, \
.id = id_, \
.init = pcm_encode_init, \
- .encode = pcm_encode_frame, \
+ .encode2 = pcm_encode_frame, \
.close = pcm_encode_close, \
+ .capabilities = CODEC_CAP_VARIABLE_FRAME_SIZE, \
.sample_fmts = (const enum AVSampleFormat[]){sample_fmt_,AV_SAMPLE_FMT_NONE}, \
.long_name = NULL_IF_CONFIG_SMALL(long_name_), \
}
diff --git a/libavcodec/pictordec.c b/libavcodec/pictordec.c
index b3b5f7ef4f..d788e6474c 100644
--- a/libavcodec/pictordec.c
+++ b/libavcodec/pictordec.c
@@ -33,6 +33,7 @@ typedef struct PicContext {
AVFrame frame;
int width, height;
int nb_planes;
+ GetByteContext g;
} PicContext;
static void picmemset_8bpp(PicContext *s, int value, int run, int *x, int *y)
@@ -55,7 +56,8 @@ static void picmemset_8bpp(PicContext *s, int value, int run, int *x, int *y)
}
}
-static void picmemset(PicContext *s, int value, int run, int *x, int *y, int *plane, int bits_per_plane)
+static void picmemset(PicContext *s, int value, int run,
+ int *x, int *y, int *plane, int bits_per_plane)
{
uint8_t *d;
int shift = *plane * bits_per_plane;
@@ -107,34 +109,35 @@ static int decode_frame(AVCodecContext *avctx,
AVPacket *avpkt)
{
PicContext *s = avctx->priv_data;
- int buf_size = avpkt->size;
- const uint8_t *buf = avpkt->data;
- const uint8_t *buf_end = avpkt->data + buf_size;
uint32_t *palette;
- int bits_per_plane, bpp, etype, esize, npal;
- int i, x, y, plane;
+ int bits_per_plane, bpp, etype, esize, npal, pos_after_pal;
+ int i, x, y, plane, tmp;
- if (buf_size < 11)
+ bytestream2_init(&s->g, avpkt->data, avpkt->size);
+
+ if (bytestream2_get_bytes_left(&s->g) < 11)
return AVERROR_INVALIDDATA;
- if (bytestream_get_le16(&buf) != 0x1234)
+ if (bytestream2_get_le16u(&s->g) != 0x1234)
return AVERROR_INVALIDDATA;
- s->width = bytestream_get_le16(&buf);
- s->height = bytestream_get_le16(&buf);
- buf += 4;
- bits_per_plane = *buf & 0xF;
- s->nb_planes = (*buf++ >> 4) + 1;
- bpp = s->nb_planes ? bits_per_plane*s->nb_planes : bits_per_plane;
+
+ s->width = bytestream2_get_le16u(&s->g);
+ s->height = bytestream2_get_le16u(&s->g);
+ bytestream2_skip(&s->g, 4);
+ tmp = bytestream2_get_byteu(&s->g);
+ bits_per_plane = tmp & 0xF;
+ s->nb_planes = (tmp >> 4) + 1;
+ bpp = bits_per_plane * s->nb_planes;
if (bits_per_plane > 8 || bpp < 1 || bpp > 32) {
av_log_ask_for_sample(avctx, "unsupported bit depth\n");
return AVERROR_INVALIDDATA;
}
- if (*buf == 0xFF || bpp == 8) {
- buf += 2;
- etype = bytestream_get_le16(&buf);
- esize = bytestream_get_le16(&buf);
- if (buf_end - buf < esize)
+ if (bytestream2_peek_byte(&s->g) == 0xFF || bpp == 8) {
+ bytestream2_skip(&s->g, 2);
+ etype = bytestream2_get_le16(&s->g);
+ esize = bytestream2_get_le16(&s->g);
+ if (bytestream2_get_bytes_left(&s->g) < esize)
return AVERROR_INVALIDDATA;
} else {
etype = -1;
@@ -159,25 +162,30 @@ static int decode_frame(AVCodecContext *avctx,
s->frame.pict_type = AV_PICTURE_TYPE_I;
s->frame.palette_has_changed = 1;
+ pos_after_pal = bytestream2_tell(&s->g) + esize;
palette = (uint32_t*)s->frame.data[1];
- if (etype == 1 && esize > 1 && *buf < 6) {
- int idx = *buf;
+ if (etype == 1 && esize > 1 && bytestream2_peek_byte(&s->g) < 6) {
+ int idx = bytestream2_get_byte(&s->g);
npal = 4;
for (i = 0; i < npal; i++)
palette[i] = ff_cga_palette[ cga_mode45_index[idx][i] ];
} else if (etype == 2) {
npal = FFMIN(esize, 16);
- for (i = 0; i < npal; i++)
- palette[i] = ff_cga_palette[ FFMIN(buf[i], 16)];
+ for (i = 0; i < npal; i++) {
+ int pal_idx = bytestream2_get_byte(&s->g);
+ palette[i] = ff_cga_palette[FFMIN(pal_idx, 16)];
+ }
} else if (etype == 3) {
npal = FFMIN(esize, 16);
- for (i = 0; i < npal; i++)
- palette[i] = ff_ega_palette[ FFMIN(buf[i], 63)];
+ for (i = 0; i < npal; i++) {
+ int pal_idx = bytestream2_get_byte(&s->g);
+ palette[i] = ff_ega_palette[FFMIN(pal_idx, 63)];
+ }
} else if (etype == 4 || etype == 5) {
npal = FFMIN(esize / 3, 256);
for (i = 0; i < npal; i++) {
- palette[i] = AV_RB24(buf + i*3) << 2;
- palette[i] |= 0xFF << 24 | palette[i] >> 6 & 0x30303;
+ palette[i] = bytestream2_get_be24(&s->g) << 2;
+ palette[i] |= 0xFFU << 24 | palette[i] >> 6 & 0x30303;
}
} else {
if (bpp == 1) {
@@ -195,29 +203,34 @@ static int decode_frame(AVCodecContext *avctx,
}
// fill remaining palette entries
memset(palette + npal, 0, AVPALETTE_SIZE - npal * 4);
- buf += esize;
-
+ // skip remaining palette bytes
+ bytestream2_seek(&s->g, pos_after_pal, SEEK_SET);
y = s->height - 1;
- if (bytestream_get_le16(&buf)) {
+ if (bytestream2_get_le16(&s->g)) {
x = 0;
plane = 0;
- while (y >= 0 && buf_end - buf >= 6) {
- const uint8_t *buf_pend = buf + FFMIN(AV_RL16(buf), buf_end - buf);
- //ignore uncompressed block size reported at buf[2]
- int marker = buf[4];
- buf += 5;
+ while (y >= 0 && bytestream2_get_bytes_left(&s->g) >= 6) {
+ int stop_size, marker, t1, t2;
+
+ t1 = bytestream2_get_bytes_left(&s->g);
+ t2 = bytestream2_get_le16(&s->g);
+ stop_size = t1 - FFMIN(t1, t2);
+ // ignore uncompressed block size
+ bytestream2_skip(&s->g, 2);
+ marker = bytestream2_get_byte(&s->g);
- while (plane < s->nb_planes && y >= 0 && buf_pend - buf >= 1) {
+ while (plane < s->nb_planes && y >= 0 &&
+ bytestream2_get_bytes_left(&s->g) > stop_size) {
int run = 1;
- int val = *buf++;
+ int val = bytestream2_get_byte(&s->g);
if (val == marker) {
- run = *buf++;
+ run = bytestream2_get_byte(&s->g);
if (run == 0)
- run = bytestream_get_le16(&buf);
- val = *buf++;
+ run = bytestream2_get_le16(&s->g);
+ val = bytestream2_get_byte(&s->g);
}
- if (buf > buf_end)
+ if (!bytestream2_get_bytes_left(&s->g))
break;
if (bits_per_plane == 8) {
@@ -228,16 +241,16 @@ static int decode_frame(AVCodecContext *avctx,
}
}
} else {
- while (y >= 0 && buf < buf_end) {
- memcpy(s->frame.data[0] + y * s->frame.linesize[0], buf, FFMIN(avctx->width, buf_end - buf));
- buf += avctx->width;
+ while (y >= 0 && bytestream2_get_bytes_left(&s->g) > 0) {
+ memcpy(s->frame.data[0] + y * s->frame.linesize[0], s->g.buffer, FFMIN(avctx->width, bytestream2_get_bytes_left(&s->g)));
+ bytestream2_skip(&s->g, avctx->width);
y--;
}
}
*data_size = sizeof(AVFrame);
*(AVFrame*)data = s->frame;
- return buf_size;
+ return avpkt->size;
}
static av_cold int decode_end(AVCodecContext *avctx)
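
A short sketch of the bytestream2 reader pattern that the pictordec change above switches to (hypothetical helper, not from this diff); compared with raw pointer arithmetic, every read is bounds-checked, and the _le16u/_byteu variants seen above simply skip that per-read check once the overall length has been verified:

    #include "libavcodec/avcodec.h"
    #include "libavcodec/bytestream.h"
    #include "libavutil/error.h"

    static int parse_pic_header(AVPacket *avpkt, int *width, int *height)
    {
        GetByteContext g;

        bytestream2_init(&g, avpkt->data, avpkt->size);
        if (bytestream2_get_bytes_left(&g) < 11)
            return AVERROR_INVALIDDATA;          /* too short for a PIC header */

        if (bytestream2_get_le16(&g) != 0x1234)
            return AVERROR_INVALIDDATA;          /* bad magic */
        *width  = bytestream2_get_le16(&g);
        *height = bytestream2_get_le16(&g);
        bytestream2_skip(&g, 4);                 /* reads past the end clamp safely */
        return 0;
    }
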
diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
index 2be9b3cd38..e09d5dcf14 100644
--- a/libavcodec/rv34.c
+++ b/libavcodec/rv34.c
@@ -240,15 +240,15 @@ static inline void decode_subblock(DCTELEM *dst, int code, const int is_block2,
{
int flags = modulo_three_table[code];
- decode_coeff( dst+0, (flags >> 6) , 3, gb, vlc, q);
+ decode_coeff( dst+0*4+0, (flags >> 6) , 3, gb, vlc, q);
if(is_block2){
- decode_coeff(dst+8, (flags >> 4) & 3, 2, gb, vlc, q);
- decode_coeff(dst+1, (flags >> 2) & 3, 2, gb, vlc, q);
+ decode_coeff(dst+1*4+0, (flags >> 4) & 3, 2, gb, vlc, q);
+ decode_coeff(dst+0*4+1, (flags >> 2) & 3, 2, gb, vlc, q);
}else{
- decode_coeff(dst+1, (flags >> 4) & 3, 2, gb, vlc, q);
- decode_coeff(dst+8, (flags >> 2) & 3, 2, gb, vlc, q);
+ decode_coeff(dst+0*4+1, (flags >> 4) & 3, 2, gb, vlc, q);
+ decode_coeff(dst+1*4+0, (flags >> 2) & 3, 2, gb, vlc, q);
}
- decode_coeff( dst+9, (flags >> 0) & 3, 2, gb, vlc, q);
+ decode_coeff( dst+1*4+1, (flags >> 0) & 3, 2, gb, vlc, q);
}
/**
@@ -265,15 +265,15 @@ static inline void decode_subblock3(DCTELEM *dst, int code, const int is_block2,
{
int flags = modulo_three_table[code];
- decode_coeff( dst+0, (flags >> 6) , 3, gb, vlc, q_dc);
+ decode_coeff( dst+0*4+0, (flags >> 6) , 3, gb, vlc, q_dc);
if(is_block2){
- decode_coeff(dst+8, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
- decode_coeff(dst+1, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
+ decode_coeff(dst+1*4+0, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
+ decode_coeff(dst+0*4+1, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
}else{
- decode_coeff(dst+1, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
- decode_coeff(dst+8, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
+ decode_coeff(dst+0*4+1, (flags >> 4) & 3, 2, gb, vlc, q_ac1);
+ decode_coeff(dst+1*4+0, (flags >> 2) & 3, 2, gb, vlc, q_ac1);
}
- decode_coeff( dst+9, (flags >> 0) & 3, 2, gb, vlc, q_ac2);
+ decode_coeff( dst+1*4+1, (flags >> 0) & 3, 2, gb, vlc, q_ac2);
}
/**
@@ -308,15 +308,15 @@ static inline int rv34_decode_block(DCTELEM *dst, GetBitContext *gb, RV34VLC *rv
if(pattern & 4){
code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2);
- decode_subblock(dst + 2, code, 0, gb, &rvlc->coefficient, q_ac2);
+ decode_subblock(dst + 4*0+2, code, 0, gb, &rvlc->coefficient, q_ac2);
}
if(pattern & 2){ // Looks like coefficients 1 and 2 are swapped for this block
code = get_vlc2(gb, rvlc->second_pattern[sc].table, 9, 2);
- decode_subblock(dst + 8*2, code, 1, gb, &rvlc->coefficient, q_ac2);
+ decode_subblock(dst + 4*2+0, code, 1, gb, &rvlc->coefficient, q_ac2);
}
if(pattern & 1){
code = get_vlc2(gb, rvlc->third_pattern[sc].table, 9, 2);
- decode_subblock(dst + 8*2+2, code, 0, gb, &rvlc->coefficient, q_ac2);
+ decode_subblock(dst + 4*2+2, code, 0, gb, &rvlc->coefficient, q_ac2);
}
return has_ac || pattern;
}
@@ -351,44 +351,70 @@ static inline RV34VLC* choose_vlc_set(int quant, int mod, int type)
}
/**
- * Decode macroblock header and return CBP in case of success, -1 otherwise.
+ * Decode intra macroblock header and return CBP in case of success, -1 otherwise.
*/
-static int rv34_decode_mb_header(RV34DecContext *r, int8_t *intra_types)
+static int rv34_decode_intra_mb_header(RV34DecContext *r, int8_t *intra_types)
{
MpegEncContext *s = &r->s;
GetBitContext *gb = &s->gb;
int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
- int i, t;
+ int t;
- if(!r->si.type){
- r->is16 = get_bits1(gb);
- if(!r->is16 && !r->rv30){
+ r->is16 = get_bits1(gb);
+ if(r->is16){
+ s->current_picture_ptr->f.mb_type[mb_pos] = MB_TYPE_INTRA16x16;
+ r->block_type = RV34_MB_TYPE_INTRA16x16;
+ t = get_bits(gb, 2);
+ fill_rectangle(intra_types, 4, 4, r->intra_types_stride, t, sizeof(intra_types[0]));
+ r->luma_vlc = 2;
+ }else{
+ if(!r->rv30){
if(!get_bits1(gb))
av_log(s->avctx, AV_LOG_ERROR, "Need DQUANT\n");
}
- s->current_picture_ptr->f.mb_type[mb_pos] = r->is16 ? MB_TYPE_INTRA16x16 : MB_TYPE_INTRA;
- r->block_type = r->is16 ? RV34_MB_TYPE_INTRA16x16 : RV34_MB_TYPE_INTRA;
- }else{
- r->block_type = r->decode_mb_info(r);
- if(r->block_type == -1)
+ s->current_picture_ptr->f.mb_type[mb_pos] = MB_TYPE_INTRA;
+ r->block_type = RV34_MB_TYPE_INTRA;
+ if(r->decode_intra_types(r, gb, intra_types) < 0)
return -1;
- s->current_picture_ptr->f.mb_type[mb_pos] = rv34_mb_type_to_lavc[r->block_type];
- r->mb_type[mb_pos] = r->block_type;
- if(r->block_type == RV34_MB_SKIP){
- if(s->pict_type == AV_PICTURE_TYPE_P)
- r->mb_type[mb_pos] = RV34_MB_P_16x16;
- if(s->pict_type == AV_PICTURE_TYPE_B)
- r->mb_type[mb_pos] = RV34_MB_B_DIRECT;
- }
- r->is16 = !!IS_INTRA16x16(s->current_picture_ptr->f.mb_type[mb_pos]);
- rv34_decode_mv(r, r->block_type);
- if(r->block_type == RV34_MB_SKIP){
- fill_rectangle(intra_types, 4, 4, r->intra_types_stride, 0, sizeof(intra_types[0]));
- return 0;
- }
- r->chroma_vlc = 1;
- r->luma_vlc = 0;
+ r->luma_vlc = 1;
+ }
+
+ r->chroma_vlc = 0;
+ r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 0);
+
+ return rv34_decode_cbp(gb, r->cur_vlcs, r->is16);
+}
+
+/**
+ * Decode inter macroblock header and return CBP in case of success, -1 otherwise.
+ */
+static int rv34_decode_inter_mb_header(RV34DecContext *r, int8_t *intra_types)
+{
+ MpegEncContext *s = &r->s;
+ GetBitContext *gb = &s->gb;
+ int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+ int i, t;
+
+ r->block_type = r->decode_mb_info(r);
+ if(r->block_type == -1)
+ return -1;
+ s->current_picture_ptr->f.mb_type[mb_pos] = rv34_mb_type_to_lavc[r->block_type];
+ r->mb_type[mb_pos] = r->block_type;
+ if(r->block_type == RV34_MB_SKIP){
+ if(s->pict_type == AV_PICTURE_TYPE_P)
+ r->mb_type[mb_pos] = RV34_MB_P_16x16;
+ if(s->pict_type == AV_PICTURE_TYPE_B)
+ r->mb_type[mb_pos] = RV34_MB_B_DIRECT;
}
+ r->is16 = !!IS_INTRA16x16(s->current_picture_ptr->f.mb_type[mb_pos]);
+ rv34_decode_mv(r, r->block_type);
+ if(r->block_type == RV34_MB_SKIP){
+ fill_rectangle(intra_types, 4, 4, r->intra_types_stride, 0, sizeof(intra_types[0]));
+ return 0;
+ }
+ r->chroma_vlc = 1;
+ r->luma_vlc = 0;
+
if(IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos])){
if(r->is16){
t = get_bits(gb, 2);
@@ -956,15 +982,6 @@ static void rv34_pred_4x4_block(RV34DecContext *r, uint8_t *dst, int stride, int
r->h.pred4x4[itype](dst, prev, stride);
}
-/** add_pixels_clamped for 4x4 block */
-static void rv34_add_4x4_block(uint8_t *dst, int stride, DCTELEM block[64], int off)
-{
- int x, y;
- for(y = 0; y < 4; y++)
- for(x = 0; x < 4; x++)
- dst[x + y*stride] = av_clip_uint8(dst[x + y*stride] + block[off + x+y*8]);
-}
-
static inline int adjust_pred16(int itype, int up, int left)
{
if(!up && !left)
@@ -981,15 +998,35 @@ static inline int adjust_pred16(int itype, int up, int left)
return itype;
}
-static void rv34_output_macroblock(RV34DecContext *r, int8_t *intra_types, int cbp, int is16)
+static inline void rv34_process_block(RV34DecContext *r,
+ uint8_t *pdst, int stride,
+ int fc, int sc, int q_dc, int q_ac)
{
MpegEncContext *s = &r->s;
- DSPContext *dsp = &s->dsp;
- int i, j;
- uint8_t *Y, *U, *V;
- int itype;
- int avail[6*8] = {0};
- int idx;
+ DCTELEM *ptr = s->block[0];
+ int has_ac = rv34_decode_block(ptr, &s->gb, r->cur_vlcs,
+ fc, sc, q_dc, q_ac, q_ac);
+ if(has_ac){
+ r->rdsp.rv34_idct_add(pdst, stride, ptr);
+ }else{
+ r->rdsp.rv34_idct_dc_add(pdst, stride, ptr[0]);
+ ptr[0] = 0;
+ }
+}
+
+static void rv34_output_i16x16(RV34DecContext *r, int8_t *intra_types, int cbp)
+{
+ LOCAL_ALIGNED_16(DCTELEM, block16, [16]);
+ MpegEncContext *s = &r->s;
+ GetBitContext *gb = &s->gb;
+ int q_dc = rv34_qscale_tab[ r->luma_dc_quant_i[s->qscale] ],
+ q_ac = rv34_qscale_tab[s->qscale];
+ uint8_t *dst = s->dest[0];
+ DCTELEM *ptr = s->block[0];
+ int avail[6*8] = {0};
+ int i, j, itype, has_ac;
+
+ memset(block16, 0, 16 * sizeof(*block16));
// Set neighbour information.
if(r->avail_cache[1])
@@ -1005,80 +1042,118 @@ static void rv34_output_macroblock(RV34DecContext *r, int8_t *intra_types, int c
if(r->avail_cache[9])
avail[24] = avail[32] = 1;
- Y = s->dest[0];
- U = s->dest[1];
- V = s->dest[2];
- if(!is16){
- for(j = 0; j < 4; j++){
- idx = 9 + j*8;
- for(i = 0; i < 4; i++, cbp >>= 1, Y += 4, idx++){
- rv34_pred_4x4_block(r, Y, s->linesize, ittrans[intra_types[i]], avail[idx-8], avail[idx-1], avail[idx+7], avail[idx-7]);
- avail[idx] = 1;
- if(cbp & 1)
- rv34_add_4x4_block(Y, s->linesize, s->block[(i>>1)+(j&2)], (i&1)*4+(j&1)*32);
- }
- Y += s->linesize * 4 - 4*4;
- intra_types += r->intra_types_stride;
+ has_ac = rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac);
+ if(has_ac)
+ r->rdsp.rv34_inv_transform(block16);
+ else
+ r->rdsp.rv34_inv_transform_dc(block16);
+
+ itype = ittrans16[intra_types[0]];
+ itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
+ r->h.pred16x16[itype](dst, s->linesize);
+
+ for(j = 0; j < 4; j++){
+ for(i = 0; i < 4; i++, cbp >>= 1){
+ int dc = block16[i + j*4];
+
+ if(cbp & 1){
+ has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+ }else
+ has_ac = 0;
+
+ if(has_ac){
+ ptr[0] = dc;
+ r->rdsp.rv34_idct_add(dst+4*i, s->linesize, ptr);
+ }else
+ r->rdsp.rv34_idct_dc_add(dst+4*i, s->linesize, dc);
}
- intra_types -= r->intra_types_stride * 4;
- fill_rectangle(r->avail_cache + 6, 2, 2, 4, 0, 4);
- for(j = 0; j < 2; j++){
- idx = 6 + j*4;
- for(i = 0; i < 2; i++, cbp >>= 1, idx++){
- rv34_pred_4x4_block(r, U + i*4 + j*4*s->uvlinesize, s->uvlinesize, ittrans[intra_types[i*2+j*2*r->intra_types_stride]], r->avail_cache[idx-4], r->avail_cache[idx-1], !i && !j, r->avail_cache[idx-3]);
- rv34_pred_4x4_block(r, V + i*4 + j*4*s->uvlinesize, s->uvlinesize, ittrans[intra_types[i*2+j*2*r->intra_types_stride]], r->avail_cache[idx-4], r->avail_cache[idx-1], !i && !j, r->avail_cache[idx-3]);
- r->avail_cache[idx] = 1;
- if(cbp & 0x01)
- rv34_add_4x4_block(U + i*4 + j*4*s->uvlinesize, s->uvlinesize, s->block[4], i*4+j*32);
- if(cbp & 0x10)
- rv34_add_4x4_block(V + i*4 + j*4*s->uvlinesize, s->uvlinesize, s->block[5], i*4+j*32);
- }
+
+ dst += 4*s->linesize;
+ }
+
+ itype = ittrans16[intra_types[0]];
+ if(itype == PLANE_PRED8x8) itype = DC_PRED8x8;
+ itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
+
+ q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
+ q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
+
+ for(j = 1; j < 3; j++){
+ dst = s->dest[j];
+ r->h.pred8x8[itype](dst, s->uvlinesize);
+ for(i = 0; i < 4; i++, cbp >>= 1){
+ uint8_t *pdst;
+ if(!(cbp & 1)) continue;
+ pdst = dst + (i&1)*4 + (i&2)*2*s->uvlinesize;
+
+ rv34_process_block(r, pdst, s->uvlinesize,
+ r->chroma_vlc, 1, q_dc, q_ac);
}
- }else{
- itype = ittrans16[intra_types[0]];
- itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
- r->h.pred16x16[itype](Y, s->linesize);
- dsp->add_pixels_clamped(s->block[0], Y, s->linesize);
- dsp->add_pixels_clamped(s->block[1], Y + 8, s->linesize);
- Y += s->linesize * 8;
- dsp->add_pixels_clamped(s->block[2], Y, s->linesize);
- dsp->add_pixels_clamped(s->block[3], Y + 8, s->linesize);
-
- itype = ittrans16[intra_types[0]];
- if(itype == PLANE_PRED8x8) itype = DC_PRED8x8;
- itype = adjust_pred16(itype, r->avail_cache[6-4], r->avail_cache[6-1]);
- r->h.pred8x8[itype](U, s->uvlinesize);
- dsp->add_pixels_clamped(s->block[4], U, s->uvlinesize);
- r->h.pred8x8[itype](V, s->uvlinesize);
- dsp->add_pixels_clamped(s->block[5], V, s->uvlinesize);
}
}
-/**
- * mask for retrieving all bits in coded block pattern
- * corresponding to one 8x8 block
- */
-#define LUMA_CBP_BLOCK_MASK 0x33
+static void rv34_output_intra(RV34DecContext *r, int8_t *intra_types, int cbp)
+{
+ MpegEncContext *s = &r->s;
+ uint8_t *dst = s->dest[0];
+ int avail[6*8] = {0};
+ int i, j, k;
+ int idx, q_ac, q_dc;
-#define U_CBP_MASK 0x0F0000
-#define V_CBP_MASK 0xF00000
+ // Set neighbour information.
+ if(r->avail_cache[1])
+ avail[0] = 1;
+ if(r->avail_cache[2])
+ avail[1] = avail[2] = 1;
+ if(r->avail_cache[3])
+ avail[3] = avail[4] = 1;
+ if(r->avail_cache[4])
+ avail[5] = 1;
+ if(r->avail_cache[5])
+ avail[8] = avail[16] = 1;
+ if(r->avail_cache[9])
+ avail[24] = avail[32] = 1;
-/** @} */ // recons group
+ q_ac = rv34_qscale_tab[s->qscale];
+ for(j = 0; j < 4; j++){
+ idx = 9 + j*8;
+ for(i = 0; i < 4; i++, cbp >>= 1, dst += 4, idx++){
+ rv34_pred_4x4_block(r, dst, s->linesize, ittrans[intra_types[i]], avail[idx-8], avail[idx-1], avail[idx+7], avail[idx-7]);
+ avail[idx] = 1;
+ if(!(cbp & 1)) continue;
+
+ rv34_process_block(r, dst, s->linesize,
+ r->luma_vlc, 0, q_ac, q_ac);
+ }
+ dst += s->linesize * 4 - 4*4;
+ intra_types += r->intra_types_stride;
+ }
+ intra_types -= r->intra_types_stride * 4;
-static void rv34_apply_differences(RV34DecContext *r, int cbp)
-{
- static const int shifts[4] = { 0, 2, 8, 10 };
- MpegEncContext *s = &r->s;
- int i;
+ q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
+ q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
+
+ for(k = 0; k < 2; k++){
+ dst = s->dest[1+k];
+ fill_rectangle(r->avail_cache + 6, 2, 2, 4, 0, 4);
+
+ for(j = 0; j < 2; j++){
+ int* acache = r->avail_cache + 6 + j*4;
+ for(i = 0; i < 2; i++, cbp >>= 1, acache++){
+ int itype = ittrans[intra_types[i*2+j*2*r->intra_types_stride]];
+ rv34_pred_4x4_block(r, dst+4*i, s->uvlinesize, itype, acache[-4], acache[-1], !i && !j, acache[-3]);
+ acache[0] = 1;
- for(i = 0; i < 4; i++)
- if((cbp & (LUMA_CBP_BLOCK_MASK << shifts[i])) || r->block_type == RV34_MB_P_MIX16x16)
- s->dsp.add_pixels_clamped(s->block[i], s->dest[0] + (i & 1)*8 + (i&2)*4*s->linesize, s->linesize);
- if(cbp & U_CBP_MASK)
- s->dsp.add_pixels_clamped(s->block[4], s->dest[1], s->uvlinesize);
- if(cbp & V_CBP_MASK)
- s->dsp.add_pixels_clamped(s->block[5], s->dest[2], s->uvlinesize);
+ if(!(cbp&1)) continue;
+
+ rv34_process_block(r, dst + 4*i, s->uvlinesize,
+ r->chroma_vlc, 1, q_dc, q_ac);
+ }
+
+ dst += 4*s->uvlinesize;
+ }
+ }
}
static int is_mv_diff_gt_3(int16_t (*motion_val)[2], int step)
@@ -1123,17 +1198,17 @@ static int rv34_set_deblock_coef(RV34DecContext *r)
return hmvmask | vmvmask;
}
-static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
+static int rv34_decode_inter_macroblock(RV34DecContext *r, int8_t *intra_types)
{
- MpegEncContext *s = &r->s;
- GetBitContext *gb = &s->gb;
+ MpegEncContext *s = &r->s;
+ GetBitContext *gb = &s->gb;
+ uint8_t *dst = s->dest[0];
+ DCTELEM *ptr = s->block[0];
+ int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
int cbp, cbp2;
int q_dc, q_ac, has_ac;
- int i, blknum, blkoff;
- LOCAL_ALIGNED_16(DCTELEM, block16, [64]);
- int luma_dc_quant;
+ int i, j;
int dist;
- int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
// Calculate which neighbours are available. Maybe it's worth optimizing too.
memset(r->avail_cache, 0, sizeof(r->avail_cache));
@@ -1151,70 +1226,126 @@ static int rv34_decode_macroblock(RV34DecContext *r, int8_t *intra_types)
r->avail_cache[1] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride - 1];
s->qscale = r->si.quant;
- cbp = cbp2 = rv34_decode_mb_header(r, intra_types);
+ cbp = cbp2 = rv34_decode_inter_mb_header(r, intra_types);
r->cbp_luma [mb_pos] = cbp;
r->cbp_chroma[mb_pos] = cbp >> 16;
- if(s->pict_type == AV_PICTURE_TYPE_I)
- r->deblock_coefs[mb_pos] = 0xFFFF;
- else
- r->deblock_coefs[mb_pos] = rv34_set_deblock_coef(r) | r->cbp_luma[mb_pos];
+ r->deblock_coefs[mb_pos] = rv34_set_deblock_coef(r) | r->cbp_luma[mb_pos];
s->current_picture_ptr->f.qscale_table[mb_pos] = s->qscale;
if(cbp == -1)
return -1;
- luma_dc_quant = r->block_type == RV34_MB_P_MIX16x16 ? r->luma_dc_quant_p[s->qscale] : r->luma_dc_quant_i[s->qscale];
+ if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos])){
+ if(r->is16) rv34_output_i16x16(r, intra_types, cbp);
+ else rv34_output_intra(r, intra_types, cbp);
+ return 0;
+ }
+
if(r->is16){
- q_dc = rv34_qscale_tab[luma_dc_quant];
+ // Only for RV34_MB_P_MIX16x16
+ LOCAL_ALIGNED_16(DCTELEM, block16, [16]);
+ memset(block16, 0, 16 * sizeof(*block16));
+ q_dc = rv34_qscale_tab[ r->luma_dc_quant_p[s->qscale] ];
q_ac = rv34_qscale_tab[s->qscale];
- s->dsp.clear_block(block16);
if (rv34_decode_block(block16, gb, r->cur_vlcs, 3, 0, q_dc, q_dc, q_ac))
- r->rdsp.rv34_inv_transform_tab[1](block16);
+ r->rdsp.rv34_inv_transform(block16);
else
- r->rdsp.rv34_inv_transform_dc_tab[1](block16);
- }
+ r->rdsp.rv34_inv_transform_dc(block16);
+
+ q_ac = rv34_qscale_tab[s->qscale];
+
+ for(j = 0; j < 4; j++){
+ for(i = 0; i < 4; i++, cbp >>= 1){
+ int dc = block16[i + j*4];
+
+ if(cbp & 1){
+ has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
+ }else
+ has_ac = 0;
+
+ if(has_ac){
+ ptr[0] = dc;
+ r->rdsp.rv34_idct_add(dst+4*i, s->linesize, ptr);
+ }else
+ r->rdsp.rv34_idct_dc_add(dst+4*i, s->linesize, dc);
+ }
+
+ dst += 4*s->linesize;
+ }
- q_ac = rv34_qscale_tab[s->qscale];
- for(i = 0; i < 16; i++, cbp >>= 1){
- DCTELEM *ptr;
- if(!r->is16 && !(cbp & 1)) continue;
- blknum = ((i & 2) >> 1) + ((i & 8) >> 2);
- blkoff = ((i & 1) << 2) + ((i & 4) << 3);
- ptr = s->block[blknum] + blkoff;
- if(cbp & 1)
- has_ac = rv34_decode_block(ptr, gb, r->cur_vlcs, r->luma_vlc, 0, q_ac, q_ac, q_ac);
- else
- has_ac = 0;
- if(r->is16) //FIXME: optimize
- ptr[0] = block16[(i & 3) | ((i & 0xC) << 1)];
- if(has_ac)
- r->rdsp.rv34_inv_transform_tab[0](ptr);
- else
- r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
- }
- if(r->block_type == RV34_MB_P_MIX16x16)
r->cur_vlcs = choose_vlc_set(r->si.quant, r->si.vlc_set, 1);
+ }else{
+ q_ac = rv34_qscale_tab[s->qscale];
+
+ for(j = 0; j < 4; j++){
+ for(i = 0; i < 4; i++, cbp >>= 1){
+ if(!(cbp & 1)) continue;
+
+ rv34_process_block(r, dst + 4*i, s->linesize,
+ r->luma_vlc, 0, q_ac, q_ac);
+ }
+ dst += 4*s->linesize;
+ }
+ }
+
q_dc = rv34_qscale_tab[rv34_chroma_quant[1][s->qscale]];
q_ac = rv34_qscale_tab[rv34_chroma_quant[0][s->qscale]];
- for(; i < 24; i++, cbp >>= 1){
- DCTELEM *ptr;
- if(!(cbp & 1)) continue;
- blknum = ((i & 4) >> 2) + 4;
- blkoff = ((i & 1) << 2) + ((i & 2) << 4);
- ptr = s->block[blknum] + blkoff;
- if (rv34_decode_block(ptr, gb, r->cur_vlcs, r->chroma_vlc, 1, q_dc, q_ac, q_ac))
- r->rdsp.rv34_inv_transform_tab[0](ptr);
- else
- r->rdsp.rv34_inv_transform_dc_tab[0](ptr);
+
+ for(j = 1; j < 3; j++){
+ dst = s->dest[j];
+ for(i = 0; i < 4; i++, cbp >>= 1){
+ uint8_t *pdst;
+ if(!(cbp & 1)) continue;
+ pdst = dst + (i&1)*4 + (i&2)*2*s->uvlinesize;
+
+ rv34_process_block(r, pdst, s->uvlinesize,
+ r->chroma_vlc, 1, q_dc, q_ac);
+ }
}
- if (IS_INTRA(s->current_picture_ptr->f.mb_type[mb_pos]))
- rv34_output_macroblock(r, intra_types, cbp2, r->is16);
- else
- rv34_apply_differences(r, cbp2);
return 0;
}
+static int rv34_decode_intra_macroblock(RV34DecContext *r, int8_t *intra_types)
+{
+ MpegEncContext *s = &r->s;
+ int cbp, dist;
+ int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
+
+ // Calculate which neighbours are available. Maybe it's worth optimizing too.
+ memset(r->avail_cache, 0, sizeof(r->avail_cache));
+ fill_rectangle(r->avail_cache + 6, 2, 2, 4, 1, 4);
+ dist = (s->mb_x - s->resync_mb_x) + (s->mb_y - s->resync_mb_y) * s->mb_width;
+ if(s->mb_x && dist)
+ r->avail_cache[5] =
+ r->avail_cache[9] = s->current_picture_ptr->f.mb_type[mb_pos - 1];
+ if(dist >= s->mb_width)
+ r->avail_cache[2] =
+ r->avail_cache[3] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride];
+ if(((s->mb_x+1) < s->mb_width) && dist >= s->mb_width - 1)
+ r->avail_cache[4] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride + 1];
+ if(s->mb_x && dist > s->mb_width)
+ r->avail_cache[1] = s->current_picture_ptr->f.mb_type[mb_pos - s->mb_stride - 1];
+
+ s->qscale = r->si.quant;
+ cbp = rv34_decode_intra_mb_header(r, intra_types);
+ r->cbp_luma [mb_pos] = cbp;
+ r->cbp_chroma[mb_pos] = cbp >> 16;
+ r->deblock_coefs[mb_pos] = 0xFFFF;
+ s->current_picture_ptr->f.qscale_table[mb_pos] = s->qscale;
+
+ if(cbp == -1)
+ return -1;
+
+ if(r->is16){
+ rv34_output_i16x16(r, intra_types, cbp);
+ return 0;
+ }
+
+ rv34_output_intra(r, intra_types, cbp);
+ return 0;
+}
+
static int check_slice_end(RV34DecContext *r, MpegEncContext *s)
{
int bits;
@@ -1326,9 +1457,12 @@ static int rv34_decode_slice(RV34DecContext *r, int end, const uint8_t* buf, int
ff_init_block_index(s);
while(!check_slice_end(r, s)) {
ff_update_block_index(s);
- s->dsp.clear_blocks(s->block[0]);
- if(rv34_decode_macroblock(r, r->intra_types + s->mb_x * 4 + 4) < 0){
+ if(r->si.type)
+ res = rv34_decode_inter_macroblock(r, r->intra_types + s->mb_x * 4 + 4);
+ else
+ res = rv34_decode_intra_macroblock(r, r->intra_types + s->mb_x * 4 + 4);
+ if(res < 0){
ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, ER_MB_ERROR);
return -1;
}
diff --git a/libavcodec/rv34dsp.c b/libavcodec/rv34dsp.c
index 1767be4173..e2251773af 100644
--- a/libavcodec/rv34dsp.c
+++ b/libavcodec/rv34dsp.c
@@ -37,10 +37,10 @@ static av_always_inline void rv34_row_transform(int temp[16], DCTELEM *block)
int i;
for(i = 0; i < 4; i++){
- const int z0 = 13*(block[i+8*0] + block[i+8*2]);
- const int z1 = 13*(block[i+8*0] - block[i+8*2]);
- const int z2 = 7* block[i+8*1] - 17*block[i+8*3];
- const int z3 = 17* block[i+8*1] + 7*block[i+8*3];
+ const int z0 = 13*(block[i+4*0] + block[i+4*2]);
+ const int z1 = 13*(block[i+4*0] - block[i+4*2]);
+ const int z2 = 7* block[i+4*1] - 17*block[i+4*3];
+ const int z3 = 17* block[i+4*1] + 7*block[i+4*3];
temp[4*i+0] = z0 + z3;
temp[4*i+1] = z1 + z2;
@@ -50,14 +50,16 @@ static av_always_inline void rv34_row_transform(int temp[16], DCTELEM *block)
}
/**
- * Real Video 3.0/4.0 inverse transform
+ * Real Video 3.0/4.0 inverse transform + sample reconstruction
* Code is almost the same as in SVQ3, only scaling is different.
*/
-static void rv34_inv_transform_c(DCTELEM *block){
- int temp[16];
- int i;
+static void rv34_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){
+ int temp[16];
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ int i;
rv34_row_transform(temp, block);
+ memset(block, 0, 16*sizeof(DCTELEM));
for(i = 0; i < 4; i++){
const int z0 = 13*(temp[4*0+i] + temp[4*2+i]) + 0x200;
@@ -65,10 +67,12 @@ static void rv34_inv_transform_c(DCTELEM *block){
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
- block[i*8+0] = (z0 + z3) >> 10;
- block[i*8+1] = (z1 + z2) >> 10;
- block[i*8+2] = (z1 - z2) >> 10;
- block[i*8+3] = (z0 - z3) >> 10;
+ dst[0] = cm[ dst[0] + ( (z0 + z3) >> 10 ) ];
+ dst[1] = cm[ dst[1] + ( (z1 + z2) >> 10 ) ];
+ dst[2] = cm[ dst[2] + ( (z1 - z2) >> 10 ) ];
+ dst[3] = cm[ dst[3] + ( (z0 - z3) >> 10 ) ];
+
+ dst += stride;
}
}
@@ -90,21 +94,27 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
- block[i*8+0] = ((z0 + z3) * 3) >> 11;
- block[i*8+1] = ((z1 + z2) * 3) >> 11;
- block[i*8+2] = ((z1 - z2) * 3) >> 11;
- block[i*8+3] = ((z0 - z3) * 3) >> 11;
+ block[i*4+0] = ((z0 + z3) * 3) >> 11;
+ block[i*4+1] = ((z1 + z2) * 3) >> 11;
+ block[i*4+2] = ((z1 - z2) * 3) >> 11;
+ block[i*4+3] = ((z0 - z3) * 3) >> 11;
}
}
-static void rv34_inv_transform_dc_c(DCTELEM *block)
+static void rv34_idct_dc_add_c(uint8_t *dst, int stride, int dc)
{
- DCTELEM dc = (13 * 13 * block[0] + 0x200) >> 10;
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
int i, j;
- for (i = 0; i < 4; i++, block += 8)
+ cm += (13*13*dc + 0x200) >> 10;
+
+ for (i = 0; i < 4; i++)
+ {
for (j = 0; j < 4; j++)
- block[j] = dc;
+ dst[j] = cm[ dst[j] ];
+
+ dst += stride;
+ }
}
static void rv34_inv_transform_dc_noround_c(DCTELEM *block)
@@ -112,7 +122,7 @@ static void rv34_inv_transform_dc_noround_c(DCTELEM *block)
DCTELEM dc = (13 * 13 * 3 * block[0]) >> 11;
int i, j;
- for (i = 0; i < 4; i++, block += 8)
+ for (i = 0; i < 4; i++, block += 4)
for (j = 0; j < 4; j++)
block[j] = dc;
}
@@ -121,10 +131,11 @@ static void rv34_inv_transform_dc_noround_c(DCTELEM *block)
av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) {
- c->rv34_inv_transform_tab[0] = rv34_inv_transform_c;
- c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c;
- c->rv34_inv_transform_dc_tab[0] = rv34_inv_transform_dc_c;
- c->rv34_inv_transform_dc_tab[1] = rv34_inv_transform_dc_noround_c;
+ c->rv34_inv_transform = rv34_inv_transform_noround_c;
+ c->rv34_inv_transform_dc = rv34_inv_transform_dc_noround_c;
+
+ c->rv34_idct_add = rv34_idct_add_c;
+ c->rv34_idct_dc_add = rv34_idct_dc_add_c;
if (HAVE_NEON)
ff_rv34dsp_init_neon(c, dsp);
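
A quick worked check of the scaling in rv34_idct_dc_add_c() above (illustrative numbers only): with just the DC coefficient set, each 1-D pass multiplies by 13, so all 16 output samples get the same value added,

    (13*13*dc + 0x200) >> 10,   e.g. dc = 64:  (10816 + 512) >> 10 = 11,

which is exactly what the full rv34_idct_add_c() would produce for a DC-only block; the dedicated DC function is just that shortcut.
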
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 6f53a09928..fe8fcaa8dd 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -36,6 +36,10 @@ typedef void (*rv40_weight_func)(uint8_t *dst/*align width (8 or 16)*/,
typedef void (*rv34_inv_transform_func)(DCTELEM *block);
+typedef void (*rv34_idct_add_func)(uint8_t *dst, int stride, DCTELEM *block);
+typedef void (*rv34_idct_dc_add_func)(uint8_t *dst, int stride,
+ int dc);
+
typedef void (*rv40_weak_loop_filter_func)(uint8_t *src, int stride,
int filter_p1, int filter_q1,
int alpha, int beta,
@@ -55,8 +59,10 @@ typedef struct RV34DSPContext {
h264_chroma_mc_func put_chroma_pixels_tab[3];
h264_chroma_mc_func avg_chroma_pixels_tab[3];
rv40_weight_func rv40_weight_pixels_tab[2];
- rv34_inv_transform_func rv34_inv_transform_tab[2];
- void (*rv34_inv_transform_dc_tab[2])(DCTELEM *block);
+ rv34_inv_transform_func rv34_inv_transform;
+ rv34_inv_transform_func rv34_inv_transform_dc;
+ rv34_idct_add_func rv34_idct_add;
+ rv34_idct_dc_add_func rv34_idct_dc_add;
rv40_weak_loop_filter_func rv40_weak_loop_filter[2];
rv40_strong_loop_filter_func rv40_strong_loop_filter[2];
rv40_loop_filter_strength_func rv40_loop_filter_strength[2];
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 462288446b..7ea9c54f31 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -25,6 +25,7 @@
* utils.
*/
+#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/crc.h"
#include "libavutil/mathematics.h"
@@ -102,6 +103,16 @@ void avcodec_init(void)
dsputil_static_init();
}
+static av_always_inline int codec_is_encoder(AVCodec *codec)
+{
+ return codec && (codec->encode || codec->encode2);
+}
+
+static av_always_inline int codec_is_decoder(AVCodec *codec)
+{
+ return codec && codec->decode;
+}
+
void avcodec_register(AVCodec *codec)
{
AVCodec **p;
@@ -260,11 +271,47 @@ void ff_init_buffer_info(AVCodecContext *s, AVFrame *pic)
pic->format = s->pix_fmt;
}
+int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+ enum AVSampleFormat sample_fmt, const uint8_t *buf,
+ int buf_size, int align)
+{
+ int ch, planar, needed_size, ret = 0;
+
+ needed_size = av_samples_get_buffer_size(NULL, nb_channels,
+ frame->nb_samples, sample_fmt,
+ align);
+ if (buf_size < needed_size)
+ return AVERROR(EINVAL);
+
+ planar = av_sample_fmt_is_planar(sample_fmt);
+ if (planar && nb_channels > AV_NUM_DATA_POINTERS) {
+ if (!(frame->extended_data = av_mallocz(nb_channels *
+ sizeof(*frame->extended_data))))
+ return AVERROR(ENOMEM);
+ } else {
+ frame->extended_data = frame->data;
+ }
+
+ if ((ret = av_samples_fill_arrays(frame->extended_data, &frame->linesize[0],
+ buf, nb_channels, frame->nb_samples,
+ sample_fmt, align)) < 0) {
+ if (frame->extended_data != frame->data)
+ av_free(frame->extended_data);
+ return ret;
+ }
+ if (frame->extended_data != frame->data) {
+ for (ch = 0; ch < AV_NUM_DATA_POINTERS; ch++)
+ frame->data[ch] = frame->extended_data[ch];
+ }
+
+ return ret;
+}
+
static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
{
AVCodecInternal *avci = avctx->internal;
InternalBuffer *buf;
- int buf_size, ret, i, needs_extended_data;
+ int buf_size, ret;
buf_size = av_samples_get_buffer_size(NULL, avctx->channels,
frame->nb_samples, avctx->sample_fmt,
@@ -272,9 +319,6 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
if (buf_size < 0)
return AVERROR(EINVAL);
- needs_extended_data = av_sample_fmt_is_planar(avctx->sample_fmt) &&
- avctx->channels > AV_NUM_DATA_POINTERS;
-
/* allocate InternalBuffer if needed */
if (!avci->buffer) {
avci->buffer = av_mallocz(sizeof(InternalBuffer));
@@ -306,48 +350,31 @@ static int audio_get_buffer(AVCodecContext *avctx, AVFrame *frame)
/* if there is no previous buffer or the previous buffer cannot be used
as-is, allocate a new buffer and/or rearrange the channel pointers */
if (!buf->extended_data) {
- /* if the channel pointers will fit, just set extended_data to data,
- otherwise allocate the extended_data channel pointers */
- if (needs_extended_data) {
- buf->extended_data = av_mallocz(avctx->channels *
- sizeof(*buf->extended_data));
- if (!buf->extended_data)
+ if (!buf->data[0]) {
+ if (!(buf->data[0] = av_mallocz(buf_size)))
return AVERROR(ENOMEM);
- } else {
- buf->extended_data = buf->data;
- }
-
- /* if there is a previous buffer and it is large enough, reuse it and
- just fill-in new channel pointers and linesize, otherwise allocate
- a new buffer */
- if (buf->extended_data[0]) {
- ret = av_samples_fill_arrays(buf->extended_data, &buf->linesize[0],
- buf->extended_data[0], avctx->channels,
- frame->nb_samples, avctx->sample_fmt,
- 32);
- } else {
- ret = av_samples_alloc(buf->extended_data, &buf->linesize[0],
- avctx->channels, frame->nb_samples,
- avctx->sample_fmt, 32);
+ buf->audio_data_size = buf_size;
}
- if (ret)
+ if ((ret = avcodec_fill_audio_frame(frame, avctx->channels,
+ avctx->sample_fmt, buf->data[0],
+ buf->audio_data_size, 32)))
return ret;
- /* if data was not used for extended_data, we need to copy as many of
- the extended_data channel pointers as will fit */
- if (needs_extended_data) {
- for (i = 0; i < AV_NUM_DATA_POINTERS; i++)
- buf->data[i] = buf->extended_data[i];
- }
- buf->audio_data_size = buf_size;
- buf->nb_channels = avctx->channels;
+ if (frame->extended_data == frame->data)
+ buf->extended_data = buf->data;
+ else
+ buf->extended_data = frame->extended_data;
+ memcpy(buf->data, frame->data, sizeof(frame->data));
+ buf->linesize[0] = frame->linesize[0];
+ buf->nb_channels = avctx->channels;
+ } else {
+ /* copy InternalBuffer info to the AVFrame */
+ frame->extended_data = buf->extended_data;
+ frame->linesize[0] = buf->linesize[0];
+ memcpy(frame->data, buf->data, sizeof(frame->data));
}
- /* copy InternalBuffer info to the AVFrame */
frame->type = FF_BUFFER_TYPE_INTERNAL;
- frame->extended_data = buf->extended_data;
- frame->linesize[0] = buf->linesize[0];
- memcpy(frame->data, buf->data, sizeof(frame->data));
if (avctx->pkt) {
frame->pkt_pts = avctx->pkt->pts;
@@ -732,7 +759,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
/* if the decoder init function was already called previously,
free the already allocated subtitle_header before overwriting it */
- if (codec->decode)
+ if (codec_is_decoder(codec))
av_freep(&avctx->subtitle_header);
#define SANE_NB_CHANNELS 128U
@@ -789,7 +816,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, AVCodec *codec, AVD
ret = AVERROR(EINVAL);
goto free_and_end;
}
- if (avctx->codec->encode) {
+ if (codec_is_encoder(avctx->codec)) {
int i;
if (avctx->codec->sample_fmts) {
for (i = 0; avctx->codec->sample_fmts[i] != AV_SAMPLE_FMT_NONE; i++)
@@ -870,21 +897,225 @@ free_and_end:
goto end;
}
-int attribute_align_arg avcodec_encode_audio(AVCodecContext *avctx, uint8_t *buf, int buf_size,
- const short *samples)
+int ff_alloc_packet(AVPacket *avpkt, int size)
{
- if(buf_size < FF_MIN_BUFFER_SIZE && 0){
- av_log(avctx, AV_LOG_ERROR, "buffer smaller than minimum size\n");
- return -1;
+ if (size > INT_MAX - FF_INPUT_BUFFER_PADDING_SIZE)
+ return AVERROR(EINVAL);
+
+ if (avpkt->data) {
+ uint8_t *pkt_data;
+ int pkt_size;
+
+ if (avpkt->size < size)
+ return AVERROR(EINVAL);
+
+ pkt_data = avpkt->data;
+ pkt_size = avpkt->size;
+ av_init_packet(avpkt);
+ avpkt->data = pkt_data;
+ avpkt->size = pkt_size;
+ return 0;
+ } else {
+ return av_new_packet(avpkt, size);
}
- if((avctx->codec->capabilities & CODEC_CAP_DELAY) || samples){
- int ret = avctx->codec->encode(avctx, buf, buf_size, samples);
- avctx->frame_number++;
- return ret;
- }else
+}
+
+int attribute_align_arg avcodec_encode_audio2(AVCodecContext *avctx,
+ AVPacket *avpkt,
+ const AVFrame *frame,
+ int *got_packet_ptr)
+{
+ int ret;
+ int user_packet = !!avpkt->data;
+ int nb_samples;
+
+ if (!(avctx->codec->capabilities & CODEC_CAP_DELAY) && !frame) {
+ av_init_packet(avpkt);
+ avpkt->size = 0;
return 0;
+ }
+
+ /* check for valid frame size */
+ if (frame) {
+ nb_samples = frame->nb_samples;
+ if (avctx->codec->capabilities & CODEC_CAP_SMALL_LAST_FRAME) {
+ if (nb_samples > avctx->frame_size)
+ return AVERROR(EINVAL);
+ } else if (!(avctx->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE)) {
+ if (nb_samples != avctx->frame_size)
+ return AVERROR(EINVAL);
+ }
+ } else {
+ nb_samples = avctx->frame_size;
+ }
+
+ if (avctx->codec->encode2) {
+ *got_packet_ptr = 0;
+ ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
+ if (!ret && *got_packet_ptr &&
+ !(avctx->codec->capabilities & CODEC_CAP_DELAY)) {
+ avpkt->pts = frame->pts;
+ avpkt->duration = av_rescale_q(frame->nb_samples,
+ (AVRational){ 1, avctx->sample_rate },
+ avctx->time_base);
+ }
+ } else {
+ /* for compatibility with encoders not supporting encode2(), we need to
+ allocate a packet buffer if the user has not provided one or check
+ the size otherwise */
+ int fs_tmp = 0;
+ int buf_size = avpkt->size;
+ if (!user_packet) {
+ if (avctx->codec->capabilities & CODEC_CAP_VARIABLE_FRAME_SIZE) {
+ av_assert0(av_get_bits_per_sample(avctx->codec_id) != 0);
+ buf_size = nb_samples * avctx->channels *
+ av_get_bits_per_sample(avctx->codec_id) / 8;
+ } else {
+ /* this is a guess as to the required size.
+ if an encoder needs more than this, it should probably
+ implement encode2() */
+ buf_size = 2 * avctx->frame_size * avctx->channels *
+ av_get_bytes_per_sample(avctx->sample_fmt);
+ buf_size += FF_MIN_BUFFER_SIZE;
+ }
+ }
+ if ((ret = ff_alloc_packet(avpkt, buf_size)))
+ return ret;
+
+ /* Encoders using AVCodec.encode() that support
+ CODEC_CAP_SMALL_LAST_FRAME require avctx->frame_size to be set to
+ the smaller size when encoding the last frame.
+ This code can be removed once all encoders supporting
+ CODEC_CAP_SMALL_LAST_FRAME use encode2() */
+ if ((avctx->codec->capabilities & CODEC_CAP_SMALL_LAST_FRAME) &&
+ nb_samples < avctx->frame_size) {
+ fs_tmp = avctx->frame_size;
+ avctx->frame_size = nb_samples;
+ }
+
+ /* encode the frame */
+ ret = avctx->codec->encode(avctx, avpkt->data, avpkt->size,
+ frame ? frame->data[0] : NULL);
+ if (ret >= 0) {
+ if (!ret) {
+ /* no output. if the packet data was allocated by libavcodec,
+ free it */
+ if (!user_packet)
+ av_freep(&avpkt->data);
+ } else {
+ if (avctx->coded_frame)
+ avpkt->pts = avctx->coded_frame->pts;
+ /* Set duration for final small packet. This can be removed
+ once all encoders supporting CODEC_CAP_SMALL_LAST_FRAME use
+ encode2() */
+ if (fs_tmp) {
+ avpkt->duration = av_rescale_q(avctx->frame_size,
+ (AVRational){ 1, avctx->sample_rate },
+ avctx->time_base);
+ }
+ }
+ avpkt->size = ret;
+ *got_packet_ptr = (ret > 0);
+ ret = 0;
+ }
+
+ if (fs_tmp)
+ avctx->frame_size = fs_tmp;
+ }
+ if (!ret)
+ avctx->frame_number++;
+
+ /* NOTE: if we add any audio encoders which output non-keyframe packets,
+ this needs to be moved to the encoders, but for now we can do it
+ here to simplify things */
+ avpkt->flags |= AV_PKT_FLAG_KEY;
+
+ return ret;
}
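
From the application side, avcodec_encode_audio2() replaces the fixed output buffer of the old call with an AVPacket plus an explicit got_packet flag. A small sketch of the intended calling pattern; write_packet() is a hypothetical sink standing in for a muxer:

    #include "libavcodec/avcodec.h"

    /* Sketch: encode one already-filled AVFrame and hand any resulting
     * packet to a caller-provided sink. */
    static int encode_one_frame(AVCodecContext *avctx, const AVFrame *frame,
                                int (*write_packet)(AVPacket *pkt))
    {
        AVPacket pkt;
        int got_packet = 0, ret;

        av_init_packet(&pkt);
        pkt.data = NULL;    /* let the encoder allocate the packet buffer */
        pkt.size = 0;

        ret = avcodec_encode_audio2(avctx, &pkt, frame, &got_packet);
        if (ret < 0)
            return ret;

        if (got_packet) {
            ret = write_packet(&pkt);   /* pts and duration are already set */
            av_free_packet(&pkt);
        }
        return ret;
    }

Encoders with CODEC_CAP_DELAY are drained at end of stream by calling the same function with frame == NULL until got_packet stays 0.
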
+#if FF_API_OLD_ENCODE_AUDIO
+int attribute_align_arg avcodec_encode_audio(AVCodecContext *avctx,
+ uint8_t *buf, int buf_size,
+ const short *samples)
+{
+ AVPacket pkt;
+ AVFrame frame0;
+ AVFrame *frame;
+ int ret, samples_size, got_packet;
+
+ av_init_packet(&pkt);
+ pkt.data = buf;
+ pkt.size = buf_size;
+
+ if (samples) {
+ frame = &frame0;
+ avcodec_get_frame_defaults(frame);
+
+ if (avctx->frame_size) {
+ frame->nb_samples = avctx->frame_size;
+ } else {
+ /* if frame_size is not set, the number of samples must be
+ calculated from the buffer size */
+ int64_t nb_samples;
+ if (!av_get_bits_per_sample(avctx->codec_id)) {
+ av_log(avctx, AV_LOG_ERROR, "avcodec_encode_audio() does not "
+ "support this codec\n");
+ return AVERROR(EINVAL);
+ }
+ nb_samples = (int64_t)buf_size * 8 /
+ (av_get_bits_per_sample(avctx->codec_id) *
+ avctx->channels);
+ if (nb_samples >= INT_MAX)
+ return AVERROR(EINVAL);
+ frame->nb_samples = nb_samples;
+ }
+
+ /* it is assumed that the samples buffer is large enough based on the
+ relevant parameters */
+ samples_size = av_samples_get_buffer_size(NULL, avctx->channels,
+ frame->nb_samples,
+ avctx->sample_fmt, 1);
+ if ((ret = avcodec_fill_audio_frame(frame, avctx->channels,
+ avctx->sample_fmt,
+ (const uint8_t *)samples,
+ samples_size, 1)))
+ return ret;
+
+ /* fabricate frame pts from sample count.
+ this is needed because the avcodec_encode_audio() API does not have
+ a way for the user to provide pts */
+ if(avctx->sample_rate && avctx->time_base.num)
+ frame->pts = av_rescale_q(avctx->internal->sample_count,
+ (AVRational){ 1, avctx->sample_rate },
+ avctx->time_base);
+ else
+ frame->pts = AV_NOPTS_VALUE;
+ avctx->internal->sample_count += frame->nb_samples;
+ } else {
+ frame = NULL;
+ }
+
+ got_packet = 0;
+ ret = avcodec_encode_audio2(avctx, &pkt, frame, &got_packet);
+ if (!ret && got_packet && avctx->coded_frame) {
+ avctx->coded_frame->pts = pkt.pts;
+ avctx->coded_frame->key_frame = !!(pkt.flags & AV_PKT_FLAG_KEY);
+ }
+ /* free any side data since we cannot return it */
+ if (pkt.side_data_elems > 0) {
+ int i;
+ for (i = 0; i < pkt.side_data_elems; i++)
+ av_free(pkt.side_data[i].data);
+ av_freep(&pkt.side_data);
+ pkt.side_data_elems = 0;
+ }
+
+ if (frame && frame->extended_data != frame->data)
+ av_free(frame->extended_data);
+
+ return ret ? ret : pkt.size;
+}
+#endif
+
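
The wrapper above also serves as a reference for the new avcodec_fill_audio_frame() helper: it points an AVFrame's data pointers at an existing interleaved sample buffer without copying. A stand-alone sketch, assuming packed samples matching avctx->sample_fmt; wrap_samples() is illustrative, not an lavc function:

    #include <stdint.h>
    #include "libavcodec/avcodec.h"

    /* Sketch: wrap an existing interleaved int16_t buffer in an AVFrame so
     * it can be fed to avcodec_encode_audio2().  No data is copied, so the
     * buffer must stay valid for as long as the frame is in use. */
    static int wrap_samples(AVCodecContext *avctx, AVFrame *frame,
                            const int16_t *samples, int nb_samples)
    {
        int buf_size = av_samples_get_buffer_size(NULL, avctx->channels,
                                                  nb_samples,
                                                  avctx->sample_fmt, 1);
        if (buf_size < 0)
            return buf_size;

        avcodec_get_frame_defaults(frame);
        frame->nb_samples = nb_samples;

        return avcodec_fill_audio_frame(frame, avctx->channels,
                                        avctx->sample_fmt,
                                        (const uint8_t *)samples,
                                        buf_size, 1);
    }
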
int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf, int buf_size,
const AVFrame *pict)
{
@@ -1187,7 +1418,7 @@ av_cold int avcodec_close(AVCodecContext *avctx)
av_opt_free(avctx->priv_data);
av_opt_free(avctx);
av_freep(&avctx->priv_data);
- if(avctx->codec && avctx->codec->encode)
+ if (codec_is_encoder(avctx->codec))
av_freep(&avctx->extradata);
avctx->codec = NULL;
avctx->active_thread_type = 0;
@@ -1216,7 +1447,7 @@ AVCodec *avcodec_find_encoder(enum CodecID id)
p = first_avcodec;
id= remap_deprecated_codec_id(id);
while (p) {
- if (p->encode != NULL && p->id == id) {
+ if (codec_is_encoder(p) && p->id == id) {
if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) {
experimental = p;
} else
@@ -1234,7 +1465,7 @@ AVCodec *avcodec_find_encoder_by_name(const char *name)
return NULL;
p = first_avcodec;
while (p) {
- if (p->encode != NULL && strcmp(name,p->name) == 0)
+ if (codec_is_encoder(p) && strcmp(name,p->name) == 0)
return p;
p = p->next;
}
@@ -1247,7 +1478,7 @@ AVCodec *avcodec_find_decoder(enum CodecID id)
p = first_avcodec;
id= remap_deprecated_codec_id(id);
while (p) {
- if (p->decode != NULL && p->id == id) {
+ if (codec_is_decoder(p) && p->id == id) {
if (p->capabilities & CODEC_CAP_EXPERIMENTAL && !experimental) {
experimental = p;
} else
@@ -1265,7 +1496,7 @@ AVCodec *avcodec_find_decoder_by_name(const char *name)
return NULL;
p = first_avcodec;
while (p) {
- if (p->decode != NULL && strcmp(name,p->name) == 0)
+ if (codec_is_decoder(p) && strcmp(name,p->name) == 0)
return p;
p = p->next;
}
diff --git a/libavcodec/version.h b/libavcodec/version.h
index fd0c3cf1da..f8bb5c69ae 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -21,7 +21,7 @@
#define AVCODEC_VERSION_H
#define LIBAVCODEC_VERSION_MAJOR 53
-#define LIBAVCODEC_VERSION_MINOR 55
+#define LIBAVCODEC_VERSION_MINOR 56
#define LIBAVCODEC_VERSION_MICRO 105
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
@@ -123,5 +123,8 @@
#ifndef FF_API_AVFRAME_AGE
#define FF_API_AVFRAME_AGE (LIBAVCODEC_VERSION_MAJOR < 54)
#endif
+#ifndef FF_API_OLD_ENCODE_AUDIO
+#define FF_API_OLD_ENCODE_AUDIO (LIBAVCODEC_VERSION_MAJOR < 54)
+#endif
#endif /* AVCODEC_VERSION_H */
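
FF_API_OLD_ENCODE_AUDIO is an internal deprecation switch; applications normally gate on the library version instead. A hedged sketch of the usual compile-time check, with 53.56 being the minor version bumped above:

    #include "libavcodec/avcodec.h"   /* brings in version.h and AV_VERSION_INT */

    /* Prefer avcodec_encode_audio2() when building against lavc >= 53.56.0,
     * otherwise fall back to the deprecated avcodec_encode_audio(). */
    #if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(53, 56, 0)
    #   define HAVE_ENCODE_AUDIO2 1
    #else
    #   define HAVE_ENCODE_AUDIO2 0
    #endif
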
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index a70ad07e87..75bf1ae08a 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -35,21 +35,84 @@ SECTION .text
sar %1, 10
%endmacro
-%macro rv34_idct_dequant4x4_dc 1
-cglobal rv34_idct_dequant4x4_%1_mmx2, 1, 2, 0
+%macro rv34_idct 1
+cglobal rv34_idct_%1_mmx2, 1, 2, 0
movsx r1, word [r0]
IDCT_DC r1
- movd mm0, r1d
- pshufw mm0, mm0, 0
- movq [r0+ 0], mm0
- movq [r0+16], mm0
- movq [r0+32], mm0
- movq [r0+48], mm0
+ movd m0, r1d
+ pshufw m0, m0, 0
+ movq [r0+ 0], m0
+ movq [r0+ 8], m0
+ movq [r0+16], m0
+ movq [r0+24], m0
REP_RET
%endmacro
INIT_MMX
%define IDCT_DC IDCT_DC_ROUND
-rv34_idct_dequant4x4_dc dc
+rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
-rv34_idct_dequant4x4_dc dc_noround
+rv34_idct dc_noround
+
+; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+cglobal rv34_idct_dc_add_mmx, 3, 3
+ ; calculate DC
+ IDCT_DC_ROUND r2
+ pxor m1, m1
+ movd m0, r2
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+
+ ; add DC
+ lea r2, [r0+r1*2]
+ movh m2, [r0]
+ movh m3, [r0+r1]
+ movh m4, [r2]
+ movh m5, [r2+r1]
+ paddusb m2, m0
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ movh [r0], m2
+ movh [r0+r1], m3
+ movh [r2], m4
+ movh [r2+r1], m5
+ RET
+
+; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
+INIT_XMM
+cglobal rv34_idct_dc_add_sse4, 3, 3, 6
+ ; load data
+ IDCT_DC_ROUND r2
+ pxor m1, m1
+
+ ; calculate DC
+ movd m0, r2
+ lea r2, [r0+r1*2]
+ movd m2, [r0]
+ movd m3, [r0+r1]
+ pshuflw m0, m0, 0
+ movd m4, [r2]
+ movd m5, [r2+r1]
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [r0], m2
+ pextrd [r0+r1], m2, 1
+ pextrd [r2], m2, 2
+ pextrd [r2+r1], m2, 3
+ RET
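
Both routines above implement the same operation: scale and round the DC coefficient with IDCT_DC_ROUND, i.e. (13*13*dc + 0x200) >> 10, then add the result to every pixel of a 4x4 block with unsigned 8-bit saturation. A scalar sketch of that behaviour, written from the asm rather than copied from the C fallback in rv34dsp.c:

    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    /* Scalar reference for ff_rv34_idct_dc_add_{mmx,sse4}: add the rounded,
     * dequantized DC value to a 4x4 pixel block with saturation. */
    static void rv34_idct_dc_add_ref(uint8_t *dst, int stride, int dc)
    {
        const int d = (13 * 13 * dc + 0x200) >> 10;   /* IDCT_DC_ROUND */
        int x, y;

        for (y = 0; y < 4; y++, dst += stride)
            for (x = 0; x < 4; x++)
                dst[x] = av_clip_uint8(dst[x] + d);
    }
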
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 4317e9b23b..f3d2e172e7 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -24,17 +24,22 @@
#include "libavcodec/dsputil.h"
#include "libavcodec/rv34dsp.h"
-void ff_rv34_idct_dequant4x4_dc_mmx2(DCTELEM *block);
-void ff_rv34_idct_dequant4x4_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_noround_mmx2(DCTELEM *block);
+void ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
+void ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
{
#if HAVE_YASM
int mm_flags = av_get_cpu_flags();
+ if (mm_flags & AV_CPU_FLAG_MMX)
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (mm_flags & AV_CPU_FLAG_MMX2) {
- c->rv34_inv_transform_dc_tab[0] = ff_rv34_idct_dequant4x4_dc_mmx2;
- c->rv34_inv_transform_dc_tab[1] = ff_rv34_idct_dequant4x4_dc_noround_mmx2;
+ c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmx2;
}
+ if (mm_flags & AV_CPU_FLAG_SSE4)
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
#endif
}
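
The init function follows libavcodec's usual runtime CPU dispatch: the generic init in rv34dsp.c installs the C implementations, and the arch-specific init overrides individual function pointers when the corresponding CPU flags are set, with later (better) versions winning. A self-contained sketch of the same pattern using hypothetical names (MyDSPContext and the my_idct_dc_add_* functions are placeholders, not lavc symbols):

    #include <stdint.h>
    #include "libavutil/cpu.h"

    typedef struct MyDSPContext {
        void (*idct_dc_add)(uint8_t *dst, int stride, int dc);
    } MyDSPContext;

    void my_idct_dc_add_c   (uint8_t *dst, int stride, int dc);
    void my_idct_dc_add_mmx (uint8_t *dst, int stride, int dc);
    void my_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);

    static void my_dsp_init(MyDSPContext *c)
    {
        int flags = av_get_cpu_flags();

        c->idct_dc_add = my_idct_dc_add_c;          /* portable fallback */
        if (flags & AV_CPU_FLAG_MMX)
            c->idct_dc_add = my_idct_dc_add_mmx;    /* override when MMX is available */
        if (flags & AV_CPU_FLAG_SSE4)
            c->idct_dc_add = my_idct_dc_add_sse4;   /* best available version wins */
    }
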