47 files changed, 1119 insertions, 998 deletions
diff --git a/Changelog b/Changelog
index 1b59a1459e..6d47eaf01b 100644
--- a/Changelog
+++ b/Changelog
@@ -67,6 +67,7 @@ easier to use. The changes are:
 - aevalsrc audio source added
 - Ut Video decoder
 - Speex encoding via libspeex
+- 4:2:2 H.264 decoding support
 
 
 version 0.8:
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index e51026800d..cc4c688c8b 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
 
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
-                                      int weight, int offset);
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
-                                     int weight, int offset);
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
-                                    int weight, int offset);
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+                                   int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
 
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                        int log2_den, int weightd, int weights,
-                                        int offset);
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
-                                       int log2_den, int weightd, int weights,
-                                       int offset);
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
-                                      int log2_den, int weightd, int weights,
-                                      int offset);
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                     int height, int log2_den, int weightd,
+                                     int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
 
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -101,23 +76,14 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
     c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
     c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
     }
-    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
-    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
-    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
-    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
-    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
-    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
-    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
-    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
 
-    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
-    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
-    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
-    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
-    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
-    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
-    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
-    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
+
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
 
     c->h264_idct_add        = ff_h264_idct_add_neon;
     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 338de6f643..6426f46637 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1592,7 +1592,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q2,  q8
         vmov            q3,  q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
         vld1.8          {d20-d21},[r0,:128], r2
         \macd           q2,  d0,  d20
         pld             [r0]
@@ -1632,7 +1632,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #2
+1:      subs            r3,  r3,  #2
         vld1.8          {d4},[r0,:64], r2
         \macd           q1,  d0,  d4
         pld             [r0]
@@ -1662,7 +1662,7 @@ endfunc
         vdup.8          d1,  r5
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r3,  r3,  #4
         vld1.32         {d4[0]},[r0,:32], r2
         vld1.32         {d4[1]},[r0,:32], r2
         \macd           q1,  d0,  d4
@@ -1700,16 +1700,17 @@ endfunc
         .endm
 
         .macro  biweight_func w
-function biweight_h264_pixels_\w\()_neon
+function ff_biweight_h264_pixels_\w\()_neon, export=1
         push            {r4-r6, lr}
-        add             r4,  sp,  #16
+        ldr             r12, [sp, #16]
+        add             r4,  sp,  #20
         ldm             r4,  {r4-r6}
         lsr             lr,  r4,  #31
         add             r6,  r6,  #1
         eors            lr,  lr,  r5,  lsr #30
         orr             r6,  r6,  #1
-        vdup.16         q9,  r3
-        lsl             r6,  r6,  r3
+        vdup.16         q9,  r12
+        lsl             r6,  r6,  r12
         vmvn            q9,  q9
         vdup.16         q8,  r6
         mov             r6,  r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
 endfunc
         .endm
 
-        .macro  biweight_entry w, h, b=1
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               biweight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        biweight_entry  16, 8
-        biweight_entry  16, 16, b=0
         biweight_func   16
-
-        biweight_entry  8,  16
-        biweight_entry  8,  4
-        biweight_entry  8,  8,  b=0
         biweight_func   8
-
-        biweight_entry  4,  8
-        biweight_entry  4,  2
-        biweight_entry  4,  4,  b=0
         biweight_func   4
 
 @ Weighted prediction
 
         .macro  weight_16 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
+        vdup.8          d0,  r12
+1:      subs            r2,  r2,  #2
         vld1.8          {d20-d21},[r0,:128], r1
         vmull.u8        q2,  d0,  d20
         pld             [r0]
@@ -1785,8 +1767,8 @@ endfunc
         .endm
 
         .macro  weight_8 add
-        vdup.8          d0,  r3
-1:      subs            ip,  ip,  #2
+        vdup.8          d0,  r12
+1:      subs            r2,  r2,  #2
         vld1.8          {d4},[r0,:64], r1
         vmull.u8        q1,  d0,  d4
         pld             [r0]
@@ -1806,10 +1788,10 @@ endfunc
         .endm
 
         .macro  weight_4 add
-        vdup.8          d0,  r3
+        vdup.8          d0,  r12
         vmov            q1,  q8
         vmov            q10, q8
-1:      subs            ip,  ip,  #4
+1:      subs            r2,  r2,  #4
         vld1.32         {d4[0]},[r0,:32], r1
         vld1.32         {d4[1]},[r0,:32], r1
         vmull.u8        q1,  d0,  d4
@@ -1842,50 +1824,32 @@ endfunc
         .endm
 
         .macro  weight_func w
-function weight_h264_pixels_\w\()_neon
+function ff_weight_h264_pixels_\w\()_neon, export=1
         push            {r4, lr}
-        ldr             r4,  [sp, #8]
-        cmp             r2,  #1
-        lsl             r4,  r4,  r2
+        ldr             r12, [sp, #8]
+        ldr             r4,  [sp, #12]
+        cmp             r3,  #1
+        lsl             r4,  r4,  r3
         vdup.16         q8,  r4
         mov             r4,  r0
         ble             20f
-        rsb             lr,  r2,  #1
+        rsb             lr,  r3,  #1
         vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
         blt             10f
         weight_\w       vhadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
         weight_\w       vhsub.s16
-20:     rsb             lr,  r2,  #0
+20:     rsb             lr,  r3,  #0
         vdup.16         q9,  lr
-        cmp             r3,  #0
+        cmp             r12, #0
         blt             10f
         weight_\w       vadd.s16
-10:     rsb             r3,  r3,  #0
+10:     rsb             r12, r12, #0
         weight_\w       vsub.s16
 endfunc
         .endm
 
-        .macro  weight_entry w, h, b=1
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
-        mov             ip,  #\h
-.if \b
-        b               weight_h264_pixels_\w\()_neon
-.endif
-endfunc
-        .endm
-
-        weight_entry    16, 8
-        weight_entry    16, 16, b=0
         weight_func     16
-
-        weight_entry    8,  16
-        weight_entry    8,  4
-        weight_entry    8,  8,  b=0
         weight_func     8
-
-        weight_entry    4,  8
-        weight_entry    4,  2
-        weight_entry    4,  4,  b=0
         weight_func     4
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index 825422bed6..c0584753cd 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -70,7 +70,15 @@ typedef struct FmtConvertContext {
                                       long len, int channels);
 
     /**
-     * Convert an array of interleaved float to multiple arrays of float.
+     * Convert multiple arrays of float to an array of interleaved float.
+     *
+     * @param dst destination array of interleaved float.
+     *            constraints: 16-byte aligned
+     * @param src source array of float arrays, one for each channel.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 8
+     * @param channels number of channels
      */
     void (*float_interleave)(float *dst, const float **src, unsigned int len,
                              int channels);
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index addebce07d..4906f92ea8 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -460,11 +460,14 @@ static void chroma_dc_dct_c(DCTELEM *block){
 }
 #endif
 
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int src_x_offset, int src_y_offset,
-                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
-                           int pixel_shift, int chroma444){
+static av_always_inline void
+mc_dir_part(H264Context *h, Picture *pic, int n, int square,
+            int height, int delta, int list,
+            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+            int src_x_offset, int src_y_offset,
+            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
+            int pixel_shift, int chroma_idc)
+{
     MpegEncContext * const s = &h->s;
     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
@@ -479,6 +482,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
     const int full_my= my>>2;
     const int pic_width  = 16*s->mb_width;
     const int pic_height = 16*s->mb_height >> MB_FIELD;
+    int ysh;
 
     if(mx&7) extra_width -= 3;
     if(my&7) extra_height -= 3;
@@ -487,7 +491,8 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
        || full_my < 0-extra_height
        || full_mx + 16/*FIXME*/ > pic_width + extra_width
        || full_my + 16/*FIXME*/ > pic_height + extra_height){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_y - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
+                                16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
             src_y= s->edge_emu_buffer + (2 << pixel_shift) + 2*h->mb_linesize;
         emu=1;
     }
@@ -499,7 +504,7 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
 
     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
 
-    if(chroma444){
+    if(chroma_idc == 3 /* yuv444 */){
         src_cb = pic->f.data[1] + offset;
         if(emu){
             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb - (2 << pixel_shift) - 2*h->mb_linesize, h->mb_linesize,
@@ -524,42 +529,55 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
         return;
     }
 
-    if(MB_FIELD){
+    ysh = 3 - (chroma_idc == 2 /* yuv422 */);
+    if(chroma_idc == 1 /* yuv420 */ && MB_FIELD){
         // chroma offset when predicting from a field of opposite parity
         my += 2 * ((s->mb_y & 1) - (pic->f.reference - 1));
         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
     }
-    src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
-    src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> 3) * h->mb_uvlinesize;
+
+    src_cb = pic->f.data[1] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
+    src_cr = pic->f.data[2] + ((mx >> 3) << pixel_shift) + (my >> ysh) * h->mb_uvlinesize;
 
     if(emu){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize,
+                                9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+                                pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
             src_cb= s->edge_emu_buffer;
     }
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
+              mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
 
     if(emu){
-        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
+        s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize,
+                                9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
+                                pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
             src_cr= s->edge_emu_buffer;
     }
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
+              mx&7, (my << (chroma_idc == 2 /* yuv422 */)) &7);
 }
 
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int x_offset, int y_offset,
-                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+static av_always_inline void
+mc_part_std(H264Context *h, int n, int square, int height, int delta,
+            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+            int x_offset, int y_offset,
+            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+            int list0, int list1, int pixel_shift, int chroma_idc)
+{
     MpegEncContext * const s = &h->s;
     qpel_mc_func *qpix_op=  qpix_put;
     h264_chroma_mc_func chroma_op= chroma_put;
 
     dest_y  += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    if(chroma444){
+    if (chroma_idc == 3 /* yuv444 */) {
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    }else{
+    } else if (chroma_idc == 2 /* yuv422 */) {
+        dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+        dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+    } else /* yuv420 */ {
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     }
@@ -568,9 +586,9 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
 
     if(list0){
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
+        mc_dir_part(h, ref, n, square, height, delta, 0,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift, chroma444);
+                           qpix_op, chroma_op, pixel_shift, chroma_idc);
 
         qpix_op=  qpix_avg;
         chroma_op= chroma_avg;
@@ -578,28 +596,36 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
 
     if(list1){
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
+        mc_dir_part(h, ref, n, square, height, delta, 1,
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                           qpix_op, chroma_op, pixel_shift, chroma444);
+                           qpix_op, chroma_op, pixel_shift, chroma_idc);
     }
 }
 
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int x_offset, int y_offset,
-                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-                           h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
-                           h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+static av_always_inline void
+mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
+                 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+                 int x_offset, int y_offset,
+                 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+                 h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
+                 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
+                 int list0, int list1, int pixel_shift, int chroma_idc){
     MpegEncContext * const s = &h->s;
+    int chroma_height;
 
     dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    if(chroma444){
+    if (chroma_idc == 3 /* yuv444 */) {
+        chroma_height = height;
         chroma_weight_avg = luma_weight_avg;
         chroma_weight_op = luma_weight_op;
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
-    }else{
+    } else if (chroma_idc == 2 /* yuv422 */) {
+        chroma_height = height;
+        dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+        dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
+    } else /* yuv420 */ {
+        chroma_height = height >> 1;
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
     }
@@ -615,27 +641,32 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         int refn0 = h->ref_cache[0][ scan8[n] ];
         int refn1 = h->ref_cache[1][ scan8[n] ];
 
-        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
+        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                     dest_y, dest_cb, dest_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
-        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
+                    x_offset, y_offset, qpix_put, chroma_put,
+                    pixel_shift, chroma_idc);
+        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                     tmp_y, tmp_cb, tmp_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
+                    x_offset, y_offset, qpix_put, chroma_put,
+                    pixel_shift, chroma_idc);
 
         if(h->use_weight == 2){
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
             int weight1 = 64 - weight0;
-            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
+            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
+                              height,        5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
+                              chroma_height, 5, weight0, weight1, 0);
         }else{
-            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
+            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                             h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
                             h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                             h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                             h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                             h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
         }
@@ -643,42 +674,46 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
         int list = list1 ? 1 : 0;
         int refn = h->ref_cache[list][ scan8[n] ];
         Picture *ref= &h->ref_list[list][refn];
-        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
+        mc_dir_part(h, ref, n, square, height, delta, list,
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                    qpix_put, chroma_put, pixel_shift, chroma444);
+                    qpix_put, chroma_put, pixel_shift, chroma_idc);
 
-        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
+        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
         if(h->use_weight_chroma){
-            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
-            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
+            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                              h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
         }
     }
 }
 
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
-                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
-                           int x_offset, int y_offset,
-                           qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
-                           qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
-                           h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                           int list0, int list1, int pixel_shift, int chroma444){
+static av_always_inline void
+mc_part(H264Context *h, int n, int square, int height, int delta,
+        uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+        int x_offset, int y_offset,
+        qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
+        qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
+        h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+        int list0, int list1, int pixel_shift, int chroma_idc)
+{
     if((h->use_weight==2 && list0 && list1
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
        || h->use_weight==1)
-        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                          x_offset, y_offset, qpix_put, chroma_put,
-                         weight_op[0], weight_op[3], weight_avg[0],
-                         weight_avg[3], list0, list1, pixel_shift, chroma444);
+                         weight_op[0], weight_op[1], weight_avg[0],
+                         weight_avg[1], list0, list1, pixel_shift, chroma_idc);
     else
-        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
-                    chroma_avg, list0, list1, pixel_shift, chroma444);
+                    chroma_avg, list0, list1, pixel_shift, chroma_idc);
 }
 
-static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma444){
+static av_always_inline void
+prefetch_motion(H264Context *h, int list, int pixel_shift, int chroma_idc)
+{
     /* fetch pixels for estimated mv 4 macroblocks ahead
      * optimized for 64byte cache lines */
     MpegEncContext * const s = &h->s;
@@ -689,7 +724,7 @@ static inline void prefetch_motion(H264Context *h, int list, int pixel_shift, in
         uint8_t **src = h->ref_list[list][refn].f.data;
         int off= (mx << pixel_shift) + (my + (s->mb_x&3)*4)*h->mb_linesize + (64 << pixel_shift);
         s->dsp.prefetch(src[0]+off, s->linesize, 4);
-        if(chroma444){
+        if (chroma_idc == 3 /* yuv444 */) {
             s->dsp.prefetch(src[1]+off, s->linesize, 4);
             s->dsp.prefetch(src[2]+off, s->linesize, 4);
         }else{
@@ -703,7 +738,8 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                       h264_weight_func *weight_op, h264_biweight_func *weight_avg,
-                      int pixel_shift, int chroma444){
+                      int pixel_shift, int chroma_idc)
+{
     MpegEncContext * const s = &h->s;
     const int mb_xy= h->mb_xy;
     const int mb_type = s->current_picture.f.mb_type[mb_xy];
@@ -712,36 +748,36 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
 
     if(HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
         await_references(h);
-    prefetch_motion(h, 0, pixel_shift, chroma444);
+    prefetch_motion(h, 0, pixel_shift, chroma_idc);
 
     if(IS_16X16(mb_type)){
-        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                 weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
     }else if(IS_16X8(mb_type)){
-        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
-        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
+                pixel_shift, chroma_idc);
+        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                &weight_op[1], &weight_avg[1],
+                weight_op, weight_avg,
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
     }else if(IS_8X16(mb_type)){
-        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma444);
-        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+                pixel_shift, chroma_idc);
+        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[2], &weight_avg[2],
+                &weight_op[1], &weight_avg[1],
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma444);
+                pixel_shift, chroma_idc);
     }else{
         int i;
 
@@ -754,50 +790,72 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
             int y_offset= (i&2)<<1;
 
             if(IS_SUB_8X8(sub_mb_type)){
-                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                    &weight_op[3], &weight_avg[3],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
             }else if(IS_SUB_8X4(sub_mb_type)){
-                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
-                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
+                    pixel_shift, chroma_idc);
+                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                    &weight_op[4], &weight_avg[4],
+                    &weight_op[1], &weight_avg[1],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
             }else if(IS_SUB_4X8(sub_mb_type)){
-                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
-                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
+                    pixel_shift, chroma_idc);
+                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                    &weight_op[5], &weight_avg[5],
+                    &weight_op[2], &weight_avg[2],
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                    pixel_shift, chroma444);
+                    pixel_shift, chroma_idc);
             }else{
                 int j;
                 assert(IS_SUB_4X4(sub_mb_type));
                 for(j=0; j<4; j++){
                     int sub_x_offset= x_offset + 2*(j&1);
                     int sub_y_offset= y_offset +   (j&2);
-                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[6], &weight_avg[6],
+                        &weight_op[2], &weight_avg[2],
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma444);
+                        pixel_shift, chroma_idc);
                 }
             }
         }
     }
 
-    prefetch_motion(h, 1, pixel_shift, chroma444);
+    prefetch_motion(h, 1, pixel_shift, chroma_idc);
+}
+
+static av_always_inline void
+hl_motion_420(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+              qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+              qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+              h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+              int pixel_shift)
+{
+    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
+              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
+}
+
+static av_always_inline void
+hl_motion_422(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
+              qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
+              qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
+              h264_weight_func *weight_op, h264_biweight_func *weight_avg,
+              int pixel_shift)
+{
+    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
+              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
 }
 
 static void free_tables(H264Context *h, int free_rbsp){
@@ -1468,7 +1526,10 @@ static void decode_postinit(H264Context *h, int setup_finished){
         ff_thread_finish_setup(s->avctx);
 }
 
-static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
+static av_always_inline void backup_mb_border(H264Context *h, uint8_t *src_y,
+                                              uint8_t *src_cb, uint8_t *src_cr,
+                                              int linesize, int uvlinesize, int simple)
+{
     MpegEncContext * const s = &h->s;
     uint8_t *top_border;
     int top_idx = 1;
@@ -1813,7 +1874,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
     }
 }
 
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift){
+static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, int pixel_shift)
+{
     MpegEncContext * const s = &h->s;
     const int mb_x= s->mb_x;
     const int mb_y= s->mb_y;
@@ -1827,7 +1889,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
     /* is_h264 should always be true if SVQ3 is disabled. */
     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    const int block_h = 16>>s->chroma_y_shift;
+    const int block_h = 16 >> s->chroma_y_shift;
+    const int chroma422 = CHROMA422;
 
     dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
     dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*8 + mb_y * s->uvlinesize * block_h;
@@ -1844,8 +1907,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
         block_offset = &h->block_offset[48];
         if(mb_y&1){ //FIXME move out of this function?
             dest_y -= s->linesize*15;
-            dest_cb-= s->uvlinesize*(block_h-1);
-            dest_cr-= s->uvlinesize*(block_h-1);
+            dest_cb-= s->uvlinesize * (block_h - 1);
+            dest_cr-= s->uvlinesize * (block_h - 1);
         }
         if(FRAME_MBAFF) {
             int list;
@@ -1884,7 +1947,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             }
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < 8; i++) {
+                    for (i = 0; i < block_h; i++) {
                         uint16_t *tmp_cb = (uint16_t*)(dest_cb + i*uvlinesize);
                         uint16_t *tmp_cr = (uint16_t*)(dest_cr + i*uvlinesize);
                         for (j = 0; j < 8; j++) {
@@ -1911,13 +1974,13 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                 if (!h->sps.chroma_format_idc) {
                     for (i=0; i<8; i++) {
-                        memset(dest_cb+ i*uvlinesize, 1 << (bit_depth - 1), 8);
-                        memset(dest_cr+ i*uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cb + i*uvlinesize, 1 << (bit_depth - 1), 8);
+                        memset(dest_cr + i*uvlinesize, 1 << (bit_depth - 1), 8);
                     }
                 } else {
                     for (i=0; i<block_h; i++) {
-                        memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
-                        memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
+                        memcpy(dest_cb + i*uvlinesize, h->mb + 128 + i*4,  8);
+                        memcpy(dest_cr + i*uvlinesize, h->mb + 160 + i*4,  8);
                     }
                 }
             }
@@ -1937,11 +2000,21 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
             if(h->deblocking_filter)
                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, 0, simple, pixel_shift);
         }else if(is_h264){
-            hl_motion(h, dest_y, dest_cb, dest_cr,
-                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                      h->h264dsp.weight_h264_pixels_tab,
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 0);
+            if (chroma422) {
+                hl_motion_422(h, dest_y, dest_cb, dest_cr,
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                              h->h264dsp.weight_h264_pixels_tab,
+                              h->h264dsp.biweight_h264_pixels_tab,
+                              pixel_shift);
+            } else {
+                hl_motion_420(h, dest_y, dest_cb, dest_cr,
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                              h->h264dsp.weight_h264_pixels_tab,
+                              h->h264dsp.biweight_h264_pixels_tab,
+                              pixel_shift);
+            }
         }
 
         hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass, pixel_shift, block_offset, linesize, dest_y, 0);
@@ -1959,14 +2032,20 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple, i
                             if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
                                 idct_add   (dest[j-1] + block_offset[i], h->mb + (i*16 << pixel_shift), uvlinesize);
                         }
+                        if (chroma422) {
+                            for(i=j*16+4; i<j*16+8; i++){
+                                if(h->non_zero_count_cache[ scan8[i] ] || dctcoef_get(h->mb, pixel_shift, i*16))
+                                    idct_add   (dest[j-1] + block_offset[i+4], h->mb + (i*16 << pixel_shift), uvlinesize);
+                            }
+                        }
                     }
                 }
             }else{
                 if(is_h264){
                     int qp[2];
-                    if (CHROMA422) {
-                        qp[0] = h->chroma_qp[0]+3;
-                        qp[1] = h->chroma_qp[1]+3;
+                    if (chroma422) {
+                        qp[0] = h->chroma_qp[0] + 3;
+                        qp[1] = h->chroma_qp[1] + 3;
                     } else {
                         qp[0] = h->chroma_qp[0];
                         qp[1] = h->chroma_qp[1];
@@ -2086,7 +2165,7 @@ static av_always_inline void hl_decode_mb_444_internal(H264Context *h, int simpl
                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                       h->h264dsp.weight_h264_pixels_tab,
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 1);
+                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
         }
 
         for (p = 0; p < plane_count; p++)
@@ -2690,6 +2769,8 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
             case 9 :
                 if (CHROMA444)
                     s->avctx->pix_fmt = PIX_FMT_YUV444P9;
+                else if (CHROMA422)
+                    s->avctx->pix_fmt = PIX_FMT_YUV422P9;
                 else
                     s->avctx->pix_fmt = PIX_FMT_YUV420P9;
                 break;
@@ -2708,7 +2789,7 @@ static int decode_slice_header(H264Context *h, H264Context *h0){
                        s->avctx->pix_fmt = PIX_FMT_GBR24P;
                        av_log(h->s.avctx, AV_LOG_DEBUG, "Detected GBR colorspace.\n");
                     }
-                }else if (CHROMA422) {
+                } else if (CHROMA422) {
                     s->avctx->pix_fmt = s->avctx->color_range == AVCOL_RANGE_JPEG ? PIX_FMT_YUVJ422P : PIX_FMT_YUV422P;
                 }else{
                     s->avctx->pix_fmt = s->avctx->get_format(s->avctx,
@@ -3384,7 +3465,7 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
     const int end_mb_y= s->mb_y + FRAME_MBAFF;
     const int old_slice_type= h->slice_type;
     const int pixel_shift = h->pixel_shift;
-    const int block_h = 16>>s->chroma_y_shift;
+    const int block_h = 16 >> s->chroma_y_shift;
 
     if(h->deblocking_filter) {
         for(mb_x= start_x; mb_x<end_x; mb_x++){
@@ -3401,8 +3482,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                 s->mb_x= mb_x;
                 s->mb_y= mb_y;
                 dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift) + mb_y * s->linesize  ) * 16;
-                dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h;
-                dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift)*(8<<CHROMA444) + mb_y * s->uvlinesize * block_h;
+                dest_cb = s->current_picture.f.data[1] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
+                dest_cr = s->current_picture.f.data[2] + (mb_x << pixel_shift) * (8 << CHROMA444) + mb_y * s->uvlinesize * block_h;
                     //FIXME simplify above
 
                 if (MB_FIELD) {
@@ -3410,8 +3491,8 @@ static void loop_filter(H264Context *h, int start_x, int end_x){
                     uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
                     if(mb_y&1){ //FIXME move out of this function?
                         dest_y -= s->linesize*15;
-                        dest_cb-= s->uvlinesize*(block_h-1);
-                        dest_cr-= s->uvlinesize*(block_h-1);
+                        dest_cb-= s->uvlinesize * (block_h - 1);
+                        dest_cr-= s->uvlinesize * (block_h - 1);
                     }
                 } else {
                     linesize   = h->mb_linesize   = s->linesize;
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index f4cae4621f..31c2658a6b 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1565,7 +1565,12 @@ DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
 };
 
-static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
+static av_always_inline void
+decode_cabac_residual_internal(H264Context *h, DCTELEM *block,
+                               int cat, int n, const uint8_t *scantable,
+                               const uint32_t *qmul, int max_coeff,
+                               int is_dc, int chroma422)
+{
     static const int significant_coeff_flag_offset[2][14] = {
       { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718 },
       { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733 }
@@ -1593,7 +1598,10 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
      * map node ctx => cabac ctx for level=1 */
     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
     /* map node ctx => cabac ctx for level>1 */
-    static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+    static const uint8_t coeff_abs_levelgt1_ctx[2][8] = {
+        { 5, 5, 5, 5, 6, 7, 8, 9 },
+        { 5, 5, 5, 5, 6, 7, 8, 8 }, // 422/dc case
+    };
     static const uint8_t coeff_abs_level_transition[2][8] = {
     /* update node ctx after decoding a level=1 */
         { 1, 2, 3, 3, 4, 5, 6, 7 },
@@ -1652,7 +1660,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index,
                                                  last_coeff_ctx_base, sig_off);
     } else {
-        if (is_dc && max_coeff == 8) { // dc 422
+        if (is_dc && chroma422) { // dc 422
             DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
         } else {
             coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index,
@@ -1661,7 +1669,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
 #else
         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
     } else {
-        if (is_dc && max_coeff == 8) { // dc 422
+        if (is_dc && chroma422) { // dc 422
             DECODE_SIGNIFICANCE(7, sig_coeff_offset_dc[last], sig_coeff_offset_dc[last]);
         } else {
             DECODE_SIGNIFICANCE(max_coeff - 1, last, last);
@@ -1701,9 +1709,7 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
             } \
         } else { \
             int coeff_abs = 2; \
-            if (is_dc && max_coeff == 8) \
-                node_ctx = FFMIN(node_ctx, 6); \
-            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; \
+            ctx = coeff_abs_levelgt1_ctx[is_dc && chroma422][node_ctx] + abs_level_m1_ctx_base; \
             node_ctx = coeff_abs_level_transition[1][node_ctx]; \
 \
             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) { \
@@ -1745,11 +1751,18 @@ static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCT
 }
 
 static void decode_cabac_residual_dc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
-    decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1);
+    decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0);
+}
+
+static void decode_cabac_residual_dc_internal_422(H264Context *h, DCTELEM *block,
+                                                  int cat, int n, const uint8_t *scantable,
+                                                  int max_coeff)
+{
+    decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1);
 }
 
 static void decode_cabac_residual_nondc_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
-    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
+    decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0, 0);
 }
 
 /* cat: 0-> DC 16x16  n = 0
@@ -1773,6 +1786,19 @@ static av_always_inline void decode_cabac_residual_dc( H264Context *h, DCTELEM *
     decode_cabac_residual_dc_internal( h, block, cat, n, scantable, max_coeff );
 }
 
+static av_always_inline void
+decode_cabac_residual_dc_422(H264Context *h, DCTELEM *block,
+                             int cat, int n, const uint8_t *scantable,
+                             int max_coeff)
+{
+    /* read coded block flag */
+    if (get_cabac(&h->cabac, &h->cabac_state[get_cabac_cbf_ctx(h, cat, n, max_coeff, 1)]) == 0) {
+        h->non_zero_count_cache[scan8[n]] = 0;
+        return;
+    }
+    decode_cabac_residual_dc_internal_422(h, block, cat, n, scantable, max_coeff);
+}
+
 static av_always_inline void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
     /* read coded block flag */
     if( (cat != 5 || CHROMA444) && get_cabac( &h->cabac, &h->cabac_state[get_cabac_cbf_ctx( h, cat, n, max_coeff, 0 ) ] ) == 0 ) {
@@ -2325,17 +2351,14 @@ decode_intra_mb:
         if(CHROMA444){
             decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 1);
             decode_cabac_luma_residual(h, scan, scan8x8, pixel_shift, mb_type, cbp, 2);
-        } else {
-            const int num_c8x8 = h->sps.chroma_format_idc;
-
+        } else if (CHROMA422) {
             if( cbp&0x30 ){
                 int c;
                 for( c = 0; c < 2; c++ ) {
                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
-                    decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3,
-                                             CHROMA_DC_BLOCK_INDEX+c,
-                                             CHROMA422 ? chroma422_dc_scan : chroma_dc_scan,
-                                             4*num_c8x8);
+                    decode_cabac_residual_dc_422(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3,
+                                                 CHROMA_DC_BLOCK_INDEX + c,
+                                                 chroma422_dc_scan, 8);
                 }
             }
 
@@ -2344,7 +2367,7 @@ decode_intra_mb:
                 for( c = 0; c < 2; c++ ) {
                     DCTELEM *mb = h->mb + (16*(16 + 16*c) << pixel_shift);
                     qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
-                    for (i8x8 = 0; i8x8 < num_c8x8; i8x8++) {
+                    for (i8x8 = 0; i8x8 < 2; i8x8++) {
                         for (i = 0; i < 4; i++) {
                             const int index = 16 + 16 * c + 8*i8x8 + i;
                             //av_log(s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16);
@@ -2357,6 +2380,29 @@ decode_intra_mb:
                 fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
                 fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
             }
+        } else /* yuv420 */ {
+            if( cbp&0x30 ){
+                int c;
+                for( c = 0; c < 2; c++ ) {
+                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
+                    decode_cabac_residual_dc(h, h->mb + ((256 + 16*16*c) << pixel_shift), 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4);
+                }
+            }
+
+            if( cbp&0x20 ) {
+                int c, i;
+                for( c = 0; c < 2; c++ ) {
+                    qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
+                    for( i = 0; i < 4; i++ ) {
+                        const int index = 16 + 16 * c + i;
+                        //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
+                        decode_cabac_residual_nondc(h, h->mb + (16*index << pixel_shift), 4, index, scan + 1, qmul, 15);
+                    }
+                }
+            } else {
+                fill_rectangle(&h->non_zero_count_cache[scan8[16]], 4, 4, 8, 0, 1);
+                fill_rectangle(&h->non_zero_count_cache[scan8[32]], 4, 4, 8, 0, 1);
+            }
         }
     } else {
         fill_rectangle(&h->non_zero_count_cache[scan8[ 0]], 4, 4, 8, 0, 1);
diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index 93697a83c1..a7de122c53 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -415,7 +415,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
 #endif
     sps->crop= get_bits1(&s->gb);
     if(sps->crop){
-        int crop_vertical_limit = sps->chroma_format_idc & 2 ? 16 : 8;
+        int crop_vertical_limit   = sps->chroma_format_idc  & 2 ? 16 : 8;
         int crop_horizontal_limit = sps->chroma_format_idc == 3 ? 16 : 8;
         sps->crop_left  = get_ue_golomb(&s->gb);
         sps->crop_right = get_ue_golomb(&s->gb);
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index cf0067b8f0..bd35aa3065 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
     else\
         c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
 \
-    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
-    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
-    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
-    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
-    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
-    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
-    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
-    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
-    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
-    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
-    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
-    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
-    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
-    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
-    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
-    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
-    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
-    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
-    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
-    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
+    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
+    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
+    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
+    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
+    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
+    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
+    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
+    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
 \
     c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
     c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index c79ab0a625..490a936310 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -31,16 +31,18 @@
 #include "dsputil.h"
 
 //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
+                                 int log2_denom, int weight, int offset);
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
+                                   int log2_denom, int weightd, int weights, int offset);
 
 /**
  * Context for storing H.264 DSP functions
  */
 typedef struct H264DSPContext{
     /* weighted MC */
-    h264_weight_func weight_h264_pixels_tab[10];
-    h264_biweight_func biweight_h264_pixels_tab[10];
+    h264_weight_func weight_h264_pixels_tab[4];
+    h264_biweight_func biweight_h264_pixels_tab[4];
 
     /* loop filter */
     void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
diff --git a/libavcodec/h264dsp_template.c b/libavcodec/h264dsp_template.c
index 3023541634..4d5faf01c0 100644
--- a/libavcodec/h264dsp_template.c
+++ b/libavcodec/h264dsp_template.c
@@ -29,14 +29,16 @@
 
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
-#define H264_WEIGHT(W,H) \
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int stride, int log2_denom, int weight, int offset){ \
+#define H264_WEIGHT(W) \
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
+                                           int log2_denom, int weight, int offset) \
+{ \
     int y; \
-    pixel *block = (pixel*)p_block; \
+    pixel *block = (pixel*)_block; \
     stride >>= sizeof(pixel)-1; \
     offset <<= (log2_denom + (BIT_DEPTH-8)); \
     if(log2_denom) offset += 1<<(log2_denom-1); \
-    for(y=0; y<H; y++, block += stride){ \
+    for (y = 0; y < height; y++, block += stride) { \
         op_scale1(0); \
         op_scale1(1); \
         if(W==2) continue; \
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *p_block, int strid
         op_scale1(15); \
     } \
 } \
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
+                                             int log2_denom, int weightd, int weights, int offset) \
+{ \
     int y; \
     pixel *dst = (pixel*)_dst; \
     pixel *src = (pixel*)_src; \
     stride >>= sizeof(pixel)-1; \
     offset <<= (BIT_DEPTH-8); \
     offset = ((offset + 1) | 1) << log2_denom; \
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
+    for (y = 0; y < height; y++, dst += stride, src += stride) { \
         op_scale2(0); \
         op_scale2(1); \
         if(W==2) continue; \
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
     } \
 }
 
-H264_WEIGHT(16,16)
-H264_WEIGHT(16,8)
-H264_WEIGHT(8,16)
-H264_WEIGHT(8,8)
-H264_WEIGHT(8,4)
-H264_WEIGHT(4,8)
-H264_WEIGHT(4,4)
-H264_WEIGHT(4,2)
-H264_WEIGHT(2,4)
-H264_WEIGHT(2,2)
+H264_WEIGHT(16)
+H264_WEIGHT(8)
+H264_WEIGHT(4)
+H264_WEIGHT(2)
 
 #undef op_scale1
 #undef op_scale2
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 64bc70dd47..c59976a1d9 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -228,16 +228,6 @@ void FUNCC(ff_h264_idct_add8)(uint8_t **dest, const int *block_offset, DCTELEM *
 void FUNCC(ff_h264_idct_add8_422)(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[15*8]){
     int i, j;
 
-#if 0
-    av_log(NULL, AV_LOG_INFO, "idct\n");
-    int32_t *b = block;
-    for (int i = 0; i < 256; i++) {
-        av_log(NULL, AV_LOG_INFO, "%5d ", b[i+256]);
-        if (!((i+1) % 16))
-            av_log(NULL, AV_LOG_INFO, "\n");
-    }
-#endif
-
     for(j=1; j<3; j++){
         for(i=j*16; i<j*16+4; i++){
             if(nnzc[ scan8[i] ])
@@ -296,13 +286,13 @@ void FUNCC(ff_h264_luma_dc_dequant_idct)(DCTELEM *p_output, DCTELEM *p_input, in
 #undef stride
 }
 
-void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *p_block, int qmul){
+void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *_block, int qmul){
     const int stride= 16*2;
     const int xStride= 16;
     int i;
     int temp[8];
     static const uint8_t x_offset[2]={0, 16};
-    dctcoef *block = (dctcoef*)p_block;
+    dctcoef *block = (dctcoef*)_block;
 
     for(i=0; i<4; i++){
         temp[2*i+0] = block[stride*i + xStride*0] + block[stride*i + xStride*1];
@@ -321,22 +311,13 @@ void FUNCC(ff_h264_chroma422_dc_dequant_idct)(DCTELEM *p_block, int qmul){
         block[stride*2+offset]= ((z1 - z2)*qmul + 128) >> 8;
         block[stride*3+offset]= ((z0 - z3)*qmul + 128) >> 8;
     }
-
-#if 0
-    av_log(NULL, AV_LOG_INFO, "after chroma dc\n");
-    for (int i = 0; i < 256; i++) {
-        av_log(NULL, AV_LOG_INFO, "%5d ", block[i]);
-        if (!((i+1) % 16))
-            av_log(NULL, AV_LOG_INFO, "\n");
-    }
-#endif
 }
 
-void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *p_block, int qmul){
+void FUNCC(ff_h264_chroma_dc_dequant_idct)(DCTELEM *_block, int qmul){
     const int stride= 16*2;
     const int xStride= 16;
     int a,b,c,d,e;
-    dctcoef *block = (dctcoef*)p_block;
+    dctcoef *block = (dctcoef*)_block;
 
     a= block[stride*0 + xStride*0];
     b= block[stride*0 + xStride*1];
diff --git a/libavcodec/h264pred.c b/libavcodec/h264pred.c
index dce29f9f0b..a174b4ca3c 100644
--- a/libavcodec/h264pred.c
+++ b/libavcodec/h264pred.c
@@ -462,10 +462,10 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
             h->pred8x8[DC_PRED8x8     ]= FUNCC(pred8x16_dc                    , depth);\
             h->pred8x8[LEFT_DC_PRED8x8]= FUNCC(pred8x16_left_dc               , depth);\
             h->pred8x8[TOP_DC_PRED8x8 ]= FUNCC(pred8x16_top_dc                , depth);\
-            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l0t, depth);\
-            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0lt, depth);\
-            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_l00, depth);\
-            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x8_mad_cow_dc_0l0, depth);\
+            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l0t, depth);\
+            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0lt, depth);\
+            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_l00, depth);\
+            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= FUNC(pred8x16_mad_cow_dc_0l0, depth);\
         }\
     }else{\
         h->pred8x8[DC_PRED8x8     ]= FUNCD(pred8x8_dc_rv40);\
@@ -510,8 +510,13 @@ void ff_h264_pred_init(H264PredContext *h, int codec_id, const int bit_depth, co
     h->pred4x4_add  [ HOR_PRED   ]= FUNCC(pred4x4_horizontal_add          , depth);\
     h->pred8x8l_add [VERT_PRED   ]= FUNCC(pred8x8l_vertical_add           , depth);\
     h->pred8x8l_add [ HOR_PRED   ]= FUNCC(pred8x8l_horizontal_add         , depth);\
+    if (chroma_format_idc == 1) {\
     h->pred8x8_add  [VERT_PRED8x8]= FUNCC(pred8x8_vertical_add            , depth);\
     h->pred8x8_add  [ HOR_PRED8x8]= FUNCC(pred8x8_horizontal_add          , depth);\
+    } else {\
+        h->pred8x8_add  [VERT_PRED8x8]= FUNCC(pred8x16_vertical_add            , depth);\
+        h->pred8x8_add  [ HOR_PRED8x8]= FUNCC(pred8x16_horizontal_add          , depth);\
+    }\
     h->pred16x16_add[VERT_PRED8x8]= FUNCC(pred16x16_vertical_add          , depth);\
     h->pred16x16_add[ HOR_PRED8x8]= FUNCC(pred16x16_horizontal_add        , depth);\
 
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index 8021895378..074fad50ca 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -663,23 +663,45 @@ static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, int stride){
     FUNCC(pred4x4_dc)(src, NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, int stride){
+    FUNCC(pred8x16_top_dc)(src, stride);
+    FUNCC(pred4x4_dc)(src, NULL, stride);
+}
+
 static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, int stride){
     FUNCC(pred8x8_dc)(src, stride);
     FUNCC(pred4x4_top_dc)(src, NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, int stride){
+    FUNCC(pred8x16_dc)(src, stride);
+    FUNCC(pred4x4_top_dc)(src, NULL, stride);
+}
+
 static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, int stride){
     FUNCC(pred8x8_left_dc)(src, stride);
     FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
     FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, int stride){
+    FUNCC(pred8x16_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
+}
+
 static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, int stride){
     FUNCC(pred8x8_left_dc)(src, stride);
     FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
     FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
 }
 
+static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, int stride){
+    FUNCC(pred8x16_left_dc)(src, stride);
+    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
+    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
+}
+
 static void FUNCC(pred8x8_plane)(uint8_t *_src, int _stride){
   int j, k;
   int a;
@@ -1126,8 +1148,24 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, c
         FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
 
+static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+    for(i=4; i<8; i++)
+        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
+
 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
     int i;
     for(i=0; i<4; i++)
         FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
 }
+
+static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){
+    int i;
+    for(i=0; i<4; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
+    for(i=4; i<8; i++)
+        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
+}
diff --git a/libavcodec/libspeexdec.c b/libavcodec/libspeexdec.c
index 7ee53b04e5..91f190525d 100644
--- a/libavcodec/libspeexdec.c
+++ b/libavcodec/libspeexdec.c
@@ -18,11 +18,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "avcodec.h"
 #include <speex/speex.h>
 #include <speex/speex_header.h>
 #include <speex/speex_stereo.h>
 #include <speex/speex_callbacks.h>
+#include "avcodec.h"
 
 typedef struct {
     SpeexBits bits;
@@ -60,14 +60,14 @@ static av_cold int libspeex_decode_init(AVCodecContext *avctx)
         mode = speex_lib_get_mode(s->header->mode);
         if (!mode) {
             av_log(avctx, AV_LOG_ERROR, "Unknown Speex mode %d", s->header->mode);
-            return -1;
+            return AVERROR_INVALIDDATA;
         }
     } else
         av_log(avctx, AV_LOG_INFO, "Missing Speex header, assuming defaults.\n");
 
     if (avctx->channels > 2) {
         av_log(avctx, AV_LOG_ERROR, "Only stereo and mono are supported.\n");
-        return -1;
+        return AVERROR(EINVAL);
     }
 
     speex_bits_init(&s->bits);
@@ -99,32 +99,42 @@ static int libspeex_decode_frame(AVCodecContext *avctx,
     uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     LibSpeexContext *s = avctx->priv_data;
-    int16_t *output = data, *end;
-    int i, num_samples;
-
-    num_samples = s->frame_size * avctx->channels;
-    end = output + *data_size / sizeof(*output);
-
-    speex_bits_read_from(&s->bits, buf, buf_size);
-
-    for (i = 0; speex_bits_remaining(&s->bits) && output + num_samples < end; i++) {
-        int ret = speex_decode_int(s->dec_state, &s->bits, output);
-        if (ret <= -2) {
-            av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
-            return -1;
-        } else if (ret == -1)
-            // end of stream
-            break;
+    int16_t *output = data;
+    int out_size, ret, consumed = 0;
+
+    /* check output buffer size */
+    out_size = s->frame_size * avctx->channels *
+               av_get_bytes_per_sample(avctx->sample_fmt);
+    if (*data_size < out_size) {
+        av_log(avctx, AV_LOG_ERROR, "Output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
 
-        if (avctx->channels == 2)
-            speex_decode_stereo_int(output, s->frame_size, &s->stereo);
+    /* if there is not enough data left for the smallest possible frame,
+       reset the libspeex buffer using the current packet, otherwise ignore
+       the current packet and keep decoding frames from the libspeex buffer. */
+    if (speex_bits_remaining(&s->bits) < 43) {
+        /* check for flush packet */
+        if (!buf || !buf_size) {
+            *data_size = 0;
+            return buf_size;
+        }
+        /* set new buffer */
+        speex_bits_read_from(&s->bits, buf, buf_size);
+        consumed = buf_size;
+    }
 
-        output += num_samples;
+    /* decode a single frame */
+    ret = speex_decode_int(s->dec_state, &s->bits, output);
+    if (ret <= -2) {
+        av_log(avctx, AV_LOG_ERROR, "Error decoding Speex frame.\n");
+        return AVERROR_INVALIDDATA;
     }
+    if (avctx->channels == 2)
+        speex_decode_stereo_int(output, s->frame_size, &s->stereo);
 
-    avctx->frame_size = s->frame_size * i;
-    *data_size = avctx->channels * avctx->frame_size * sizeof(*output);
-    return buf_size;
+    *data_size = out_size;
+    return consumed;
 }
 
 static av_cold int libspeex_decode_close(AVCodecContext *avctx)
@@ -138,6 +148,12 @@ static av_cold int libspeex_decode_close(AVCodecContext *avctx)
     return 0;
 }
 
+static av_cold void libspeex_decode_flush(AVCodecContext *avctx)
+{
+    LibSpeexContext *s = avctx->priv_data;
+    speex_bits_reset(&s->bits);
+}
+
 AVCodec ff_libspeex_decoder = {
     .name           = "libspeex",
     .type           = AVMEDIA_TYPE_AUDIO,
@@ -146,5 +162,7 @@ AVCodec ff_libspeex_decoder = {
     .init           = libspeex_decode_init,
     .close          = libspeex_decode_close,
     .decode         = libspeex_decode_frame,
+    .flush          = libspeex_decode_flush,
+    .capabilities   = CODEC_CAP_SUBFRAMES | CODEC_CAP_DELAY,
     .long_name = NULL_IF_CONFIG_SMALL("libspeex Speex"),
 };
diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c
index b5ba285bc9..f5f169a8e3 100644
--- a/libavcodec/mpegaudiodec.c
+++ b/libavcodec/mpegaudiodec.c
@@ -1893,24 +1893,50 @@ typedef struct MP3On4DecodeContext {
     int syncword; ///< syncword patch
     const uint8_t *coff; ///< channels offsets in output buffer
     MPADecodeContext *mp3decctx[5]; ///< MPADecodeContext for every decoder instance
+    OUT_INT *decoded_buf;           ///< output buffer for decoded samples
 } MP3On4DecodeContext;
 
 #include "mpeg4audio.h"
 
 /* Next 3 arrays are indexed by channel config number (passed via codecdata) */
 static const uint8_t mp3Frames[8] = {0,1,1,2,3,3,4,5};   /* number of mp3 decoder instances */
-/* offsets into output buffer, assume output order is FL FR BL BR C LFE */
+/* offsets into output buffer, assume output order is FL FR C LFE BL BR SL SR */
 static const uint8_t chan_offset[8][5] = {
     {0},
     {0},            // C
     {0},            // FLR
     {2,0},          // C FLR
     {2,0,3},        // C FLR BS
-    {4,0,2},        // C FLR BLRS
-    {4,0,2,5},      // C FLR BLRS LFE
-    {4,0,2,6,5},    // C FLR BLRS BLR LFE
+    {2,0,3},        // C FLR BLRS
+    {2,0,4,3},      // C FLR BLRS LFE
+    {2,0,6,4,3},    // C FLR BLRS BLR LFE
 };
 
+/* mp3on4 channel layouts */
+static const int16_t chan_layout[8] = {
+    0,
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_4POINT0,
+    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT1,
+    AV_CH_LAYOUT_7POINT1
+};
+
+static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
+{
+    MP3On4DecodeContext *s = avctx->priv_data;
+    int i;
+
+    for (i = 0; i < s->frames; i++)
+        av_free(s->mp3decctx[i]);
+
+    av_freep(&s->decoded_buf);
+
+    return 0;
+}
+
 
 static int decode_init_mp3on4(AVCodecContext * avctx)
 {
@@ -1931,6 +1957,7 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
     s->frames = mp3Frames[cfg.chan_config];
     s->coff = chan_offset[cfg.chan_config];
     avctx->channels = ff_mpeg4audio_channels[cfg.chan_config];
+    avctx->channel_layout = chan_layout[cfg.chan_config];
 
     if (cfg.sample_rate < 16000)
         s->syncword = 0xffe00000;
@@ -1944,6 +1971,8 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
      */
     // Allocate zeroed memory for the first decoder context
     s->mp3decctx[0] = av_mallocz(sizeof(MPADecodeContext));
+    if (!s->mp3decctx[0])
+        goto alloc_fail;
     // Put decoder context in place to make init_decode() happy
     avctx->priv_data = s->mp3decctx[0];
     decode_init(avctx);
@@ -1956,23 +1985,38 @@ static int decode_init_mp3on4(AVCodecContext * avctx)
      */
     for (i = 1; i < s->frames; i++) {
         s->mp3decctx[i] = av_mallocz(sizeof(MPADecodeContext));
+        if (!s->mp3decctx[i])
+            goto alloc_fail;
         s->mp3decctx[i]->adu_mode = 1;
         s->mp3decctx[i]->avctx = avctx;
+        s->mp3decctx[i]->mpadsp = s->mp3decctx[0]->mpadsp;
+    }
+
+    /* Allocate buffer for multi-channel output if needed */
+    if (s->frames > 1) {
+        s->decoded_buf = av_malloc(MPA_FRAME_SIZE * MPA_MAX_CHANNELS *
+                                   sizeof(*s->decoded_buf));
+        if (!s->decoded_buf)
+            goto alloc_fail;
     }
 
     return 0;
+alloc_fail:
+    decode_close_mp3on4(avctx);
+    return AVERROR(ENOMEM);
 }
 
 
-static av_cold int decode_close_mp3on4(AVCodecContext * avctx)
+static void flush_mp3on4(AVCodecContext *avctx)
 {
-    MP3On4DecodeContext *s = avctx->priv_data;
     int i;
+    MP3On4DecodeContext *s = avctx->priv_data;
 
-    for (i = 0; i < s->frames; i++)
-        av_free(s->mp3decctx[i]);
-
-    return 0;
+    for (i = 0; i < s->frames; i++) {
+        MPADecodeContext *m = s->mp3decctx[i];
+        memset(m->synth_buf, 0, sizeof(m->synth_buf));
+        m->last_buf_size = 0;
+    }
 }
 
 
@@ -1987,12 +2031,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
     int fsize, len = buf_size, out_size = 0;
     uint32_t header;
     OUT_INT *out_samples = data;
-    OUT_INT decoded_buf[MPA_FRAME_SIZE * MPA_MAX_CHANNELS];
     OUT_INT *outptr, *bp;
-    int fr, j, n;
+    int fr, j, n, ch;
 
-    if(*data_size < MPA_FRAME_SIZE * MPA_MAX_CHANNELS * s->frames * sizeof(OUT_INT))
-        return -1;
+    if (*data_size < MPA_FRAME_SIZE * avctx->channels * sizeof(OUT_INT)) {
+        av_log(avctx, AV_LOG_ERROR, "output buffer is too small\n");
+        return AVERROR(EINVAL);
+    }
 
     *data_size = 0;
     // Discard too short frames
@@ -2000,10 +2045,11 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
         return -1;
 
     // If only one decoder interleave is not needed
-    outptr = s->frames == 1 ? out_samples : decoded_buf;
+    outptr = s->frames == 1 ? out_samples : s->decoded_buf;
 
     avctx->bit_rate = 0;
 
+    ch = 0;
     for (fr = 0; fr < s->frames; fr++) {
         fsize = AV_RB16(buf) >> 4;
         fsize = FFMIN3(fsize, len, MPA_MAX_CODED_FRAME_SIZE);
@@ -2016,6 +2062,14 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
             break;
 
         avpriv_mpegaudio_decode_header((MPADecodeHeader *)m, header);
+
+        if (ch + m->nb_channels > avctx->channels) {
+            av_log(avctx, AV_LOG_ERROR, "frame channel count exceeds codec "
+                                        "channel count\n");
+            return AVERROR_INVALIDDATA;
+        }
+        ch += m->nb_channels;
+
         out_size += mp_decode_frame(m, outptr, buf, fsize);
         buf += fsize;
         len -= fsize;
@@ -2026,13 +2080,13 @@ static int decode_frame_mp3on4(AVCodecContext * avctx,
             bp = out_samples + s->coff[fr];
             if(m->nb_channels == 1) {
                 for(j = 0; j < n; j++) {
-                    *bp = decoded_buf[j];
+                    *bp = s->decoded_buf[j];
                     bp += avctx->channels;
                 }
             } else {
                 for(j = 0; j < n; j++) {
-                    bp[0] = decoded_buf[j++];
-                    bp[1] = decoded_buf[j];
+                    bp[0] = s->decoded_buf[j++];
+                    bp[1] = s->decoded_buf[j];
                     bp += avctx->channels;
                 }
             }
@@ -2110,7 +2164,7 @@ AVCodec ff_mp3on4_decoder = {
     .init           = decode_init_mp3on4,
     .close          = decode_close_mp3on4,
     .decode         = decode_frame_mp3on4,
-    .flush          = flush,
+    .flush          = flush_mp3on4,
     .long_name      = NULL_IF_CONFIG_SMALL("MP3onMP4"),
 };
 #endif
diff --git a/libavcodec/mpegaudiodec_float.c b/libavcodec/mpegaudiodec_float.c
index 2fde46a5cd..312b84278f 100644
--- a/libavcodec/mpegaudiodec_float.c
+++ b/libavcodec/mpegaudiodec_float.c
@@ -83,7 +83,7 @@ AVCodec ff_mp3on4float_decoder = {
     .init           = decode_init_mp3on4,
     .close          = decode_close_mp3on4,
     .decode         = decode_frame_mp3on4,
-    .flush          = flush,
+    .flush          = flush_mp3on4,
     .long_name      = NULL_IF_CONFIG_SMALL("MP3onMP4"),
 };
 #endif
diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c
index 00fb0a73d9..bf4fd0f016 100644
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
 }
 
 static av_always_inline
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
+void weight_h264_W_altivec(uint8_t *block, int stride, int height,
+                           int log2_denom, int weight, int offset, int w)
 {
     int y, aligned;
     vec_u8 vblock;
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
     voffset = vec_splat(vtemp, 5);
     aligned = !((unsigned long)block & 0xf);
 
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
         vblock = vec_ld(0, block);
 
         v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
 }
 
 static av_always_inline
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-                               int weightd, int weights, int offset, int w, int h)
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
+                             int log2_denom, int weightd, int weights, int offset, int w)
 {
     int y, dst_aligned, src_aligned;
     vec_u8 vsrc, vdst;
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
     dst_aligned = !((unsigned long)dst & 0xf);
     src_aligned = !((unsigned long)src & 0xf);
 
-    for (y=0; y<h; y++) {
+    for (y = 0; y < height; y++) {
         vdst = vec_ld(0, dst);
         vsrc = vec_ld(0, src);
 
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
     }
 }
 
-#define H264_WEIGHT(W,H) \
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
-    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
+#define H264_WEIGHT(W) \
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
+                                                   int log2_denom, int weight, int offset){ \
+    weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
 }\
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
-    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
+                                                     int log2_denom, int weightd, int weights, int offset){ \
+    biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
 }
 
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
+H264_WEIGHT(16)
+H264_WEIGHT( 8)
 
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
         c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
         c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
 
-        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
-        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
-        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
-        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
-        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
-        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
-        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
-        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
-        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
-        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
     }
     }
 }
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 205e3600c5..405fc31318 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -158,6 +158,8 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height, int l
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV420P10LE:
     case PIX_FMT_YUV420P10BE:
+    case PIX_FMT_YUV422P9LE:
+    case PIX_FMT_YUV422P9BE:
     case PIX_FMT_YUV422P10LE:
     case PIX_FMT_YUV422P10BE:
     case PIX_FMT_YUV444P9LE:
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 2643b34d67..8c0380315e 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -41,24 +41,57 @@ static void free_buffers(VP8Context *s)
     av_freep(&s->top_nnz);
     av_freep(&s->edge_emu_buffer);
     av_freep(&s->top_border);
-    av_freep(&s->segmentation_map);
 
     s->macroblocks = NULL;
 }
 
-static void vp8_decode_flush(AVCodecContext *avctx)
+static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
+{
+    int ret;
+    if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
+        return ret;
+    if (!s->maps_are_invalid && s->num_maps_to_be_freed) {
+        f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
+    } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
+        ff_thread_release_buffer(s->avctx, f);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
+}
+
+static void vp8_release_frame(VP8Context *s, AVFrame *f, int is_close)
+{
+    if (!is_close) {
+        if (f->ref_index[0]) {
+            assert(s->num_maps_to_be_freed < FF_ARRAY_ELEMS(s->segmentation_maps));
+            s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
+            f->ref_index[0] = NULL;
+        }
+    } else {
+        av_freep(&f->ref_index[0]);
+    }
+    ff_thread_release_buffer(s->avctx, f);
+}
+
+static void vp8_decode_flush_impl(AVCodecContext *avctx, int force, int is_close)
 {
     VP8Context *s = avctx->priv_data;
     int i;
 
-    if (!avctx->is_copy) {
+    if (!avctx->is_copy || force) {
         for (i = 0; i < 5; i++)
             if (s->frames[i].data[0])
-                ff_thread_release_buffer(avctx, &s->frames[i]);
+                vp8_release_frame(s, &s->frames[i], is_close);
     }
     memset(s->framep, 0, sizeof(s->framep));
 
     free_buffers(s);
+    s->maps_are_invalid = 1;
+}
+
+static void vp8_decode_flush(AVCodecContext *avctx)
+{
+    vp8_decode_flush_impl(avctx, 0, 0);
 }
 
 static int update_dimensions(VP8Context *s, int width, int height)
@@ -68,7 +101,7 @@ static int update_dimensions(VP8Context *s, int width, int height)
         if (av_image_check_size(width, height, 0, s->avctx))
             return AVERROR_INVALIDDATA;
 
-        vp8_decode_flush(s->avctx);
+        vp8_decode_flush_impl(s->avctx, 1, 0);
 
         avcodec_set_dimensions(s->avctx, width, height);
     }
@@ -81,10 +114,9 @@ static int update_dimensions(VP8Context *s, int width, int height)
     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
-    s->segmentation_map        = av_mallocz(s->mb_width*s->mb_height);
 
     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
-        !s->top_nnz || !s->top_border || !s->segmentation_map)
+        !s->top_nnz || !s->top_border)
         return AVERROR(ENOMEM);
 
     s->macroblocks        = s->macroblocks_base + 1;
@@ -1508,6 +1540,14 @@ static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
     }
 }
 
+static void release_queued_segmaps(VP8Context *s, int is_close)
+{
+    int leave_behind = is_close ? 0 : !s->maps_are_invalid;
+    while (s->num_maps_to_be_freed > leave_behind)
+        av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
+    s->maps_are_invalid = 0;
+}
+
 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
                             AVPacket *avpkt)
 {
@@ -1516,6 +1556,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     enum AVDiscard skip_thresh;
     AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
 
+    release_queued_segmaps(s, 0);
+
     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
         return ret;
 
@@ -1538,7 +1580,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
-            ff_thread_release_buffer(avctx, &s->frames[i]);
+            vp8_release_frame(s, &s->frames[i], 0);
 
     // find a free buffer
     for (i = 0; i < 5; i++)
@@ -1559,8 +1601,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
     curframe->key_frame = s->keyframe;
     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     curframe->reference = referenced ? 3 : 0;
-    curframe->ref_index[0] = s->segmentation_map;
-    if ((ret = ff_thread_get_buffer(avctx, curframe))) {
+    if ((ret = vp8_alloc_frame(s, curframe))) {
         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
         return ret;
     }
@@ -1652,8 +1693,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
 
-            decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy,
-                           prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL);
+            decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
+                           prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);
 
             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
 
@@ -1736,7 +1777,8 @@ static av_cold int vp8_decode_init(AVCodecContext *avctx)
 
 static av_cold int vp8_decode_free(AVCodecContext *avctx)
 {
-    vp8_decode_flush(avctx);
+    vp8_decode_flush_impl(avctx, 0, 1);
+    release_queued_segmaps(avctx->priv_data, 1);
     return 0;
 }
 
diff --git a/libavcodec/vp8.h b/libavcodec/vp8.h
index 468e28e8d5..36c21df217 100644
--- a/libavcodec/vp8.h
+++ b/libavcodec/vp8.h
@@ -130,7 +130,6 @@ typedef struct {
 
     uint8_t *intra4x4_pred_mode_top;
     uint8_t intra4x4_pred_mode_left[4];
-    uint8_t *segmentation_map;
 
     /**
      * Macroblocks can have one of 4 different quants in a frame when
@@ -237,6 +236,16 @@ typedef struct {
     H264PredContext hpc;
     vp8_mc_func put_pixels_tab[3][3][3];
     AVFrame frames[5];
+
+    /**
+     * A list of segmentation_map buffers that are to be free()'ed in
+     * the next decoding iteration. We can't free() them right away
+     * because the map may still be used by subsequent decoding threads.
+     * Unused if frame threading is off.
+     */
+    uint8_t *segmentation_maps[5];
+    int num_maps_to_be_freed;
+    int maps_are_invalid;
 } VP8Context;
 
 #endif /* AVCODEC_VP8_H */
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 70f1b86c44..6627d21bd8 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1055,14 +1055,6 @@ emu_edge mmx
 ;                           int32_t max, unsigned int len)
 ;-----------------------------------------------------------------------------
 
-%macro SPLATD_MMX 1
-    punpckldq  %1, %1
-%endmacro
-
-%macro SPLATD_SSE2 1
-    pshufd  %1, %1, 0
-%endmacro
-
 %macro VECTOR_CLIP_INT32 4
 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
 %ifidn %1, sse2
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 2deb577ca6..37e7a094ce 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,146 @@
 
 SECTION_TEXT
 
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;---------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%ifdef ARCH_X86_64
+cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+    movss   m0, mulm
+%endif
+    SPLATD  m0
+    shl     lenq, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+%ifidn %1, sse2
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM
+%define SPLATD SPLATD_SSE
+%define movdqa movaps
+INT32_TO_FLOAT_FMUL_SCALAR sse, 5
+%undef movdqa
+%define SPLATD SPLATD_SSE2
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+%undef SPLATD
+
+
+;------------------------------------------------------------------------------
+; void ff_float_to_int16(int16_t *dst, const float *src, long len);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16 2
+cglobal float_to_int16_%1, 3,3,%2, dst, src, len
+    add       lenq, lenq
+    lea       srcq, [srcq+2*lenq]
+    add       dstq, lenq
+    neg       lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq    m0, [srcq+2*lenq   ]
+    cvtps2dq    m1, [srcq+2*lenq+16]
+    packssdw    m0, m1
+    mova  [dstq+lenq], m0
+%else
+    cvtps2pi    m0, [srcq+2*lenq   ]
+    cvtps2pi    m1, [srcq+2*lenq+ 8]
+    cvtps2pi    m2, [srcq+2*lenq+16]
+    cvtps2pi    m3, [srcq+2*lenq+24]
+    packssdw    m0, m1
+    packssdw    m2, m3
+    mova  [dstq+lenq  ], m0
+    mova  [dstq+lenq+8], m2
+%endif
+    add       lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16 sse2, 2
+INIT_MMX
+FLOAT_TO_INT16 sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16 3dnow, 0
+%undef cvtps2pi
+
+
+;-------------------------------------------------------------------------------
+; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
+;-------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_INTERLEAVE2 1
+cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
+    lea      lenq, [4*r2q]
+    mov     src1q, [src0q+gprsize]
+    mov     src0q, [src0q]
+    add      dstq, lenq
+    add     src0q, lenq
+    add     src1q, lenq
+    neg      lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq   m0, [src0q+lenq]
+    cvtps2dq   m1, [src1q+lenq]
+    packssdw   m0, m1
+    movhlps    m1, m0
+    punpcklwd  m0, m1
+    mova  [dstq+lenq], m0
+%else
+    cvtps2pi   m0, [src0q+lenq  ]
+    cvtps2pi   m1, [src0q+lenq+8]
+    cvtps2pi   m2, [src1q+lenq  ]
+    cvtps2pi   m3, [src1q+lenq+8]
+    packssdw   m0, m1
+    packssdw   m2, m3
+    mova       m1, m0
+    punpcklwd  m0, m2
+    punpckhwd  m1, m2
+    mova  [dstq+lenq  ], m0
+    mova  [dstq+lenq+8], m1
+%endif
+    add      lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_INTERLEAVE2 3dnow
+%undef cvtps2pi
+%define movdqa movaps
+FLOAT_TO_INT16_INTERLEAVE2 sse
+%undef movdqa
+INIT_XMM
+FLOAT_TO_INT16_INTERLEAVE2 sse2
+
+
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index ba2c2c9bd5..a3d8f89816 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -26,133 +26,32 @@
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"
 
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
-        "movlhps  %%xmm1,    %%xmm0 \n"
-        "movlhps  %%xmm3,    %%xmm2 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm2 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm2, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm1 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm1, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
+#if HAVE_YASM
 
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    // not bit-exact: pf2id uses different rounding than C and SSE
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "pf2id       (%2,%0,2)  , %%mm0     \n\t"
-        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
-        "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
-        "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
-        "packssdw   %%mm1       , %%mm0     \n\t"
-        "packssdw   %%mm3       , %%mm2     \n\t"
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        "femms                              \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
 
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
-        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
-        "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
-        "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
-        "packssdw   %%mm1       , %%mm0     \n\t"
-        "packssdw   %%mm3       , %%mm2     \n\t"
-        "movq       %%mm0       ,  (%1,%0)  \n\t"
-        "movq       %%mm2       , 8(%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        "emms                               \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
+void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
+void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
-    x86_reg reglen = len;
-    __asm__ volatile(
-        "add        %0          , %0        \n\t"
-        "lea         (%2,%0,2)  , %2        \n\t"
-        "add        %0          , %1        \n\t"
-        "neg        %0                      \n\t"
-        "1:                                 \n\t"
-        "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
-        "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
-        "packssdw   %%xmm1      , %%xmm0    \n\t"
-        "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
-        "add        $16         , %0        \n\t"
-        " js 1b                             \n\t"
-        :"+r"(reglen), "+r"(dst), "+r"(src)
-    );
-}
+void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
 
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 
-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
     DECLARE_ALIGNED(16, int16_t, tmp)[len];\
     int i,j,c;\
     for(c=0; c<channels; c++){\
-        float_to_int16_##cpu(tmp, src[c], len);\
+        ff_float_to_int16_##cpu(tmp, src[c], len);\
         for(i=0, j=c; i<len; i++, j+=channels)\
             dst[j] = tmp[i];\
     }\
@@ -160,73 +59,18 @@ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const
 \
 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
     if(channels==1)\
-        float_to_int16_##cpu(dst, src[0], len);\
+        ff_float_to_int16_##cpu(dst, src[0], len);\
     else if(channels==2){\
-        x86_reg reglen = len; \
-        const float *src0 = src[0];\
-        const float *src1 = src[1];\
-        __asm__ volatile(\
-            "shl $2, %0 \n"\
-            "add %0, %1 \n"\
-            "add %0, %2 \n"\
-            "add %0, %3 \n"\
-            "neg %0 \n"\
-            body\
-            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
-        );\
+        ff_float_to_int16_interleave2_##cpu(dst, src, len);\
     }else if(channels==6){\
         ff_float_to_int16_interleave6_##cpu(dst, src, len);\
     }else\
         float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 }
 
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
-    "1:                         \n"
-    "pf2id     (%2,%0), %%mm0   \n"
-    "pf2id    8(%2,%0), %%mm1   \n"
-    "pf2id     (%3,%0), %%mm2   \n"
-    "pf2id    8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "femms                      \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
-    "1:                         \n"
-    "cvtps2pi  (%2,%0), %%mm0   \n"
-    "cvtps2pi 8(%2,%0), %%mm1   \n"
-    "cvtps2pi  (%3,%0), %%mm2   \n"
-    "cvtps2pi 8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "emms                       \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
-    "1:                         \n"
-    "cvtps2dq  (%2,%0), %%xmm0  \n"
-    "cvtps2dq  (%3,%0), %%xmm1  \n"
-    "packssdw   %%xmm1, %%xmm0  \n"
-    "movhlps    %%xmm0, %%xmm1  \n"
-    "punpcklwd  %%xmm1, %%xmm0  \n"
-    "movdqa     %%xmm0, (%1,%0) \n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-)
+FLOAT_TO_INT16_INTERLEAVE(3dnow)
+FLOAT_TO_INT16_INTERLEAVE(sse)
+FLOAT_TO_INT16_INTERLEAVE(sse2)
 
 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
     if(channels==6)
@@ -235,7 +79,6 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
 
-#if HAVE_YASM
 void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
 void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
 
@@ -269,34 +112,32 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
 
-    if (mm_flags & AV_CPU_FLAG_MMX) {
 #if HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_MMX) {
         c->float_interleave = float_interleave_mmx;
-#endif
 
-        if(mm_flags & AV_CPU_FLAG_3DNOW){
+        if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-                c->float_to_int16 = float_to_int16_3dnow;
+                c->float_to_int16 = ff_float_to_int16_3dnow;
                 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
             }
         }
-        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+        if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) {
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
-        if(mm_flags & AV_CPU_FLAG_SSE){
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
-            c->float_to_int16 = float_to_int16_sse;
+        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+            c->float_to_int16 = ff_float_to_int16_sse;
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
-#if HAVE_YASM
             c->float_interleave = float_interleave_sse;
-#endif
         }
-        if(mm_flags & AV_CPU_FLAG_SSE2){
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-            c->float_to_int16 = float_to_int16_sse2;
+        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+            c->float_to_int16 = ff_float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
         }
     }
+#endif
 }
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index bb0af86097..cc96cb1f3b 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -28,21 +28,20 @@ SECTION .text
 ;-----------------------------------------------------------------------------
 ; biweight pred:
 ;
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
-;                               int log2_denom, int weightd, int weights,
-;                               int offset);
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
+;                            int height, int log2_denom, int weightd,
+;                            int weights, int offset);
 ; and
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
-;                             int log2_denom, int weight,
-;                             int offset);
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
+;                          int log2_denom, int weight, int offset);
 ;-----------------------------------------------------------------------------
 
 %macro WEIGHT_SETUP 0
-    add        r4, r4
-    inc        r4
-    movd       m3, r3d
-    movd       m5, r4d
-    movd       m6, r2d
+    add        r5, r5
+    inc        r5
+    movd       m3, r4d
+    movd       m5, r5d
+    movd       m6, r3d
     pslld      m5, m6
     psrld      m5, 1
 %if mmsize == 16
@@ -71,60 +70,41 @@ SECTION .text
     packuswb      m0, m1
 %endmacro
 
-%macro WEIGHT_FUNC_DBL_MM 1
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
+INIT_MMX
+cglobal h264_weight_16_mmx2, 6, 6, 0
     WEIGHT_SETUP
-    mov        r2, %1
-%if %1 == 16
 .nextrow
     WEIGHT_OP 0,  4
     mova     [r0  ], m0
     WEIGHT_OP 8, 12
     mova     [r0+8], m0
     add        r0, r1
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
-%endif
-%endmacro
 
-INIT_MMX
-WEIGHT_FUNC_DBL_MM 16
-WEIGHT_FUNC_DBL_MM  8
-
-%macro WEIGHT_FUNC_MM 4
-cglobal h264_weight_%1x%2_%4, 7, 7, %3
+%macro WEIGHT_FUNC_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-    mov        r2, %2
-%if %2 == 16
 .nextrow
     WEIGHT_OP 0, mmsize/2
     mova     [r0], m0
     add        r0, r1
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-WEIGHT_FUNC_MM  8, 16,  0, mmx2
-WEIGHT_FUNC_MM  8,  8,  0, mmx2
-WEIGHT_FUNC_MM  8,  4,  0, mmx2
+WEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_MM 16, 16,  8, sse2
-WEIGHT_FUNC_MM 16,  8,  8, sse2
+WEIGHT_FUNC_MM 16, 8, sse2
 
-%macro WEIGHT_FUNC_HALF_MM 5
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
+%macro WEIGHT_FUNC_HALF_MM 3
+cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-    mov        r2, %2/2
+    sar       r2d, 1
     lea        r3, [r1*2]
-%if %2 == mmsize
 .nextrow
     WEIGHT_OP 0, r1
     movh     [r0], m0
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
     movh     [r0+r1], m0
 %endif
     add        r0, r3
-    dec        r2
+    dec        r2d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
-WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
-WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
 
 %macro BIWEIGHT_SETUP 0
-    add        r6, 1
-    or         r6, 1
-    add        r3, 1
-    movd       m3, r4d
-    movd       m4, r5d
-    movd       m5, r6d
-    movd       m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add        r4, 1
+    movd       m3, r5d
+    movd       m4, r6d
+    movd       m5, off_regd
+    movd       m6, r4d
     pslld      m5, m6
     psrld      m5, 1
 %if mmsize == 16
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
     packuswb   m0, m1
 %endmacro
 
-%macro BIWEIGHT_FUNC_DBL_MM 1
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
+INIT_MMX
+cglobal h264_biweight_16_mmx2, 7, 7, 0
     BIWEIGHT_SETUP
-    mov        r3, %1
-%if %1 == 16
+    movifnidn r3d, r3m
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, 4
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
     mova     [r0+8], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
-%endif
-%endmacro
 
-INIT_MMX
-BIWEIGHT_FUNC_DBL_MM 16
-BIWEIGHT_FUNC_DBL_MM  8
-
-%macro BIWEIGHT_FUNC_MM 4
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3
+%macro BIWEIGHT_FUNC_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
     BIWEIGHT_SETUP
-    mov        r3, %2
-%if %2 == 16
+    movifnidn r3d, r3m
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, mmsize/2
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
     mova       [r0], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
-BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
-BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
+BIWEIGHT_FUNC_MM  8, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_MM 16, 16,  8, sse2
-BIWEIGHT_FUNC_MM 16,  8,  8, sse2
+BIWEIGHT_FUNC_MM 16, 8, sse2
 
-%macro BIWEIGHT_FUNC_HALF_MM 5
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
+%macro BIWEIGHT_FUNC_HALF_MM 3
+cglobal h264_biweight_%1_%3, 7, 7, %2
     BIWEIGHT_SETUP
-    mov        r3, %2/2
+    movifnidn r3d, r3m
+    sar        r3, 1
     lea        r4, [r2*2]
-%if %2 == mmsize
 .nextrow
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, r2
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
 %endif
     add        r0, r4
     add        r1, r4
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
-%endif
 %endmacro
 
 INIT_MMX
-BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
-BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
 INIT_XMM
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
-BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
 
 %macro BIWEIGHT_SSSE3_SETUP 0
-    add        r6, 1
-    or         r6, 1
-    add        r3, 1
-    movd       m4, r4d
-    movd       m0, r5d
-    movd       m5, r6d
-    movd       m6, r3d
+%ifdef ARCH_X86_64
+%define off_regd r11d
+%else
+%define off_regd r3d
+%endif
+    mov  off_regd, r7m
+    add  off_regd, 1
+    or   off_regd, 1
+    add        r4, 1
+    movd       m4, r5d
+    movd       m0, r6d
+    movd       m5, off_regd
+    movd       m6, r4d
     pslld      m5, m6
     psrld      m5, 1
     punpcklbw  m4, m0
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
     packuswb   m0, m2
 %endmacro
 
-%macro BIWEIGHT_SSSE3_16 1
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
+INIT_XMM
+cglobal h264_biweight_16_ssse3, 7, 7, 8
     BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1
+    movifnidn r3d, r3m
 
-%if %1 == 16
 .nextrow
     movh       m0, [r0]
     movh       m2, [r0+8]
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
     mova       [r0], m0
     add        r0, r2
     add        r1, r2
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
-%endif
-%endmacro
 
 INIT_XMM
-BIWEIGHT_SSSE3_16 16
-BIWEIGHT_SSSE3_16  8
-
-%macro BIWEIGHT_SSSE3_8 1
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
+cglobal h264_biweight_8_ssse3, 7, 7, 8
     BIWEIGHT_SSSE3_SETUP
-    mov        r3, %1/2
+    movifnidn r3d, r3m
+    sar        r3, 1
     lea        r4, [r2*2]
 
-%if %1 == 16
 .nextrow
     movh       m0, [r0]
     movh       m1, [r1]
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
     movhps     [r0+r2], m0
     add        r0, r4
     add        r1, r4
-    dec        r3
+    dec        r3d
     jnz .nextrow
     REP_RET
-%else
-    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
-%endif
-%endmacro
-
-INIT_XMM
-BIWEIGHT_SSSE3_8 16
-BIWEIGHT_SSSE3_8  8
-BIWEIGHT_SSSE3_8  4
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 1c58d72d94..20df6fbab5 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -36,33 +36,26 @@ cextern pw_1
 SECTION .text
 
 ;-----------------------------------------------------------------------------
-; void h264_weight(uint8_t *dst, int stride, int log2_denom,
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
 ;                  int weight, int offset);
 ;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_32
-DECLARE_REG_TMP 2
-%else
-DECLARE_REG_TMP 10
-%endif
-
-%macro WEIGHT_PROLOGUE 1
-    mov t0, %1
+%macro WEIGHT_PROLOGUE 0
 .prologue
-    PROLOGUE 0,5,8
+    PROLOGUE 0,6,8
     movifnidn  r0, r0mp
     movifnidn r1d, r1m
-    movifnidn r3d, r3m
     movifnidn r4d, r4m
+    movifnidn r5d, r5m
 %endmacro
 
 %macro WEIGHT_SETUP 1
     mova       m0, [pw_1]
-    movd       m2, r2m
+    movd       m2, r3m
     pslld      m0, m2       ; 1<<log2_denom
     SPLATW     m0, m0
-    shl        r4, 19       ; *8, move to upper half of dword
-    lea        r4, [r4+r3*2+0x10000]
-    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
+    shl        r5, 19       ; *8, move to upper half of dword
+    lea        r5, [r5+r4*2+0x10000]
+    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
     pshufd     m3, m3, 0
     mova       m4, [pw_pixel_max]
     paddw      m2, [sq_1]   ; log2_denom+1
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
 %endmacro
 
 %macro WEIGHT_FUNC_DBL 1
-cglobal h264_weight_16x16_10_%1
-    WEIGHT_PROLOGUE 16
+cglobal h264_weight_16_10_%1
+    WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
 .nextrow
     WEIGHT_OP %1,  0
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
     WEIGHT_OP %1, 16
     mova [r0+16], m5
     add       r0, r1
-    dec       t0
+    dec       r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_16x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
 
 
 %macro WEIGHT_FUNC_MM 1
-cglobal h264_weight_8x16_10_%1
-    WEIGHT_PROLOGUE 16
+cglobal h264_weight_8_10_%1
+    WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
 .nextrow
     WEIGHT_OP  %1, 0
     mova     [r0], m5
     add        r0, r1
-    dec        t0
+    dec        r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_8x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
-
-cglobal h264_weight_8x4_10_%1
-    mov t0, 4
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
 
 
 %macro WEIGHT_FUNC_HALF_MM 1
-cglobal h264_weight_4x8_10_%1
-    WEIGHT_PROLOGUE 4
+cglobal h264_weight_4_10_%1
+    WEIGHT_PROLOGUE
+    sar         r2d, 1
     WEIGHT_SETUP %1
     lea         r3, [r1*2]
 .nextrow
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
     movh      [r0], m5
     movhps [r0+r1], m5
     add         r0, r3
-    dec         t0
+    dec         r2d
     jnz .nextrow
     REP_RET
-
-cglobal h264_weight_4x4_10_%1
-    mov t0, 2
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
-
-cglobal h264_weight_4x2_10_%1
-    mov t0, 1
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
 
 
 ;-----------------------------------------------------------------------------
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
-;                    int weightd, int weights, int offset);
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
+;                    int log2_denom, int weightd, int weights, int offset);
 ;-----------------------------------------------------------------------------
 %ifdef ARCH_X86_32
-DECLARE_REG_TMP 2,3
+DECLARE_REG_TMP 3
 %else
-DECLARE_REG_TMP 10,2
+DECLARE_REG_TMP 10
 %endif
 
-%macro BIWEIGHT_PROLOGUE 1
-    mov t0, %1
+%macro BIWEIGHT_PROLOGUE 0
 .prologue
     PROLOGUE 0,7,8
     movifnidn  r0, r0mp
     movifnidn  r1, r1mp
-    movifnidn t1d, r2m
-    movifnidn r4d, r4m
+    movifnidn r2d, r2m
     movifnidn r5d, r5m
     movifnidn r6d, r6m
+    movifnidn t0d, r7m
 %endmacro
 
 %macro BIWEIGHT_SETUP 1
-    lea        r6, [r6*4+1] ; (offset<<2)+1
-    or         r6, 1
-    shl        r5, 16
-    or         r4, r5
-    movd       m4, r4d      ; weightd | weights
-    movd       m5, r6d      ; (offset+1)|1
-    movd       m6, r3m      ; log2_denom
+    lea        t0, [t0*4+1] ; (offset<<2)+1
+    or         t0, 1
+    shl        r6, 16
+    or         r5, r6
+    movd       m4, r5d      ; weightd | weights
+    movd       m5, t0d      ; (offset+1)|1
+    movd       m6, r4m      ; log2_denom
     pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
     paddd      m6, [sq_1]
     pshufd     m4, m4, 0
     pshufd     m5, m5, 0
     mova       m3, [pw_pixel_max]
+    movifnidn r3d, r3m
 %ifnidn %1, sse4
     pxor       m7, m7
 %endif
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
 %endmacro
 
 %macro BIWEIGHT_FUNC_DBL 1
-cglobal h264_biweight_16x16_10_%1
-    BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_16_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
 .nextrow
     BIWEIGHT  %1,  0
     mova [r0   ], m0
     BIWEIGHT  %1, 16
     mova [r0+16], m0
-    add       r0, t1
-    add       r1, t1
-    dec       t0
+    add       r0, r2
+    add       r1, r2
+    dec       r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_16x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
 BIWEIGHT_FUNC_DBL sse4
 
 %macro BIWEIGHT_FUNC 1
-cglobal h264_biweight_8x16_10_%1
-    BIWEIGHT_PROLOGUE 16
+cglobal h264_biweight_8_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
 .nextrow
     BIWEIGHT %1, 0
     mova   [r0], m0
-    add      r0, t1
-    add      r1, t1
-    dec      t0
+    add      r0, r2
+    add      r1, r2
+    dec      r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_8x8_10_%1
-    mov t0, 8
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
-
-cglobal h264_biweight_8x4_10_%1
-    mov t0, 4
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
 %endmacro
 
 INIT_XMM
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
 BIWEIGHT_FUNC sse4
 
 %macro BIWEIGHT_FUNC_HALF 1
-cglobal h264_biweight_4x8_10_%1
-    BIWEIGHT_PROLOGUE 4
+cglobal h264_biweight_4_10_%1
+    BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-    lea        r4, [t1*2]
+    sar        r3d, 1
+    lea        r4, [r2*2]
 .nextrow
-    BIWEIGHT    %1, 0, t1
+    BIWEIGHT    %1, 0, r2
     movh   [r0   ], m0
-    movhps [r0+t1], m0
+    movhps [r0+r2], m0
     add         r0, r4
     add         r1, r4
-    dec         t0
+    dec         r3d
     jnz .nextrow
     REP_RET
-
-cglobal h264_biweight_4x4_10_%1
-    mov t0, 2
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
-
-cglobal h264_biweight_4x2_10_%1
-    mov t0, 1
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
 %endmacro
 
 INIT_XMM
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index 71beb262c9..b337462aec 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -298,63 +298,53 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
 /***********************************/
 /* weighted prediction */
 
-#define H264_WEIGHT(W, H, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
+#define H264_WEIGHT(W, OPT) \
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
+    int stride, int height, int log2_denom, int weight, int offset);
 
-#define H264_BIWEIGHT(W, H, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
-    uint8_t *src, int stride, int log2_denom, int weightd, \
+#define H264_BIWEIGHT(W, OPT) \
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
+    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
     int weights, int offset);
 
-#define H264_BIWEIGHT_MMX(W,H) \
-H264_WEIGHT  (W, H, mmx2) \
-H264_BIWEIGHT(W, H, mmx2)
-
-#define H264_BIWEIGHT_MMX_SSE(W,H) \
-H264_BIWEIGHT_MMX(W, H) \
-H264_WEIGHT      (W, H, sse2) \
-H264_BIWEIGHT    (W, H, sse2) \
-H264_BIWEIGHT    (W, H, ssse3)
-
-H264_BIWEIGHT_MMX_SSE(16, 16)
-H264_BIWEIGHT_MMX_SSE(16,  8)
-H264_BIWEIGHT_MMX_SSE( 8, 16)
-H264_BIWEIGHT_MMX_SSE( 8,  8)
-H264_BIWEIGHT_MMX_SSE( 8,  4)
-H264_BIWEIGHT_MMX    ( 4,  8)
-H264_BIWEIGHT_MMX    ( 4,  4)
-H264_BIWEIGHT_MMX    ( 4,  2)
-
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
-    int stride, int log2_denom, int weight, int offset);
-
-#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
-void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
-    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
+#define H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT  (W, mmx2) \
+H264_BIWEIGHT(W, mmx2)
+
+#define H264_BIWEIGHT_MMX_SSE(W) \
+H264_BIWEIGHT_MMX(W) \
+H264_WEIGHT      (W, sse2) \
+H264_BIWEIGHT    (W, sse2) \
+H264_BIWEIGHT    (W, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16)
+H264_BIWEIGHT_MMX_SSE( 8)
+H264_BIWEIGHT_MMX    ( 4)
+
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
+    int stride, int height, int log2_denom, int weight, int offset);
+
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
+    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
      int weightd, int weights, int offset);
 
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
-H264_WEIGHT_10  (W, H, DEPTH, sse2) \
-H264_WEIGHT_10  (W, H, DEPTH, sse4) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
-H264_BIWEIGHT_10(W, H, DEPTH, sse4)
-
-H264_BIWEIGHT_10_SSE(16, 16, 10)
-H264_BIWEIGHT_10_SSE(16,  8, 10)
-H264_BIWEIGHT_10_SSE( 8, 16, 10)
-H264_BIWEIGHT_10_SSE( 8,  8, 10)
-H264_BIWEIGHT_10_SSE( 8,  4, 10)
-H264_BIWEIGHT_10_SSE( 4,  8, 10)
-H264_BIWEIGHT_10_SSE( 4,  4, 10)
-H264_BIWEIGHT_10_SSE( 4,  2, 10)
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
+H264_WEIGHT_10  (W, DEPTH, sse2) \
+H264_WEIGHT_10  (W, DEPTH, sse4) \
+H264_BIWEIGHT_10(W, DEPTH, sse2) \
+H264_BIWEIGHT_10(W, DEPTH, sse4)
+
+H264_BIWEIGHT_10_SSE(16, 10)
+H264_BIWEIGHT_10_SSE( 8, 10)
+H264_BIWEIGHT_10_SSE( 4, 10)
 
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
 {
     int mm_flags = av_get_cpu_flags();
 
-    if (mm_flags & AV_CPU_FLAG_MMX2) {
+    if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
         c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
     }
 
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
-            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
-            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
-            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
-            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
-            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
-            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
+
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
 
             if (mm_flags&AV_CPU_FLAG_SSE2) {
                 c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                 c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                 c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
 
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
-                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
-                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
-                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
+                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
+                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
 
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
 
 #if HAVE_ALIGNED_STACK
                 c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_SSSE3) {
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
             }
             if (HAVE_AVX && mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                 c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
 #endif
 
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
 
                 c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                 c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
 #endif
             }
             if (mm_flags&AV_CPU_FLAG_SSE4) {
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
-
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
+
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
             }
 #if HAVE_AVX
             if (mm_flags&AV_CPU_FLAG_AVX) {
diff --git a/libavdevice/Makefile b/libavdevice/Makefile
index 879e933994..97dd380776 100644
--- a/libavdevice/Makefile
+++ b/libavdevice/Makefile
@@ -10,7 +10,7 @@ OBJS    = alldevices.o avdevice.o
 
 # input/output devices
 OBJS-$(CONFIG_ALSA_INDEV)                += alsa-audio-common.o \
-                                            alsa-audio-dec.o
+                                            alsa-audio-dec.o timefilter.o
 OBJS-$(CONFIG_ALSA_OUTDEV)               += alsa-audio-common.o \
                                             alsa-audio-enc.o
 OBJS-$(CONFIG_BKTR_INDEV)                += bktr.o
@@ -19,7 +19,7 @@ OBJS-$(CONFIG_DSHOW_INDEV)               += dshow.o dshow_enummediatypes.o \
                                             dshow_pin.o dshow_common.o
 OBJS-$(CONFIG_DV1394_INDEV)              += dv1394.o
 OBJS-$(CONFIG_FBDEV_INDEV)               += fbdev.o
-OBJS-$(CONFIG_JACK_INDEV)                += jack_audio.o
+OBJS-$(CONFIG_JACK_INDEV)                += jack_audio.o timefilter.o
 OBJS-$(CONFIG_LAVFI_INDEV)               += lavfi.o
 OBJS-$(CONFIG_OPENAL_INDEV)              += openal-dec.o
 OBJS-$(CONFIG_OSS_INDEV)                 += oss_audio.o
@@ -39,4 +39,6 @@ OBJS-$(CONFIG_LIBDC1394_INDEV)           += libdc1394.o
 SKIPHEADERS-$(HAVE_ALSA_ASOUNDLIB_H)     += alsa-audio.h
 SKIPHEADERS-$(HAVE_SNDIO_H)              += sndio_common.h
 
+TESTPROGS = timefilter
+
 include $(SRC_PATH)/subdir.mak
diff --git a/libavdevice/alsa-audio.h b/libavdevice/alsa-audio.h
index 83af464865..e453a2011b 100644
--- a/libavdevice/alsa-audio.h
+++ b/libavdevice/alsa-audio.h
@@ -33,7 +33,7 @@
 #include <alsa/asoundlib.h>
 #include "config.h"
 #include "libavutil/log.h"
-#include "libavformat/timefilter.h"
+#include "timefilter.h"
 #include "avdevice.h"
 
 /* XXX: we make the assumption that the soundcard accepts this format */
diff --git a/libavdevice/jack_audio.c b/libavdevice/jack_audio.c
index 72554f0ebe..42499d363b 100644
--- a/libavdevice/jack_audio.c
+++ b/libavdevice/jack_audio.c
@@ -28,7 +28,8 @@
 #include "libavutil/fifo.h"
 #include "libavutil/opt.h"
 #include "libavcodec/avcodec.h"
-#include "libavformat/timefilter.h"
+#include "libavformat/avformat.h"
+#include "timefilter.h"
 #include "avdevice.h"
 
 /**
diff --git a/libavformat/timefilter.c b/libavdevice/timefilter.c
index 5c780c8d90..3c67a59881 100644
--- a/libavformat/timefilter.c
+++ b/libavdevice/timefilter.c
@@ -24,8 +24,8 @@
 
 
 #include "config.h"
-#include "avformat.h"
 #include "timefilter.h"
+#include "libavutil/mem.h"
 
 struct TimeFilter {
     /// Delay Locked Loop data. These variables refer to mathematical
diff --git a/libavformat/timefilter.h b/libavdevice/timefilter.h
index ded8ec7bed..4580904092 100644
--- a/libavformat/timefilter.h
+++ b/libavdevice/timefilter.h
@@ -22,8 +22,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVFORMAT_TIMEFILTER_H
-#define AVFORMAT_TIMEFILTER_H
+#ifndef AVDEVICE_TIMEFILTER_H
+#define AVDEVICE_TIMEFILTER_H
 
 /**
  * Opaque type representing a time filter state
@@ -94,4 +94,4 @@ void ff_timefilter_reset(TimeFilter *);
  */
 void ff_timefilter_destroy(TimeFilter *);
 
-#endif /* AVFORMAT_TIMEFILTER_H */
+#endif /* AVDEVICE_TIMEFILTER_H */
diff --git a/libavformat/Makefile b/libavformat/Makefile
index ddd1ab7833..15ccb291ed 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -354,11 +354,8 @@ OBJS-$(CONFIG_RTP_PROTOCOL)              += rtpproto.o
 OBJS-$(CONFIG_TCP_PROTOCOL)              += tcp.o
 OBJS-$(CONFIG_UDP_PROTOCOL)              += udp.o
 
-# libavdevice dependencies
-OBJS-$(CONFIG_ALSA_INDEV)                += timefilter.o
-OBJS-$(CONFIG_JACK_INDEV)                += timefilter.o
 
-TESTPROGS = seek timefilter
+TESTPROGS = seek
 TOOLS     = pktdumper probetest
 
 include $(SRC_PATH)/subdir.mak
diff --git a/libavformat/flvdec.c b/libavformat/flvdec.c
index 7025ddd014..d48d367500 100644
--- a/libavformat/flvdec.c
+++ b/libavformat/flvdec.c
@@ -228,8 +228,9 @@ static int amf_parse_object(AVFormatContext *s, AVStream *astream, AVStream *vst
         case AMF_DATA_TYPE_OBJECT: {
             unsigned int keylen;
 
-            if (vstream && ioc->seekable && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1)
-                if (parse_keyframes_index(s, ioc, vstream, max_pos) < 0)
+            if ((vstream || astream) && ioc->seekable && key && !strcmp(KEYFRAMES_TAG, key) && depth == 1)
+                if (parse_keyframes_index(s, ioc, vstream ? vstream : astream,
+                                          max_pos) < 0)
                     av_log(s, AV_LOG_ERROR, "Keyframe index parsing failed\n");
 
             while(avio_tell(ioc) < max_pos - 2 && (keylen = avio_rb16(ioc))) {
diff --git a/libavformat/flvenc.c b/libavformat/flvenc.c
index 98c4170311..627bb6d3ab 100644
--- a/libavformat/flvenc.c
+++ b/libavformat/flvenc.c
@@ -60,10 +60,13 @@ typedef struct FLVContext {
     int64_t duration_offset;
     int64_t filesize_offset;
     int64_t duration;
-    int delay; ///< first dts delay for AVC
-    int64_t last_ts;
 } FLVContext;
 
+typedef struct FLVStreamContext {
+    int     delay;      ///< first dts delay for each stream (needed for AVC & Speex)
+    int64_t last_ts;    ///< last timestamp for each stream
+} FLVStreamContext;
+
 static int get_audio_flags(AVCodecContext *enc){
     int flags = (enc->bits_per_coded_sample == 16) ? FLV_SAMPLESSIZE_16BIT : FLV_SAMPLESSIZE_8BIT;
 
@@ -182,6 +185,7 @@ static int flv_write_header(AVFormatContext *s)
 
     for(i=0; i<s->nb_streams; i++){
         AVCodecContext *enc = s->streams[i]->codec;
+        FLVStreamContext *sc;
         if (enc->codec_type == AVMEDIA_TYPE_VIDEO) {
             if (s->streams[i]->r_frame_rate.den && s->streams[i]->r_frame_rate.num) {
                 framerate = av_q2d(s->streams[i]->r_frame_rate);
@@ -199,6 +203,12 @@ static int flv_write_header(AVFormatContext *s)
                 return -1;
         }
         av_set_pts_info(s->streams[i], 32, 1, 1000); /* 32 bit pts in ms */
+
+        sc = av_mallocz(sizeof(FLVStreamContext));
+        if (!sc)
+            return AVERROR(ENOMEM);
+        s->streams[i]->priv_data = sc;
+        sc->last_ts = -1;
     }
     avio_write(pb, "FLV", 3);
     avio_w8(pb,1);
@@ -218,8 +228,6 @@ static int flv_write_header(AVFormatContext *s)
         }
     }
 
-    flv->last_ts = -1;
-
     /* write meta_tag */
     avio_w8(pb, 18);         // tag type META
     metadata_size_pos= avio_tell(pb);
@@ -361,9 +369,10 @@ static int flv_write_trailer(AVFormatContext *s)
     /* Add EOS tag */
     for (i = 0; i < s->nb_streams; i++) {
         AVCodecContext *enc = s->streams[i]->codec;
+        FLVStreamContext *sc = s->streams[i]->priv_data;
         if (enc->codec_type == AVMEDIA_TYPE_VIDEO &&
                 (enc->codec_id == CODEC_ID_H264 || enc->codec_id == CODEC_ID_MPEG4)) {
-            put_avc_eos_tag(pb, flv->last_ts);
+            put_avc_eos_tag(pb, sc->last_ts);
         }
     }
 
@@ -384,6 +393,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
     AVIOContext *pb = s->pb;
     AVCodecContext *enc = s->streams[pkt->stream_index]->codec;
     FLVContext *flv = s->priv_data;
+    FLVStreamContext *sc = s->streams[pkt->stream_index]->priv_data;
     unsigned ts;
     int size= pkt->size;
     uint8_t *data= NULL;
@@ -434,20 +444,20 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
         av_log(s, AV_LOG_ERROR, "malformated aac bitstream, use -absf aac_adtstoasc\n");
         return -1;
     }
-    if (!flv->delay && pkt->dts < 0)
-        flv->delay = -pkt->dts;
+    if (!sc->delay && pkt->dts < 0)
+        sc->delay = -pkt->dts;
 
-    ts = pkt->dts + flv->delay; // add delay to force positive dts
+    ts = pkt->dts + sc->delay; // add delay to force positive dts
 
     /* check Speex packet duration */
-    if (enc->codec_id == CODEC_ID_SPEEX && ts - flv->last_ts > 160) {
+    if (enc->codec_id == CODEC_ID_SPEEX && ts - sc->last_ts > 160) {
         av_log(s, AV_LOG_WARNING, "Warning: Speex stream has more than "
                                   "8 frames per packet. Adobe Flash "
                                   "Player cannot handle this!\n");
     }
 
-    if (flv->last_ts < ts)
-        flv->last_ts = ts;
+    if (sc->last_ts < ts)
+        sc->last_ts = ts;
 
     avio_wb24(pb,size + flags_size);
     avio_wb24(pb,ts);
@@ -471,7 +481,7 @@ static int flv_write_packet(AVFormatContext *s, AVPacket *pkt)
     avio_write(pb, data ? data : pkt->data, size);
 
     avio_wb32(pb,size+flags_size+11); // previous tag size
-    flv->duration = FFMAX(flv->duration, pkt->pts + flv->delay + pkt->duration);
+    flv->duration = FFMAX(flv->duration, pkt->pts + sc->delay + pkt->duration);
 
     avio_flush(pb);
 
diff --git a/libavformat/mov.c b/libavformat/mov.c
index 3c8bbc9ca1..198f3cd938 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -35,6 +35,7 @@
 #include "riff.h"
 #include "isom.h"
 #include "libavcodec/get_bits.h"
+#include "id3v1.h"
 
 #if CONFIG_ZLIB
 #include <zlib.h>
@@ -99,31 +100,48 @@ static int mov_metadata_track_or_disc_number(MOVContext *c, AVIOContext *pb,
     return 0;
 }
 
-static int mov_metadata_int8(MOVContext *c, AVIOContext *pb,
-                             unsigned len, const char *key)
+static int mov_metadata_int8_bypass_padding(MOVContext *c, AVIOContext *pb,
+                                            unsigned len, const char *key)
 {
-  char buf[16];
+    char buf[16];
+
+    /* bypass padding bytes */
+    avio_r8(pb);
+    avio_r8(pb);
+    avio_r8(pb);
+
+    snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
+    av_dict_set(&c->fc->metadata, key, buf, 0);
+
+    return 0;
+}
 
-  /* bypass padding bytes */
-  avio_r8(pb);
-  avio_r8(pb);
-  avio_r8(pb);
+static int mov_metadata_int8_no_padding(MOVContext *c, AVIOContext *pb,
+                                        unsigned len, const char *key)
+{
+    char buf[16];
 
-  snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
-  av_dict_set(&c->fc->metadata, key, buf, 0);
+    snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
+    av_dict_set(&c->fc->metadata, key, buf, 0);
 
-  return 0;
+    return 0;
 }
 
-static int mov_metadata_stik(MOVContext *c, AVIOContext *pb,
+static int mov_metadata_gnre(MOVContext *c, AVIOContext *pb,
                              unsigned len, const char *key)
 {
-  char buf[16];
+    short genre;
+    char buf[20];
 
-  snprintf(buf, sizeof(buf), "%hu", avio_r8(pb));
-  av_dict_set(&c->fc->metadata, key, buf, 0);
+    avio_r8(pb); // unknown
 
-  return 0;
+    genre = avio_r8(pb);
+    if (genre < 1 || genre > ID3v1_GENRE_MAX)
+        return 0;
+    snprintf(buf, sizeof(buf), "%s", ff_id3v1_genre_str[genre-1]);
+    av_dict_set(&c->fc->metadata, key, buf, 0);
+
+    return 0;
 }
 
 static const uint32_t mac_to_unicode[128] = {
@@ -189,6 +207,8 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     case MKTAG(0xa9,'a','l','b'): key = "album";     break;
     case MKTAG(0xa9,'d','a','y'): key = "date";      break;
     case MKTAG(0xa9,'g','e','n'): key = "genre";     break;
+    case MKTAG( 'g','n','r','e'): key = "genre";
+        parse = mov_metadata_gnre; break;
     case MKTAG(0xa9,'t','o','o'):
     case MKTAG(0xa9,'s','w','r'): key = "encoder";   break;
     case MKTAG(0xa9,'e','n','c'): key = "encoder";   break;
@@ -202,11 +222,15 @@ static int mov_read_udta_string(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     case MKTAG( 'd','i','s','k'): key = "disc";
         parse = mov_metadata_track_or_disc_number; break;
     case MKTAG( 't','v','e','s'): key = "episode_sort";
-        parse = mov_metadata_int8; break;
+        parse = mov_metadata_int8_bypass_padding; break;
     case MKTAG( 't','v','s','n'): key = "season_number";
-        parse = mov_metadata_int8; break;
+        parse = mov_metadata_int8_bypass_padding; break;
     case MKTAG( 's','t','i','k'): key = "media_type";
-        parse = mov_metadata_stik; break;
+        parse = mov_metadata_int8_no_padding; break;
+    case MKTAG( 'h','d','v','d'): key = "hd_video";
+        parse = mov_metadata_int8_no_padding; break;
+    case MKTAG( 'p','g','a','p'): key = "gapless_playback";
+        parse = mov_metadata_int8_no_padding; break;
     }
 
     if (c->itunes_metadata && atom.size > 8) {
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 13c2f0fd48..5d0a9dfaeb 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -859,6 +859,29 @@ const AVPixFmtDescriptor av_pix_fmt_descriptors[PIX_FMT_NB] = {
         },
         .flags = PIX_FMT_BE,
     },
+    [PIX_FMT_YUV422P9LE] = {
+        .name = "yuv422p9le",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,8},        /* Y */
+            {1,1,1,0,8},        /* U */
+            {2,1,1,0,8},        /* V */
+        },
+    },
+    [PIX_FMT_YUV422P9BE] = {
+        .name = "yuv422p9be",
+        .nb_components= 3,
+        .log2_chroma_w= 1,
+        .log2_chroma_h= 0,
+        .comp = {
+            {0,1,1,0,8},        /* Y */
+            {1,1,1,0,8},        /* U */
+            {2,1,1,0,8},        /* V */
+        },
+        .flags = PIX_FMT_BE,
+    },
     [PIX_FMT_YUV422P10LE] = {
         .name = "yuv422p10le",
         .nb_components= 3,
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index f23d0c5054..25c9ef99ff 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -149,12 +149,15 @@ enum PixelFormat {
     PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
     PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
     PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
 
     PIX_FMT_RGBA64BE,  ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
     PIX_FMT_RGBA64LE,  ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
     PIX_FMT_BGRA64BE,  ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
     PIX_FMT_BGRA64LE,  ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
     PIX_FMT_GBR24P,    ///< planar GBR, 24bpp, 8G, 8B, 8R.
+
     PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 };
 
@@ -182,6 +185,7 @@ enum PixelFormat {
 #define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
 
 #define PIX_FMT_YUV420P9  PIX_FMT_NE(YUV420P9BE , YUV420P9LE)
+#define PIX_FMT_YUV422P9  PIX_FMT_NE(YUV422P9BE , YUV422P9LE)
 #define PIX_FMT_YUV444P9  PIX_FMT_NE(YUV444P9BE , YUV444P9LE)
 #define PIX_FMT_YUV420P10 PIX_FMT_NE(YUV420P10BE, YUV420P10LE)
 #define PIX_FMT_YUV422P10 PIX_FMT_NE(YUV422P10BE, YUV422P10LE)
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index fc4781b532..c486b06425 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -536,6 +536,18 @@
 %endif
 %endmacro
 
+%macro SPLATD_MMX 1
+    punpckldq  %1, %1
+%endmacro
+
+%macro SPLATD_SSE 1
+    shufps  %1, %1, 0
+%endmacro
+
+%macro SPLATD_SSE2 1
+    pshufd  %1, %1, 0
+%endmacro
+
 %macro CLIPW 3 ;(dst, min, max)
     pmaxsw %1, %2
     pminsw %1, %3
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 31346a3521..29b7b4c212 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -2843,6 +2843,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
 #if HAVE_BIGENDIAN
         case PIX_FMT_YUV444P9LE:
+        case PIX_FMT_YUV422P9LE:
         case PIX_FMT_YUV420P9LE:
         case PIX_FMT_YUV422P10LE:
         case PIX_FMT_YUV420P10LE:
@@ -2852,6 +2853,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
 #else
         case PIX_FMT_YUV444P9BE:
+        case PIX_FMT_YUV422P9BE:
         case PIX_FMT_YUV420P9BE:
         case PIX_FMT_YUV444P10BE:
         case PIX_FMT_YUV422P10BE:
@@ -2912,6 +2914,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     switch (srcFormat) {
 #if HAVE_BIGENDIAN
     case PIX_FMT_YUV444P9LE:
+    case PIX_FMT_YUV422P9LE:
     case PIX_FMT_YUV420P9LE:
     case PIX_FMT_YUV422P10LE:
     case PIX_FMT_YUV420P10LE:
@@ -2922,6 +2925,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
 #else
     case PIX_FMT_YUV444P9BE:
+    case PIX_FMT_YUV422P9BE:
     case PIX_FMT_YUV420P9BE:
     case PIX_FMT_YUV444P10BE:
     case PIX_FMT_YUV422P10BE:
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 0e62b38bcc..c8e8685f72 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -547,6 +547,8 @@ const char *sws_format_name(enum PixelFormat format);
 #define isNBPS(x)       (           \
            (x)==PIX_FMT_YUV420P9LE  \
         || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV422P9LE  \
+        || (x)==PIX_FMT_YUV422P9BE  \
         || (x)==PIX_FMT_YUV444P9BE  \
         || (x)==PIX_FMT_YUV444P9LE  \
         || (x)==PIX_FMT_YUV422P10BE \
@@ -574,6 +576,7 @@ const char *sws_format_name(enum PixelFormat format);
 #define isPlanarYUV(x)  (           \
         isPlanar8YUV(x)             \
         || (x)==PIX_FMT_YUV420P9LE  \
+        || (x)==PIX_FMT_YUV422P9LE  \
         || (x)==PIX_FMT_YUV444P9LE  \
         || (x)==PIX_FMT_YUV420P10LE \
         || (x)==PIX_FMT_YUV422P10LE \
@@ -583,6 +586,7 @@ const char *sws_format_name(enum PixelFormat format);
         || (x)==PIX_FMT_YUV422P16LE \
         || (x)==PIX_FMT_YUV444P16LE \
         || (x)==PIX_FMT_YUV420P9BE  \
+        || (x)==PIX_FMT_YUV422P9BE  \
         || (x)==PIX_FMT_YUV444P9BE  \
         || (x)==PIX_FMT_YUV420P10BE \
         || (x)==PIX_FMT_YUV422P10BE \
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 6c2a8de120..673dae0adc 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -136,6 +136,8 @@ const static FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_YUV420P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV420P10BE] = { 1 , 1 },
     [PIX_FMT_YUV420P10LE] = { 1 , 1 },
+    [PIX_FMT_YUV422P9BE]  = { 1 , 1 },
+    [PIX_FMT_YUV422P9LE]  = { 1 , 1 },
     [PIX_FMT_YUV422P10BE] = { 1 , 1 },
     [PIX_FMT_YUV422P10LE] = { 1 , 1 },
     [PIX_FMT_YUV444P9BE]  = { 1 , 1 },
@@ -280,15 +282,18 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
                 if (flags & SWS_BICUBIC) {
                     int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] :   0) * (1<<24);
                     int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24);
-                    int64_t dd = ( d*d)>>30;
-                    int64_t ddd= (dd*d)>>30;
 
-                    if      (d < 1LL<<30)
-                        coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
-                    else if (d < 1LL<<31)
-                        coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
-                    else
-                        coeff=0.0;
+                    if (d >= 1LL<<31) {
+                        coeff = 0.0;
+                    } else {
+                        int64_t dd  = (d  * d) >> 30;
+                        int64_t ddd = (dd * d) >> 30;
+
+                        if (d < 1LL<<30)
+                            coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
+                        else
+                            coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
+                    }
                     coeff *= fone>>(30+24);
                 }
 /*                else if (flags & SWS_X) {
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 36182a5ea9..eb478e2186 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -790,8 +790,8 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int
         y_table32 = c->yuvTable;
         yb = -(384<<16) - oy;
         for (i = 0; i < 1024; i++) {
-            uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
-            y_table32[i     ] = (yval << rbase) + (needAlpha ? 0 : (255 << abase));
+            unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table32[i     ] = (yval << rbase) + (needAlpha ? 0 : (255u << abase));
             y_table32[i+1024] = yval << gbase;
             y_table32[i+2048] = yval << bbase;
             yb += cy;
diff --git a/tests/ref/lavfi/pixdesc b/tests/ref/lavfi/pixdesc
index 8bf3801524..0b18dfb7da 100644
--- a/tests/ref/lavfi/pixdesc
+++ b/tests/ref/lavfi/pixdesc
@@ -42,6 +42,8 @@ yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
 yuv422p16be         4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le         f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be          29b71579946940a8c00fa844c9dff507
+yuv422p9le          062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be         e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_copy b/tests/ref/lavfi/pixfmts_copy
index 8bf3801524..0b18dfb7da 100644
--- a/tests/ref/lavfi/pixfmts_copy
+++ b/tests/ref/lavfi/pixfmts_copy
@@ -42,6 +42,8 @@ yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
 yuv422p16be         4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le         f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be          29b71579946940a8c00fa844c9dff507
+yuv422p9le          062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be         e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_null b/tests/ref/lavfi/pixfmts_null
index 8bf3801524..0b18dfb7da 100644
--- a/tests/ref/lavfi/pixfmts_null
+++ b/tests/ref/lavfi/pixfmts_null
@@ -42,6 +42,8 @@ yuv422p10be         bdc13b630fd668b34c6fe1aae28dfc71
 yuv422p10le         d0607c260a45c973e6639f4e449730ad
 yuv422p16be         4e9b3b3467aeebb6a528cee5966800ed
 yuv422p16le         f87c81bf16916b64d201359be0b4b6f4
+yuv422p9be          29b71579946940a8c00fa844c9dff507
+yuv422p9le          062b7f9cbb972bf36b5bdb1a7623701a
 yuv440p             5a064afe2b453bb52cdb3f176b1aa1cf
 yuv444p             0a98447b78fd476aa39686da6a74fa2e
 yuv444p10be         e65cbae7e4f1892c23defbc8e8052cf6
diff --git a/tests/ref/lavfi/pixfmts_scale b/tests/ref/lavfi/pixfmts_scale
index eee76706d3..e17544dbe6 100644
--- a/tests/ref/lavfi/pixfmts_scale
+++ b/tests/ref/lavfi/pixfmts_scale
@@ -42,6 +42,8 @@ yuv422p10be         cea7ca6b0e66d6f29539885896c88603
 yuv422p10le         a10c4a5837547716f13cd61918b145f9
 yuv422p16be         285993ee0c0f4f8e511ee46f93c5f38c
 yuv422p16le         61bfcee8e54465f760164f5a75d40b5e
+yuv422p9be          82494823944912f73cebc58ad2979bbd
+yuv422p9le          fc69c8a21f473916a4b4225636b97e06
 yuv440p             461503fdb9b90451020aa3b25ddf041c
 yuv444p             81b2eba962d12e8d64f003ac56f6faf2
 yuv444p10be         e9d3c8e744b8b0d8187ca092fa203fc9
diff --git a/tests/ref/lavfi/pixfmts_vflip b/tests/ref/lavfi/pixfmts_vflip
index 763e0bb069..c8301e380f 100644
--- a/tests/ref/lavfi/pixfmts_vflip
+++ b/tests/ref/lavfi/pixfmts_vflip
@@ -42,6 +42,8 @@ yuv422p10be         588fe319b96513c32e21d3e32b45447f
 yuv422p10le         11b57f2bd9661024153f3973b9090cdb
 yuv422p16be         c092d083548c2a144c372a98c46875c7
 yuv422p16le         c071b9397a416d51cbe339345cbcba84
+yuv422p9be          7c6f1e140b3999ee7d923854e507752a
+yuv422p9le          51f10d79c07989060dd06e767e6d7d60
 yuv440p             876385e96165acf51271b20e5d85a416
 yuv444p             9c3c667d1613b72d15bc6d851c5eb8f7
 yuv444p10be         944a4997c4edb3a8dd0f0493cfd5a1fd