From 96d30c34951b42479f4d1a4210e8a36347c4d653 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 13:39:37 -0400
Subject: vp9: disable all pmulhrsw in 8/16 iadst x86 optimizations.

They all overflow in various samples that are considered valid input.
---
 libavcodec/x86/vp9itxfm.asm | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 9cf0d78fab..a08e1ff313 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -868,7 +868,8 @@ VP9_IDCT_IDCT_8x8_ADD_XMM avx, 13
 
     ; m6=out0, m5=out1, m4=t2, m3=t3, m7=t6, m0=t7, m2=out6, m1=out7
 
-%if cpuflag(ssse3)
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
     SUMSUB_BA                w,  3,  4,  2
     SUMSUB_BA                w,  0,  7,  2
     pmulhrsw                m3, W_11585x2_REG
@@ -1647,7 +1648,8 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_RND_SH_SUMSUB_BA     4,  7,  0,  2,  1, [pd_8192]
     PSIGNW                  m4, [pw_m1]                     ; m4=out13[w], m7=t15[w]
 
-%if cpuflag(ssse3)
+    ; unfortunately, the code below overflows in some cases
+%if 0; cpuflag(ssse3)
     SUMSUB_BA                w,  7,  6,  1
     pmulhrsw                m7, [pw_m11585x2]               ; m7=out5[w]
     pmulhrsw                m6, [pw_11585x2]                ; m6=out10[w]
-- 
cgit v1.2.3


From d32d0593f172095dd1ab7660e98560482ed780cb Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 14:15:27 -0400
Subject: vp9: disable more pmulhrsw optimizations in idct16/32.

For idct16, this applies only when it is called from an adst16x16
variant, so the impact is minor. For idct32, it applies to all
variants, so the impact is relatively major.
---
 libavcodec/x86/vp9itxfm.asm | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index a08e1ff313..d9fb36f710 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -997,7 +997,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
 ;    SUMSUB_BA            w, x13, x14, 7       ; t6, t9
 ;    SUMSUB_BA            w, x15, x12, 7       ; t7, t8
 
-%macro VP9_IDCT16_1D_START 5 ; src, nnzc, stride, scratch, scratch_stride
+%macro VP9_IDCT16_1D_START 6 ; src, nnzc, stride, scratch, scratch_stride, is_iadst
 %if %2 <= 4
     mova                m3, [%1+ 1*%3]      ; IN(1)
     mova                m0, [%1+ 3*%3]      ; IN(3)
@@ -1090,7 +1090,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
     ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
     ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
 
-%if cpuflag(ssse3)
+%if cpuflag(ssse3) && %6 == 0
     SUMSUB_BA            w,  2,  5, 7
     SUMSUB_BA            w,  3,  4, 7
     pmulhrsw            m5, [pw_11585x2]    ; t10
@@ -1164,7 +1164,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
     SUMSUB_BA            w,  4,  6, 2       ; t4,  t5
     SUMSUB_BA            w,  7,  5, 2       ; t7,  t6
 
-%if cpuflag(ssse3)
+%if cpuflag(ssse3) && %6 == 0
     SUMSUB_BA            w,  6,  5, 2
     pmulhrsw            m5, [pw_11585x2]                              ; t5
     pmulhrsw            m6, [pw_11585x2]                              ; t6
@@ -1184,7 +1184,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
     mova                m3, [%1+ 8*%3]      ; IN(8)
 
     ; from 3 stages back
-%if cpuflag(ssse3)
+%if cpuflag(ssse3) && %6 == 0
     SUMSUB_BA            w,  3,  2, 5
     pmulhrsw            m3, [pw_11585x2]    ; t0
     pmulhrsw            m2, [pw_11585x2]    ; t1
@@ -1249,9 +1249,9 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
 %endif
 %endmacro
 
-%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
+%macro VP9_IDCT16_1D 2-4 16, 1 ; src, pass, nnzc, is_iadst
 %if %2 == 1
-    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq, 16, %4
 
 %if ARCH_X86_64
     ; backup a different register
@@ -1318,7 +1318,7 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
     mova     [tmpq+15*16], m7
 %endif
 %else ; %2 == 2
-    VP9_IDCT16_1D_START %1, %3, 32, %1, 32
+    VP9_IDCT16_1D_START %1, %3, 32, %1, 32, %4
 
 %if cpuflag(ssse3)
 %define ROUND_REG [pw_512]
@@ -1468,12 +1468,12 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
 %if cpuflag(ssse3)
 .idct8x8:
     mov               tmpq, rsp
-    VP9_IDCT16_1D   blockq, 1, 8
+    VP9_IDCT16_1D   blockq, 1, 8, 0
 
     mov               cntd, 2
     mov           dst_bakq, dstq
 .loop2_8x8:
-    VP9_IDCT16_1D     tmpq, 2, 8
+    VP9_IDCT16_1D     tmpq, 2, 8, 0
     lea               dstq, [dst_bakq+8]
     add               tmpq, 16
     dec               cntd
@@ -1489,7 +1489,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
     mov               cntd, 2
     mov               tmpq, rsp
 .loop1_full:
-    VP9_IDCT16_1D   blockq, 1
+    VP9_IDCT16_1D   blockq, 1, 16, 0
     add             blockq, 16
     add               tmpq, 256
     dec               cntd
@@ -1500,7 +1500,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
     mov               tmpq, rsp
     mov           dst_bakq, dstq
 .loop2_full:
-    VP9_IDCT16_1D     tmpq, 2
+    VP9_IDCT16_1D     tmpq, 2, 16, 0
     lea               dstq, [dst_bakq+8]
     add               tmpq, 16
     dec               cntd
@@ -1901,7 +1901,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
 %macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
 %assign %%str 16*%2*%2
     ; first do t0-15, this can be done identical to idct16x16
-    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq, 2*%%str, 1
 
     ; store everything on stack to make space available for t16-31
     ; we store interleaved with the output of the second half (t16-31)
@@ -2132,7 +2132,7 @@ IADST16_FN iadst, IADST16, iadst, IADST16, avx
     ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
     ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
 
-%if cpuflag(ssse3)
+%if 0; cpuflag(ssse3)
 %if ARCH_X86_64
     SUMSUB_BA             w,  4,  7,  8
     SUMSUB_BA             w,  5,  1,  8
-- 
cgit v1.2.3


From e12188e1438b09fb1304cadd404dd3aabbb660ac Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 16:37:49 -0400
Subject: vp9: fix segmentation map referencing upon framesize change.

---
 libavcodec/vp9.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index db3f541c5e..a3cecf209a 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -153,6 +153,7 @@ typedef struct VP9Context {
         uint8_t temporal;
         uint8_t absolute_vals;
         uint8_t update_map;
+        uint8_t ignore_refmap;
         struct {
             uint8_t q_enabled;
             uint8_t lf_enabled;
@@ -724,6 +725,7 @@ static int decode_frame_header(AVCodecContext *ctx,
                      s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
 
     /* segmentation header info */
+    s->segmentation.ignore_refmap = 0;
     if ((s->segmentation.enabled = get_bits1(&s->gb))) {
         if ((s->segmentation.update_map = get_bits1(&s->gb))) {
             for (i = 0; i < 7; i++)
@@ -738,10 +740,11 @@ static int decode_frame_header(AVCodecContext *ctx,
         if ((!s->segmentation.update_map || s->segmentation.temporal) &&
             (w != s->frames[CUR_FRAME].tf.f->width ||
              h != s->frames[CUR_FRAME].tf.f->height)) {
-            av_log(ctx, AV_LOG_ERROR,
+            av_log(ctx, AV_LOG_WARNING,
                    "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
                    s->segmentation.temporal, s->segmentation.update_map);
-            return AVERROR_INVALIDDATA;
+                s->segmentation.ignore_refmap = 1;
+            //return AVERROR_INVALIDDATA;
         }
 
         if (get_bits1(&s->gb)) {
@@ -1457,7 +1460,7 @@ static void decode_mode(AVCodecContext *ctx)
                 vp56_rac_get_prob_branchy(&s->c,
                     s->prob.segpred[s->above_segpred_ctx[col] +
                                     s->left_segpred_ctx[row7]]))) {
-        if (!s->errorres) {
+        if (!s->errorres && !s->segmentation.ignore_refmap) {
             int pred = 8, x;
             uint8_t *refsegmap = s->frames[REF_FRAME_SEGMAP].segmentation_map;
 
-- 
cgit v1.2.3


From dc96c0f9fc96bf4167633befc074394062793322 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 16:38:53 -0400
Subject: vp9: read all 4x4 blocks in sub8x8 blocks individually with
 scalability.

---
 libavcodec/vp9.c             | 4 ++++
 libavcodec/vp9_mc_template.c | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index a3cecf209a..6982eefd47 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -2833,6 +2833,7 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
     mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, bw, bh, w, h, bytesperpixel, \
                      s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
+#define SCALED 1
 #define FN(x) x##_scaled_8bpp
 #define BYTES_PER_PIXEL 1
 #include "vp9_mc_template.c"
@@ -2845,6 +2846,7 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
 #undef mc_chroma_dir
 #undef FN
 #undef BYTES_PER_PIXEL
+#undef SCALED
 
 static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
                                               uint8_t *dst, ptrdiff_t dst_stride,
@@ -2930,6 +2932,7 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
                       row, col, mv, bw, bh, w, h, i) \
     mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                        row, col, mv, bw, bh, w, h, bytesperpixel)
+#define SCALED 0
 #define FN(x) x##_8bpp
 #define BYTES_PER_PIXEL 1
 #include "vp9_mc_template.c"
@@ -2942,6 +2945,7 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
 #undef mc_chroma_dir_dir
 #undef FN
 #undef BYTES_PER_PIXEL
+#undef SCALED
 
 static av_always_inline void inter_recon(AVCodecContext *ctx, int bytesperpixel)
 {
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
index 250e0a691c..5e6ee870b8 100644
--- a/libavcodec/vp9_mc_template.c
+++ b/libavcodec/vp9_mc_template.c
@@ -53,6 +53,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
     if (b->bs > BS_8x8) {
         VP56mv uvmv;
 
+#if SCALED == 0
         if (b->bs == BS_8x4) {
             mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
@@ -201,7 +202,9 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   &b->mv[1][1], 4, 8 >> s->ss_v, w2, h2, 1);
                 }
             }
-        } else {
+        } else
+#endif
+        {
             av_assert2(b->bs == BS_4x4);
 
             // FIXME if two horizontally adjacent blocks have the same MV,
-- 
cgit v1.2.3


From 1e4a77d0018e9d083423dc289b302d02eaeabe65 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 17:05:18 -0400
Subject: vp9: apply mv scaling workaround only when subsampling is enabled.

---
 libavcodec/vp9.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 6982eefd47..c90059e87f 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -2782,13 +2782,23 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
                                               int bw, int bh, int w, int h, int bytesperpixel,
                                               const uint16_t *scale, const uint8_t *step)
 {
-    // BUG https://code.google.com/p/webm/issues/detail?id=820
-    int mx = scale_mv(mv->x << !s->ss_h, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
-    int my = scale_mv(mv->y << !s->ss_v, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
-#undef scale_mv
+    int mx, my;
     int refbw_m1, refbh_m1;
     int th;
 
+    if (s->ss_h) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        mx = scale_mv(mv->x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+    } else {
+        mx = scale_mv(mv->x << 1, 0) + scale_mv(x * 16, 0);
+    }
+    if (s->ss_v) {
+        // BUG https://code.google.com/p/webm/issues/detail?id=820
+        my = scale_mv(mv->y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+    } else {
+        my = scale_mv(mv->y << 1, 1) + scale_mv(y * 16, 1);
+    }
+#undef scale_mv
     y = my >> 4;
     x = mx >> 4;
     ref_u += y * src_stride_u + x * bytesperpixel;
-- 
cgit v1.2.3


From 96a58a8daa4d70e19d635b3823edbb8945116358 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 20:13:43 -0400
Subject: vp9: set skip flag if the block had no coded coefficients.

This reproduces libvpx behaviour. It seems like it originally only
targeted loopfilter behaviour, but it unfortunately also affects the
contexting of following blocks, and thus bitstream sync.
---
 libavcodec/vp9.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index c90059e87f..1b4fbd2cf3 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -2298,7 +2298,7 @@ static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
                                    nnz, scan, nb, band_counts, qmul);
 }
 
-static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
+static av_always_inline int decode_coeffs(AVCodecContext *ctx, int is8bitsperpixel)
 {
     VP9Context *s = ctx->priv_data;
     VP9Block *b = s->b;
@@ -2327,6 +2327,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
     const int16_t *y_band_counts = band_counts[b->tx];
     const int16_t *uv_band_counts = band_counts[b->uvtx];
     int bytesperpixel = is8bitsperpixel ? 1 : 2;
+    int total_coeff = 0;
 
 #define MERGE(la, end, step, rd) \
     for (n = 0; n < end; n += step) \
@@ -2346,6 +2347,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
                                      c, e, p, a[x] + l[y], yscans[txtp], \
                                      ynbs[txtp], y_band_counts, qmul[0]); \
             a[x] = l[y] = !!res; \
+            total_coeff |= !!res; \
             if (step >= 4) { \
                 AV_WN16A(&s->eob[n], res); \
             } else { \
@@ -2419,6 +2421,7 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
                                      16 * step * step, c, e, p, a[x] + l[y], \
                                      uvscan, uvnb, uv_band_counts, qmul[1]); \
             a[x] = l[y] = !!res; \
+            total_coeff |= !!res; \
             if (step >= 4) { \
                 AV_WN16A(&s->uveob[pl][n], res); \
             } else { \
@@ -2458,16 +2461,18 @@ static av_always_inline void decode_coeffs(AVCodecContext *ctx, int is8bitsperpi
             break;
         }
     }
+
+    return total_coeff;
 }
 
-static void decode_coeffs_8bpp(AVCodecContext *ctx)
+static int decode_coeffs_8bpp(AVCodecContext *ctx)
 {
-    decode_coeffs(ctx, 1);
+    return decode_coeffs(ctx, 1);
 }
 
-static void decode_coeffs_16bpp(AVCodecContext *ctx)
+static int decode_coeffs_16bpp(AVCodecContext *ctx)
 {
-    decode_coeffs(ctx, 0);
+    return decode_coeffs(ctx, 0);
 }
 
 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
@@ -3180,10 +3185,17 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
                            (s->ss_v && h4 * 2 == (1 << b->tx)));
 
         if (!b->skip) {
+            int has_coeffs;
+
             if (bytesperpixel == 1) {
-                decode_coeffs_8bpp(ctx);
+                has_coeffs = decode_coeffs_8bpp(ctx);
             } else {
-                decode_coeffs_16bpp(ctx);
+                has_coeffs = decode_coeffs_16bpp(ctx);
+            }
+            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
+                b->skip = 1;
+                memset(&s->above_skip_ctx[col], 1, w4);
+                memset(&s->left_skip_ctx[s->row7], 1, h4);
             }
         } else {
             int row7 = s->row7;
-- 
cgit v1.2.3


From 29045fbfd4c77e25909b9227d48d2252b8c2c5d2 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 22:13:46 -0400
Subject: vp9: clip motion vectors in the same way as libvpx does.

The practical effect of this is that the scaling will wrongly not be
applied to the interpolation edge (the 3/4 constants in this patch).
In other words, we clip to the pre-scaling interpolation, even though
these should be clipped post-scaling. The resulting out-of-frame MVs
are thus automatically clipped within the visible portion of the frame,
which is probably not the intention, but is unfortunately what libvpx
does, so we need to copy that behaviour.
---
 libavcodec/vp9.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 1b4fbd2cf3..bc2dc0d412 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -2740,18 +2740,23 @@ static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func sm
                                             uint8_t *dst, ptrdiff_t dst_stride,
                                             const uint8_t *ref, ptrdiff_t ref_stride,
                                             ThreadFrame *ref_frame,
-                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
                                             int bw, int bh, int w, int h, int bytesperpixel,
                                             const uint16_t *scale, const uint8_t *step)
 {
 #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
+    int mx, my;
+    int refbw_m1, refbh_m1;
+    int th;
+    VP56mv mv;
+
+    mv.x = av_clip(in_mv->x, -(x + bw + 4) << 3, (s->cols * 8 - x + 3) << 3);
+    mv.y = av_clip(in_mv->y, -(y + bh + 4) << 3, (s->rows * 8 - y + 3) << 3);
     // BUG libvpx seems to scale the two components separately. This introduces
     // rounding errors but we have to reproduce them to be exactly compatible
     // with the output from libvpx...
-    int mx = scale_mv(mv->x * 2, 0) + scale_mv(x * 16, 0);
-    int my = scale_mv(mv->y * 2, 1) + scale_mv(y * 16, 1);
-    int refbw_m1, refbh_m1;
-    int th;
+    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
+    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
 
     y = my >> 4;
     x = mx >> 4;
@@ -2783,25 +2788,30 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
                                               const uint8_t *ref_u, ptrdiff_t src_stride_u,
                                               const uint8_t *ref_v, ptrdiff_t src_stride_v,
                                               ThreadFrame *ref_frame,
-                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
+                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
                                               int bw, int bh, int w, int h, int bytesperpixel,
                                               const uint16_t *scale, const uint8_t *step)
 {
     int mx, my;
     int refbw_m1, refbh_m1;
     int th;
+    VP56mv mv;
 
     if (s->ss_h) {
         // BUG https://code.google.com/p/webm/issues/detail?id=820
-        mx = scale_mv(mv->x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
+        mv.x = av_clip(in_mv->x, -(x + bw + 4) << 4, (s->cols * 4 - x + 3) << 4);
+        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
     } else {
-        mx = scale_mv(mv->x << 1, 0) + scale_mv(x * 16, 0);
+        mv.x = av_clip(in_mv->x, -(x + bw + 4) << 3, (s->cols * 8 - x + 3) << 3);
+        mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
     }
     if (s->ss_v) {
         // BUG https://code.google.com/p/webm/issues/detail?id=820
-        my = scale_mv(mv->y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
+        mv.y = av_clip(in_mv->y, -(y + bh + 4) << 4, (s->rows * 4 - y + 3) << 4);
+        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
     } else {
-        my = scale_mv(mv->y << 1, 1) + scale_mv(y * 16, 1);
+        mv.y = av_clip(in_mv->y, -(y + bh + 4) << 3, (s->rows * 8 - y + 3) << 3);
+        my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
     }
 #undef scale_mv
     y = my >> 4;
-- 
cgit v1.2.3


From 3e634e3e98216854e7d4083d6b1f89cb79d39f9e Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 22:40:26 -0400
Subject: vp9: extend loopfilter workaround for vp9 h/v mix-up to work for 422.

---
 libavcodec/vp9.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index bc2dc0d412..8e0d598127 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -3108,8 +3108,12 @@ static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_
             }
             if (!ss_h)
                 mask[0][y][3] |= m_col;
-            if (!ss_v)
-                mask[1][y][3] |= m_col;
+            if (!ss_v) {
+                if (ss_h && (col_end & 1))
+                    mask[1][y][3] |= (t << (w - 1)) - t;
+                else
+                    mask[1][y][3] |= m_col;
+            }
         }
     } else {
         int y, t = 1 << col_and_7, m_col = (t << w) - t;
-- 
cgit v1.2.3


From 88126916c8199bacdd429a3f9eeb3b717f0d3b9a Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 14 May 2015 22:48:59 -0400
Subject: vp9: fix crash when playing back 444/440 content with width%64<56.

---
 libavcodec/vp9.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 8e0d598127..23cf99b214 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -3314,7 +3314,7 @@ static void decode_b(AVCodecContext *ctx, int row, int col,
         int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
         int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;
 
-        for (n = 1; o < w; n++) {
+        for (n = s->ss_h; o < w; n++) {
             int bw = 64 >> n;
 
             av_assert2(n <= 4);
-- 
cgit v1.2.3


From c81677e9b73c31508b78123dd6e4473fe45b03cd Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 15 May 2015 11:42:39 -0400
Subject: vp9: reset loopfilter mode/ref deltas on keyframe.

---
 libavcodec/vp9.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 23cf99b214..8b1ef67cdf 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -698,6 +698,15 @@ static int decode_frame_header(AVCodecContext *ctx,
     s->framectxid   = c = get_bits(&s->gb, 2);
 
     /* loopfilter header data */
+    if (s->keyframe || s->errorres || s->intraonly) {
+        // reset loopfilter defaults
+        s->lf_delta.ref[0] = 1;
+        s->lf_delta.ref[1] = 0;
+        s->lf_delta.ref[2] = -1;
+        s->lf_delta.ref[3] = -1;
+        s->lf_delta.mode[0] = 0;
+        s->lf_delta.mode[1] = 0;
+    }
     s->filter.level = get_bits(&s->gb, 6);
     sharp = get_bits(&s->gb, 3);
     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
-- 
cgit v1.2.3


From 5de142d316a6f806e8a5faee4599f49cd26af11e Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 15 May 2015 11:43:09 -0400
Subject: vp9: clamp segmented lflvl before applying ref/mode deltas.

---
 libavcodec/vp9.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 8b1ef67cdf..bc88bf9680 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -800,9 +800,9 @@ static int decode_frame_header(AVCodecContext *ctx,
         sh = s->filter.level >= 32;
         if (s->segmentation.feat[i].lf_enabled) {
             if (s->segmentation.absolute_vals)
-                lflvl = s->segmentation.feat[i].lf_val;
+                lflvl = av_clip_uintp2(s->segmentation.feat[i].lf_val, 6);
             else
-                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
+                lflvl = av_clip_uintp2(s->filter.level + s->segmentation.feat[i].lf_val, 6);
         } else {
             lflvl  = s->filter.level;
         }
-- 
cgit v1.2.3


From 32b6d31ef3abd45186cfc69607ffe66837588d53 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 15 May 2015 14:15:46 -0400
Subject: vp9: don't allow compound references if error_resilience is enabled.

libvpx (probably accidentally) clears the bits if error_res is set,
along with keyframe/intraonly. This probably wasn't the intention
(since it's local data), but it's behaviour we have to copy...
---
 libavcodec/vp9.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index bc88bf9680..9540f3d812 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -648,8 +648,9 @@ static int decode_frame_header(AVCodecContext *ctx,
             s->highprecisionmvs = get_bits1(&s->gb);
             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
                                                 get_bits(&s->gb, 2);
-            s->allowcompinter = s->signbias[0] != s->signbias[1] ||
-                                s->signbias[0] != s->signbias[2];
+            s->allowcompinter = !s->errorres &&
+                                (s->signbias[0] != s->signbias[1] ||
+                                 s->signbias[0] != s->signbias[2]);
             if (s->allowcompinter) {
                 if (s->signbias[0] == s->signbias[1]) {
                     s->fixcompref    = 2;
-- 
cgit v1.2.3


From 68c1e9131654576fb3abc13da742c115303a3b54 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 15 May 2015 21:14:08 -0400
Subject: vp9: improve signbias check.

Otherwise it will still scale motion vectors, which leads to corrupted
prediction.
---
 libavcodec/vp9.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 9540f3d812..42c1ec97e3 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -614,11 +614,11 @@ static int decode_frame_header(AVCodecContext *ctx,
         } else {
             s->refreshrefmask = get_bits(&s->gb, 8);
             s->refidx[0]      = get_bits(&s->gb, 3);
-            s->signbias[0]    = get_bits1(&s->gb);
+            s->signbias[0]    = get_bits1(&s->gb) && !s->errorres;
             s->refidx[1]      = get_bits(&s->gb, 3);
-            s->signbias[1]    = get_bits1(&s->gb);
+            s->signbias[1]    = get_bits1(&s->gb) && !s->errorres;
             s->refidx[2]      = get_bits(&s->gb, 3);
-            s->signbias[2]    = get_bits1(&s->gb);
+            s->signbias[2]    = get_bits1(&s->gb) && !s->errorres;
             if (!s->refs[s->refidx[0]].f->data[0] ||
                 !s->refs[s->refidx[1]].f->data[0] ||
                 !s->refs[s->refidx[2]].f->data[0]) {
@@ -648,8 +648,7 @@ static int decode_frame_header(AVCodecContext *ctx,
             s->highprecisionmvs = get_bits1(&s->gb);
             s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
                                                 get_bits(&s->gb, 2);
-            s->allowcompinter = !s->errorres &&
-                                (s->signbias[0] != s->signbias[1] ||
+            s->allowcompinter = (s->signbias[0] != s->signbias[1] ||
                                  s->signbias[0] != s->signbias[2]);
             if (s->allowcompinter) {
                 if (s->signbias[0] == s->signbias[1]) {
-- 
cgit v1.2.3


From ccfb03ecc4f64eee75b934671264e9b6a90ae7bd Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 16 May 2015 08:36:19 -0400
Subject: vp9: fix scaled motion vector clipping for sub8x8 blocks.

To match the obscure clipping bug behaviour in libvpx.
---
 libavcodec/vp9.c             |  28 +++++++-----
 libavcodec/vp9_mc_template.c | 104 ++++++++++++++++++++++---------------------
 2 files changed, 70 insertions(+), 62 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 42c1ec97e3..09c96f162a 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -2750,6 +2750,7 @@ static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func sm
                                             const uint8_t *ref, ptrdiff_t ref_stride,
                                             ThreadFrame *ref_frame,
                                             ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                            int px, int py, int pw, int ph,
                                             int bw, int bh, int w, int h, int bytesperpixel,
                                             const uint16_t *scale, const uint8_t *step)
 {
@@ -2759,8 +2760,8 @@ static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func sm
     int th;
     VP56mv mv;
 
-    mv.x = av_clip(in_mv->x, -(x + bw + 4) << 3, (s->cols * 8 - x + 3) << 3);
-    mv.y = av_clip(in_mv->y, -(y + bh + 4) << 3, (s->rows * 8 - y + 3) << 3);
+    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
+    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
     // BUG libvpx seems to scale the two components separately. This introduces
     // rounding errors but we have to reproduce them to be exactly compatible
     // with the output from libvpx...
@@ -2798,6 +2799,7 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
                                               const uint8_t *ref_v, ptrdiff_t src_stride_v,
                                               ThreadFrame *ref_frame,
                                               ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
+                                              int px, int py, int pw, int ph,
                                               int bw, int bh, int w, int h, int bytesperpixel,
                                               const uint16_t *scale, const uint8_t *step)
 {
@@ -2808,18 +2810,18 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
 
     if (s->ss_h) {
         // BUG https://code.google.com/p/webm/issues/detail?id=820
-        mv.x = av_clip(in_mv->x, -(x + bw + 4) << 4, (s->cols * 4 - x + 3) << 4);
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 4, (s->cols * 4 - x + px + 3) << 4);
         mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
     } else {
-        mv.x = av_clip(in_mv->x, -(x + bw + 4) << 3, (s->cols * 8 - x + 3) << 3);
+        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) << 3, (s->cols * 8 - x + px + 3) << 3);
         mx = scale_mv(mv.x << 1, 0) + scale_mv(x * 16, 0);
     }
     if (s->ss_v) {
         // BUG https://code.google.com/p/webm/issues/detail?id=820
-        mv.y = av_clip(in_mv->y, -(y + bh + 4) << 4, (s->rows * 4 - y + 3) << 4);
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 4, (s->rows * 4 - y + py + 3) << 4);
         my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
     } else {
-        mv.y = av_clip(in_mv->y, -(y + bh + 4) << 3, (s->rows * 8 - y + 3) << 3);
+        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) << 3, (s->rows * 8 - y + py + 3) << 3);
         my = scale_mv(mv.y << 1, 1) + scale_mv(y * 16, 1);
     }
 #undef scale_mv
@@ -2858,14 +2860,15 @@ static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func
     }
 }
 
-#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
     mc_luma_scaled(s, s->dsp.s##mc, dst, dst_ls, src, src_ls, tref, row, col, \
-                   mv, bw, bh, w, h, bytesperpixel, \
+                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                    s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
-                      row, col, mv, bw, bh, w, h, i) \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
     mc_chroma_scaled(s, s->dsp.s##mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
-                     row, col, mv, bw, bh, w, h, bytesperpixel, \
+                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                      s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
 #define SCALED 1
 #define FN(x) x##_scaled_8bpp
@@ -2959,11 +2962,12 @@ static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)
     }
 }
 
-#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, bw, bh, w, h, i) \
+#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
+                    px, py, pw, ph, bw, bh, w, h, i) \
     mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                      mv, bw, bh, w, h, bytesperpixel)
 #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
-                      row, col, mv, bw, bh, w, h, i) \
+                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
     mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                        row, col, mv, bw, bh, w, h, bytesperpixel)
 #define SCALED 0
diff --git a/libavcodec/vp9_mc_template.c b/libavcodec/vp9_mc_template.c
index 5e6ee870b8..f4eb4e56ac 100644
--- a/libavcodec/vp9_mc_template.c
+++ b/libavcodec/vp9_mc_template.c
@@ -57,11 +57,11 @@ static void FN(inter_pred)(AVCodecContext *ctx)
         if (b->bs == BS_8x4) {
             mc_luma_dir(s, mc[3][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1, 0);
+                        row << 3, col << 3, &b->mv[0][0],,,,, 8, 4, w1, h1, 0);
             mc_luma_dir(s, mc[3][b->filter][0],
                         s->dst[0] + 4 * ls_y, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1, 0);
+                        (row << 3) + 4, col << 3, &b->mv[2][0],,,,, 8, 4, w1, h1, 0);
             w1 = (w1 + s->ss_h) >> s->ss_h;
             if (s->ss_v) {
                 h1 = (h1 + 1) >> 1;
@@ -71,14 +71,14 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                               ref1->data[1], ref1->linesize[1],
                               ref1->data[2], ref1->linesize[2], tref1,
                               row << 2, col << (3 - s->ss_h),
-                              &uvmv, 8 >> s->ss_h, 4, w1, h1, 0);
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
             } else {
                 mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][0],
                               s->dst[1], s->dst[2], ls_uv,
                               ref1->data[1], ref1->linesize[1],
                               ref1->data[2], ref1->linesize[2], tref1,
                               row << 3, col << (3 - s->ss_h),
-                              &b->mv[0][0], 8 >> s->ss_h, 4, w1, h1, 0);
+                              &b->mv[0][0],,,,, 8 >> s->ss_h, 4, w1, h1, 0);
                 // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
                 // to get the motion vector for the bottom 4x4 block
                 // https://code.google.com/p/webm/issues/detail?id=993
@@ -92,17 +92,17 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                               ref1->data[1], ref1->linesize[1],
                               ref1->data[2], ref1->linesize[2], tref1,
                               (row << 3) + 4, col << (3 - s->ss_h),
-                              &uvmv, 8 >> s->ss_h, 4, w1, h1, 0);
+                              &uvmv,,,,, 8 >> s->ss_h, 4, w1, h1, 0);
             }
 
             if (b->comp) {
                 mc_luma_dir(s, mc[3][b->filter][1], s->dst[0], ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2, 1);
+                            row << 3, col << 3, &b->mv[0][1],,,,, 8, 4, w2, h2, 1);
                 mc_luma_dir(s, mc[3][b->filter][1],
                             s->dst[0] + 4 * ls_y, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2, 1);
+                            (row << 3) + 4, col << 3, &b->mv[2][1],,,,, 8, 4, w2, h2, 1);
                 w2 = (w2 + s->ss_h) >> s->ss_h;
                 if (s->ss_v) {
                     h2 = (h2 + 1) >> 1;
@@ -112,14 +112,14 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref2->data[1], ref2->linesize[1],
                                   ref2->data[2], ref2->linesize[2], tref2,
                                   row << 2, col << (3 - s->ss_h),
-                                  &uvmv, 8 >> s->ss_h, 4, w2, h2, 1);
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
                 } else {
                     mc_chroma_dir(s, mc[3 + s->ss_h][b->filter][1],
                                   s->dst[1], s->dst[2], ls_uv,
                                   ref2->data[1], ref2->linesize[1],
                                   ref2->data[2], ref2->linesize[2], tref2,
                                   row << 3, col << (3 - s->ss_h),
-                                  &b->mv[0][1], 8 >> s->ss_h, 4, w2, h2, 1);
+                                  &b->mv[0][1],,,,, 8 >> s->ss_h, 4, w2, h2, 1);
                     // BUG for 4:2:2 bs=8x4, libvpx uses the wrong block index
                     // to get the motion vector for the bottom 4x4 block
                     // https://code.google.com/p/webm/issues/detail?id=993
@@ -133,16 +133,16 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref2->data[1], ref2->linesize[1],
                                   ref2->data[2], ref2->linesize[2], tref2,
                                   (row << 3) + 4, col << (3 - s->ss_h),
-                                  &uvmv, 8 >> s->ss_h, 4, w2, h2, 1);
+                                  &uvmv,,,,, 8 >> s->ss_h, 4, w2, h2, 1);
                 }
             }
         } else if (b->bs == BS_4x8) {
             mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1, 0);
+                        row << 3, col << 3, &b->mv[0][0],,,,, 4, 8, w1, h1, 0);
             mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1, 0);
+                        row << 3, (col << 3) + 4, &b->mv[1][0],,,,, 4, 8, w1, h1, 0);
             h1 = (h1 + s->ss_v) >> s->ss_v;
             if (s->ss_h) {
                 w1 = (w1 + 1) >> 1;
@@ -152,30 +152,30 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                               ref1->data[1], ref1->linesize[1],
                               ref1->data[2], ref1->linesize[2], tref1,
                               row << (3 - s->ss_v), col << 2,
-                              &uvmv, 4, 8 >> s->ss_v, w1, h1, 0);
+                              &uvmv,,,,, 4, 8 >> s->ss_v, w1, h1, 0);
             } else {
                 mc_chroma_dir(s, mc[4][b->filter][0],
                               s->dst[1], s->dst[2], ls_uv,
                               ref1->data[1], ref1->linesize[1],
                               ref1->data[2], ref1->linesize[2], tref1,
                               row << (3 - s->ss_v), col << 3,
-                              &b->mv[0][0], 4, 8 >> s->ss_v, w1, h1, 0);
+                              &b->mv[0][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
                 mc_chroma_dir(s, mc[4][b->filter][0],
                               s->dst[1] + 4 * bytesperpixel,
                               s->dst[2] + 4 * bytesperpixel, ls_uv,
                               ref1->data[1], ref1->linesize[1],
                               ref1->data[2], ref1->linesize[2], tref1,
                               row << (3 - s->ss_v), (col << 3) + 4,
-                              &b->mv[1][0], 4, 8 >> s->ss_v, w1, h1, 0);
+                              &b->mv[1][0],,,,, 4, 8 >> s->ss_v, w1, h1, 0);
             }
 
             if (b->comp) {
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2, 1);
+                            row << 3, col << 3, &b->mv[0][1],,,,, 4, 8, w2, h2, 1);
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2, 1);
+                            row << 3, (col << 3) + 4, &b->mv[1][1],,,,, 4, 8, w2, h2, 1);
                 h2 = (h2 + s->ss_v) >> s->ss_v;
                 if (s->ss_h) {
                     w2 = (w2 + 1) >> 1;
@@ -185,21 +185,21 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref2->data[1], ref2->linesize[1],
                                   ref2->data[2], ref2->linesize[2], tref2,
                                   row << (3 - s->ss_v), col << 2,
-                                  &uvmv, 4, 8 >> s->ss_v, w2, h2, 1);
+                                  &uvmv,,,,, 4, 8 >> s->ss_v, w2, h2, 1);
                 } else {
                     mc_chroma_dir(s, mc[4][b->filter][1],
                                   s->dst[1], s->dst[2], ls_uv,
                                   ref2->data[1], ref2->linesize[1],
                                   ref2->data[2], ref2->linesize[2], tref2,
                                   row << (3 - s->ss_v), col << 3,
-                                  &b->mv[0][1], 4, 8 >> s->ss_v, w2, h2, 1);
+                                  &b->mv[0][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
                     mc_chroma_dir(s, mc[4][b->filter][1],
                                   s->dst[1] + 4 * bytesperpixel,
                                   s->dst[2] + 4 * bytesperpixel, ls_uv,
                                   ref2->data[1], ref2->linesize[1],
                                   ref2->data[2], ref2->linesize[2], tref2,
                                   row << (3 - s->ss_v), (col << 3) + 4,
-                                  &b->mv[1][1], 4, 8 >> s->ss_v, w2, h2, 1);
+                                  &b->mv[1][1],,,,, 4, 8 >> s->ss_v, w2, h2, 1);
                 }
             }
         } else
@@ -211,18 +211,22 @@ static void FN(inter_pred)(AVCodecContext *ctx)
             // do a w8 instead of a w4 call
             mc_luma_dir(s, mc[4][b->filter][0], s->dst[0], ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1, 0);
+                        row << 3, col << 3, &b->mv[0][0],
+                        0, 0, 8, 8, 4, 4, w1, h1, 0);
             mc_luma_dir(s, mc[4][b->filter][0], s->dst[0] + 4 * bytesperpixel, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1, 0);
+                        row << 3, (col << 3) + 4, &b->mv[1][0],
+                        4, 0, 8, 8, 4, 4, w1, h1, 0);
             mc_luma_dir(s, mc[4][b->filter][0],
                         s->dst[0] + 4 * ls_y, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1, 0);
+                        (row << 3) + 4, col << 3, &b->mv[2][0],
+                        0, 4, 8, 8, 4, 4, w1, h1, 0);
             mc_luma_dir(s, mc[4][b->filter][0],
                         s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
                         ref1->data[0], ref1->linesize[0], tref1,
-                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1, 0);
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0],
+                        4, 4, 8, 8, 4, 4, w1, h1, 0);
             if (s->ss_v) {
                 h1 = (h1 + 1) >> 1;
                 if (s->ss_h) {
@@ -234,7 +238,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   row << 2, col << 2,
-                                  &uvmv, 4, 4, w1, h1, 0);
+                                  &uvmv, 0, 0, 4, 4, 4, 4, w1, h1, 0);
                 } else {
                     uvmv = ROUNDED_DIV_MVx2(b->mv[0][0], b->mv[2][0]);
                     mc_chroma_dir(s, mc[4][b->filter][0],
@@ -242,7 +246,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   row << 2, col << 3,
-                                  &uvmv, 4, 4, w1, h1, 0);
+                                  &uvmv, 0, 0, 8, 4, 4, 4, w1, h1, 0);
                     uvmv = ROUNDED_DIV_MVx2(b->mv[1][0], b->mv[3][0]);
                     mc_chroma_dir(s, mc[4][b->filter][0],
                                   s->dst[1] + 4 * bytesperpixel,
@@ -250,7 +254,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   row << 2, (col << 3) + 4,
-                                  &uvmv, 4, 4, w1, h1, 0);
+                                  &uvmv, 4, 0, 8, 4, 4, 4, w1, h1, 0);
                 }
             } else {
                 if (s->ss_h) {
@@ -261,7 +265,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   row << 3, col << 2,
-                                  &uvmv, 4, 4, w1, h1, 0);
+                                  &uvmv, 0, 0, 4, 8, 4, 4, w1, h1, 0);
                     // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
                     // bottom block
                     // https://code.google.com/p/webm/issues/detail?id=993
@@ -271,52 +275,52 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   (row << 3) + 4, col << 2,
-                                  &uvmv, 4, 4, w1, h1, 0);
+                                  &uvmv, 0, 4, 4, 8, 4, 4, w1, h1, 0);
                 } else {
                     mc_chroma_dir(s, mc[4][b->filter][0],
                                   s->dst[1], s->dst[2], ls_uv,
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   row << 3, col << 3,
-                                  &b->mv[0][0], 4, 4, w1, h1, 0);
+                                  &b->mv[0][0], 0, 0, 8, 8, 4, 4, w1, h1, 0);
                     mc_chroma_dir(s, mc[4][b->filter][0],
                                   s->dst[1] + 4 * bytesperpixel,
                                   s->dst[2] + 4 * bytesperpixel, ls_uv,
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   row << 3, (col << 3) + 4,
-                                  &b->mv[1][0], 4, 4, w1, h1, 0);
+                                  &b->mv[1][0], 4, 0, 8, 8, 4, 4, w1, h1, 0);
                     mc_chroma_dir(s, mc[4][b->filter][0],
                                   s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   (row << 3) + 4, col << 3,
-                                  &b->mv[2][0], 4, 4, w1, h1, 0);
+                                  &b->mv[2][0], 0, 4, 8, 8, 4, 4, w1, h1, 0);
                     mc_chroma_dir(s, mc[4][b->filter][0],
                                   s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
                                   s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
                                   ref1->data[1], ref1->linesize[1],
                                   ref1->data[2], ref1->linesize[2], tref1,
                                   (row << 3) + 4, (col << 3) + 4,
-                                  &b->mv[3][0], 4, 4, w1, h1, 0);
+                                  &b->mv[3][0], 4, 4, 8, 8, 4, 4, w1, h1, 0);
                 }
             }
 
             if (b->comp) {
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0], ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2, 1);
+                            row << 3, col << 3, &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
                 mc_luma_dir(s, mc[4][b->filter][1], s->dst[0] + 4 * bytesperpixel, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2, 1);
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
                 mc_luma_dir(s, mc[4][b->filter][1],
                             s->dst[0] + 4 * ls_y, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2, 1);
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
                 mc_luma_dir(s, mc[4][b->filter][1],
                             s->dst[0] + 4 * ls_y + 4 * bytesperpixel, ls_y,
                             ref2->data[0], ref2->linesize[0], tref2,
-                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2, 1);
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
                 if (s->ss_v) {
                     h2 = (h2 + 1) >> 1;
                     if (s->ss_h) {
@@ -328,7 +332,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       row << 2, col << 2,
-                                      &uvmv, 4, 4, w2, h2, 1);
+                                      &uvmv, 0, 0, 4, 4, 4, 4, w2, h2, 1);
                     } else {
                         uvmv = ROUNDED_DIV_MVx2(b->mv[0][1], b->mv[2][1]);
                         mc_chroma_dir(s, mc[4][b->filter][1],
@@ -336,7 +340,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       row << 2, col << 3,
-                                      &uvmv, 4, 4, w2, h2, 1);
+                                      &uvmv, 0, 0, 8, 4, 4, 4, w2, h2, 1);
                         uvmv = ROUNDED_DIV_MVx2(b->mv[1][1], b->mv[3][1]);
                         mc_chroma_dir(s, mc[4][b->filter][1],
                                       s->dst[1] + 4 * bytesperpixel,
@@ -344,7 +348,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       row << 2, (col << 3) + 4,
-                                      &uvmv, 4, 4, w2, h2, 1);
+                                      &uvmv, 4, 0, 8, 4, 4, 4, w2, h2, 1);
                     }
                 } else {
                     if (s->ss_h) {
@@ -355,7 +359,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       row << 3, col << 2,
-                                      &uvmv, 4, 4, w2, h2, 1);
+                                      &uvmv, 0, 0, 4, 8, 4, 4, w2, h2, 1);
                         // BUG libvpx uses wrong block index for 4:2:2 bs=4x4
                         // bottom block
                         // https://code.google.com/p/webm/issues/detail?id=993
@@ -365,34 +369,34 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       (row << 3) + 4, col << 2,
-                                      &uvmv, 4, 4, w2, h2, 1);
+                                      &uvmv, 0, 4, 4, 8, 4, 4, w2, h2, 1);
                     } else {
                         mc_chroma_dir(s, mc[4][b->filter][1],
                                       s->dst[1], s->dst[2], ls_uv,
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       row << 3, col << 3,
-                                      &b->mv[0][1], 4, 4, w2, h2, 1);
+                                      &b->mv[0][1], 0, 0, 8, 8, 4, 4, w2, h2, 1);
                         mc_chroma_dir(s, mc[4][b->filter][1],
                                       s->dst[1] + 4 * bytesperpixel,
                                       s->dst[2] + 4 * bytesperpixel, ls_uv,
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       row << 3, (col << 3) + 4,
-                                      &b->mv[1][1], 4, 4, w2, h2, 1);
+                                      &b->mv[1][1], 4, 0, 8, 8, 4, 4, w2, h2, 1);
                         mc_chroma_dir(s, mc[4][b->filter][1],
                                       s->dst[1] + 4 * ls_uv, s->dst[2] + 4 * ls_uv, ls_uv,
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       (row << 3) + 4, col << 3,
-                                      &b->mv[2][1], 4, 4, w2, h2, 1);
+                                      &b->mv[2][1], 0, 4, 8, 8, 4, 4, w2, h2, 1);
                         mc_chroma_dir(s, mc[4][b->filter][1],
                                       s->dst[1] + 4 * ls_uv + 4 * bytesperpixel,
                                       s->dst[2] + 4 * ls_uv + 4 * bytesperpixel, ls_uv,
                                       ref2->data[1], ref2->linesize[1],
                                       ref2->data[2], ref2->linesize[2], tref2,
                                       (row << 3) + 4, (col << 3) + 4,
-                                      &b->mv[3][1], 4, 4, w2, h2, 1);
+                                      &b->mv[3][1], 4, 4, 8, 8, 4, 4, w2, h2, 1);
                     }
                 }
             }
@@ -404,7 +408,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
 
         mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
                     ref1->data[0], ref1->linesize[0], tref1,
-                    row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1, 0);
+                    row << 3, col << 3, &b->mv[0][0], 0, 0, bw, bh, bw, bh, w1, h1, 0);
         w1 = (w1 + s->ss_h) >> s->ss_h;
         h1 = (h1 + s->ss_v) >> s->ss_v;
         mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][0],
@@ -412,12 +416,12 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                       ref1->data[1], ref1->linesize[1],
                       ref1->data[2], ref1->linesize[2], tref1,
                       row << (3 - s->ss_v), col << (3 - s->ss_h),
-                      &b->mv[0][0], uvbw, uvbh, w1, h1, 0);
+                      &b->mv[0][0], 0, 0, uvbw, uvbh, uvbw, uvbh, w1, h1, 0);
 
         if (b->comp) {
             mc_luma_dir(s, mc[bwl][b->filter][1], s->dst[0], ls_y,
                         ref2->data[0], ref2->linesize[0], tref2,
-                        row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2, 1);
+                        row << 3, col << 3, &b->mv[0][1], 0, 0, bw, bh, bw, bh, w2, h2, 1);
             w2 = (w2 + s->ss_h) >> s->ss_h;
             h2 = (h2 + s->ss_v) >> s->ss_v;
             mc_chroma_dir(s, mc[bwl + s->ss_h][b->filter][1],
@@ -425,7 +429,7 @@ static void FN(inter_pred)(AVCodecContext *ctx)
                           ref2->data[1], ref2->linesize[1],
                           ref2->data[2], ref2->linesize[2], tref2,
                           row << (3 - s->ss_v), col << (3 - s->ss_h),
-                          &b->mv[0][1], uvbw, uvbh, w2, h2, 1);
+                          &b->mv[0][1], 0, 0, uvbw, uvbh, uvbw, uvbh, w2, h2, 1);
         }
     }
 }
-- 
cgit v1.2.3


From 900e3af857871f1075d9f095ea22bfee1484c086 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 16 May 2015 11:59:01 -0400
Subject: vp9: match another find_ref_mvs() bug in libvpx.

For a sub8x8 block other than the first, if we find a second
non-sub8x8 motion vector whose clamped value is identical to the
first non-sub8x8 motion vector, libvpx forces the resulting nearmv
motion vector to zero. Match that behaviour.
---
 libavcodec/vp9.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 09c96f162a..7f28b2689e 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -1111,7 +1111,7 @@ static void find_ref_mvs(VP9Context *s,
     int row = s->row, col = s->col, row7 = s->row7;
     const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
 #define INVALID_MV 0x80008000U
-    uint32_t mem = INVALID_MV;
+    uint32_t mem = INVALID_MV, mem_sub8x8 = INVALID_MV;
     int i;
 
 #define RETURN_DIRECT_MV(mv) \
@@ -1142,15 +1142,25 @@ static void find_ref_mvs(VP9Context *s,
         if (sb > 0) { \
             VP56mv tmp; \
             uint32_t m; \
-            clamp_mv(&tmp, &mv, s); \
-            m = AV_RN32A(&tmp); \
-            if (!idx) { \
-                AV_WN32A(pmv, m); \
-                return; \
-            } else if (mem == INVALID_MV) { \
-                mem = m; \
-            } else if (m != mem) { \
-                AV_WN32A(pmv, m); \
+            av_assert2(idx == 1); \
+            av_assert2(mem != INVALID_MV); \
+            if (mem_sub8x8 == INVALID_MV) { \
+                clamp_mv(&tmp, &mv, s); \
+                m = AV_RN32A(&tmp); \
+                if (m != mem) { \
+                    AV_WN32A(pmv, m); \
+                    return; \
+                } \
+                mem_sub8x8 = AV_RN32A(&mv); \
+            } else if (mem_sub8x8 != AV_RN32A(&mv)) { \
+                clamp_mv(&tmp, &mv, s); \
+                m = AV_RN32A(&tmp); \
+                if (m != mem) { \
+                    AV_WN32A(pmv, m); \
+                } else { \
+                    /* BUG I'm pretty sure this isn't the intention */ \
+                    AV_WN32A(pmv, 0); \
+                } \
                 return; \
             } \
         } else { \
-- 
cgit v1.2.3