From 72ca830f511fcdc01253689615faed25da7c57bf Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Mon, 30 Sep 2013 23:03:30 -0400
Subject: lavc: VP9 decoder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Originally written by Ronald S. Bultje <rsbultje@gmail.com> and
Clément Bœsch <u@pkh.me>

Further contributions by:
Anton Khirnov <anton@khirnov.net>
Diego Biurrun <diego@biurrun.de>
Luca Barbato <lu_zero@gentoo.org>
Martin Storsjö <martin@martin.st>

Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/Makefile          |    2 +
 libavcodec/allcodecs.c       |    1 +
 libavcodec/version.h         |    2 +-
 libavcodec/vp9.c             | 1270 ++++++++++++++++++++++++
 libavcodec/vp9.h             |  419 ++++++++
 libavcodec/vp9block.c        | 1684 ++++++++++++++++++++++++++++++++
 libavcodec/vp9data.c         | 2133 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/vp9data.h         |   70 ++
 libavcodec/vp9dsp.c          | 2174 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/vp9mvs.c          |  344 +++++++
 libavcodec/vp9prob.c         |  274 ++++++
 libavcodec/x86/Makefile      |    2 +
 libavcodec/x86/vp9dsp.asm    |  277 ++++++
 libavcodec/x86/vp9dsp_init.c |  245 +++++
 14 files changed, 8896 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/vp9.c
 create mode 100644 libavcodec/vp9.h
 create mode 100644 libavcodec/vp9block.c
 create mode 100644 libavcodec/vp9data.c
 create mode 100644 libavcodec/vp9data.h
 create mode 100644 libavcodec/vp9dsp.c
 create mode 100644 libavcodec/vp9mvs.c
 create mode 100644 libavcodec/vp9prob.c
 create mode 100644 libavcodec/x86/vp9dsp.asm
 create mode 100644 libavcodec/x86/vp9dsp_init.c

(limited to 'libavcodec')
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 03d7459bb6..1674d4740f 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -392,6 +392,8 @@ OBJS-$(CONFIG_VP5_DECODER)             += vp5.o vp56.o vp56data.o vp56dsp.o \
 OBJS-$(CONFIG_VP6_DECODER)             += vp6.o vp56.o vp56data.o vp56dsp.o \
                                           vp6dsp.o vp56rac.o
 OBJS-$(CONFIG_VP8_DECODER)             += vp8.o vp8dsp.o vp56rac.o
+OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o \
+                                          vp9block.o vp9prob.o vp9mvs.o vp56rac.o
 OBJS-$(CONFIG_VQA_DECODER)             += vqavideo.o
 OBJS-$(CONFIG_WAVPACK_DECODER)         += wavpack.o
 OBJS-$(CONFIG_WEBP_DECODER)            += webp.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index faa94b1ecb..6453e300f5 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -257,6 +257,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER(VP6A,              vp6a);
     REGISTER_DECODER(VP6F,              vp6f);
     REGISTER_DECODER(VP8,               vp8);
+    REGISTER_DECODER(VP9,               vp9);
     REGISTER_DECODER(VQA,               vqa);
     REGISTER_DECODER(WEBP,              webp);
     REGISTER_ENCDEC (WMV1,              wmv1);
diff --git a/libavcodec/version.h b/libavcodec/version.h
index 38a606422d..e9d09ac98e 100644
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -27,7 +27,7 @@
  */
 
 #define LIBAVCODEC_VERSION_MAJOR 55
-#define LIBAVCODEC_VERSION_MINOR 27
+#define LIBAVCODEC_VERSION_MINOR 28
 #define LIBAVCODEC_VERSION_MICRO  0
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
new file mode 100644
index 0000000000..9048700fb7
--- /dev/null
+++ b/libavcodec/vp9.c
@@ -0,0 +1,1270 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "videodsp.h"
+#include "vp56.h"
+#include "vp9.h"
+#include "vp9data.h"
+
+#define VP9_SYNCCODE 0x498342
+#define MAX_PROB 255
+
+/* Unreference every frame held in the decoder's reference slots. */
+static void vp9_decode_flush(AVCodecContext *avctx)
+{
+    VP9Context *ctx = avctx->priv_data;
+    int slot = 0;
+    while (slot < FF_ARRAY_ELEMS(ctx->refs))
+        av_frame_unref(ctx->refs[slot++]);
+}
+
+/* Set up all frame-size dependent state for a w x h frame; no-op if the size is
+ * unchanged. Returns 0 on success or a negative AVERROR code on failure. */
+static int update_size(AVCodecContext *avctx, int w, int h)
+{
+    VP9Context *s = avctx->priv_data;
+    uint8_t *p;
+
+    if (s->above_partition_ctx && w == avctx->width && h == avctx->height)
+        return 0;
+    vp9_decode_flush(avctx);
+    if (w <= 0 || h <= 0)
+        return AVERROR_INVALIDDATA;
+
+    avctx->width  = w;
+    avctx->height = h;
+    s->sb_cols    = (w + 63) >> 6;
+    s->sb_rows    = (h + 63) >> 6;
+    s->cols       = (w +  7) >> 3;
+    s->rows       = (h +  7) >> 3;
+    /* One allocation backs every array below. av_freep() clears the owning
+     * pointer, so a failed av_malloc() forces a fresh attempt next call. */
+#define assign(var, type, n) var = (type)p; p += s->sb_cols * (n) * sizeof(*var)
+    av_freep(&s->above_partition_ctx);
+    p = av_malloc(s->sb_cols *
+                  (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx) +
+                   64 * s->sb_rows * (1 + sizeof(*s->mv[0]) * 2)));
+    if (!p)
+        return AVERROR(ENOMEM);
+    assign(s->above_partition_ctx, uint8_t *,     8);
+    assign(s->above_skip_ctx,      uint8_t *,     8);
+    assign(s->above_txfm_ctx,      uint8_t *,     8);
+    assign(s->above_mode_ctx,      uint8_t *,    16);
+    assign(s->above_y_nnz_ctx,     uint8_t *,    16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,     8);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,     8);
+    assign(s->intra_pred_data[0],  uint8_t *,    64);
+    assign(s->intra_pred_data[1],  uint8_t *,    32);
+    assign(s->intra_pred_data[2],  uint8_t *,    32);
+    assign(s->above_segpred_ctx,   uint8_t *,     8);
+    assign(s->above_intra_ctx,     uint8_t *,     8);
+    assign(s->above_comp_ctx,      uint8_t *,     8);
+    assign(s->above_ref_ctx,       uint8_t *,     8);
+    assign(s->above_filter_ctx,    uint8_t *,     8);
+    assign(s->lflvl,               VP9Filter *,   1);
+    assign(s->above_mv_ctx,        VP56mv(*)[2], 16);
+    assign(s->segmentation_map,    uint8_t *,      64 * s->sb_rows);
+    assign(s->mv[0],               VP9MVRefPair *, 64 * s->sb_rows);
+    assign(s->mv[1],               VP9MVRefPair *, 64 * s->sb_rows);
+#undef assign
+    return 0;
+}
+
+// Read an n-bit magnitude followed by a trailing sign bit (set = negative)
+static av_always_inline int get_bits_with_sign(GetBitContext *gb, int n)
+{
+    const int magnitude = get_bits(gb, n);
+    return get_bits1(gb) ? -magnitude : magnitude;
+}
+
+// Map v back around the centre m; values above 2 * m are returned unchanged
+static av_always_inline int inv_recenter_nonneg(int v, int m)
+{
+    if (v > 2 * m)
+        return v;
+    // odd v lands below m, even v lands on or above it
+    return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
+}
+
+// Decode one differential forward update of probability p from the range coder
+static int update_prob(VP56RangeCoder *c, int p)
+{
+    static const int inv_map_table[MAX_PROB - 1] = {
+          7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
+        189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
+         10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
+         25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
+         40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
+         55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
+         70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
+         86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
+        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
+        116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
+        131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
+        146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
+        161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+        177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
+        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
+        207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
+        222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
+        237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
+        252, 253,
+    };
+    int idx, delta;
+
+    /* Differential probability update. For a current probability A in
+     * [1, 255], the delta to any new probability lies in [1 - A, 255 - A].
+     * Part of that range exists on both sides of zero, the rest on one side
+     * only. The shared part is coded as twice the magnitude with the lowest
+     * bit giving the sign, and the one-sided part is coded on top of it
+     * (see inv_recenter_nonneg()). The resulting absolute difference again
+     * lies in [0, 254]; larger values mean a bigger jump away from A and
+     * are increasingly unlikely, so the index into inv_map_table[] is read
+     * as a variable-length code: short codes for small indices, long codes
+     * for large ones. The first 20 entries of inv_map_table[] are coarse
+     * steps that give 'cheap, rough' updates; the remaining entries fill in
+     * the values in between for 'fine, exact' updates, which adds one extra
+     * dimension to this differential update model.
+     */
+
+    if (!vp8_rac_get(c)) {
+        idx = vp8_rac_get_uint(c, 4);
+    } else if (!vp8_rac_get(c)) {
+        idx = 16 + vp8_rac_get_uint(c, 4);
+    } else if (!vp8_rac_get(c)) {
+        idx = 32 + vp8_rac_get_uint(c, 5);
+    } else {
+        idx = vp8_rac_get_uint(c, 7);
+        if (idx >= 65) {
+            idx = 2 * idx - 65 + vp8_rac_get(c);
+            idx = av_clip(idx, 0, MAX_PROB - 65 - 1);
+        }
+        idx += 64;
+    }
+
+    delta = inv_map_table[idx];
+    return p > 128 ? 255 - inv_recenter_nonneg(delta, 255 - p)
+                   :   1 + inv_recenter_nonneg(delta, p - 1);
+}
+
+/* Parse the uncompressed frame header from the bitstream and the compressed
+ * (arithmetic-coded) header that follows it, updating the decoder context.
+ * Returns a negative AVERROR code on failure, 0 if the header only carries a
+ * 3-bit reference index (which is stored in *ref), or otherwise the total
+ * size in bytes of both headers. */
+static int decode_frame_header(AVCodecContext *avctx,
+                               const uint8_t *data, int size, int *ref)
+{
+    VP9Context *s = avctx->priv_data;
+    int c, i, j, k, l, m, n, w, h, max, size2, ret, sharp, last_invisible;
+    const uint8_t *data2;
+
+    /* general header */
+    if ((ret = init_get_bits8(&s->gb, data, size)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
+        return ret;
+    }
+    if (get_bits(&s->gb, 2) != 0x2) { // frame marker
+        av_log(avctx, AV_LOG_ERROR, "Invalid frame marker\n");
+        return AVERROR_INVALIDDATA;
+    }
+    s->profile = get_bits1(&s->gb);
+    if (get_bits1(&s->gb)) { // reserved bit
+        av_log(avctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
+        return AVERROR_INVALIDDATA;
+    }
+    if (get_bits1(&s->gb)) {
+        *ref = get_bits(&s->gb, 3);
+        return 0;
+    }
+
+    s->last_keyframe = s->keyframe;
+    s->keyframe      = !get_bits1(&s->gb);
+
+    last_invisible = s->invisible;
+    s->invisible   = !get_bits1(&s->gb);
+    s->errorres    = get_bits1(&s->gb);
+    // FIXME disable this upon resolution change
+    s->use_last_frame_mvs = !s->errorres && !last_invisible;
+
+    if (s->keyframe) {
+        if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
+            av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->colorspace = get_bits(&s->gb, 3);
+        if (s->colorspace == 7) { // RGB = profile 1
+            av_log(avctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
+            return AVERROR_INVALIDDATA;
+        }
+        s->fullrange = get_bits1(&s->gb);
+        // for profile 1, here follows the subsampling bits
+        s->refreshrefmask = 0xff;
+        w = get_bits(&s->gb, 16) + 1;
+        h = get_bits(&s->gb, 16) + 1;
+        if (get_bits1(&s->gb)) // display size
+            skip_bits(&s->gb, 32);
+    } else {
+        s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
+        s->resetctx  = s->errorres ? 0 : get_bits(&s->gb, 2);
+        if (s->intraonly) {
+            if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
+                av_log(avctx, AV_LOG_ERROR, "Invalid sync code\n");
+                return AVERROR_INVALIDDATA;
+            }
+            s->refreshrefmask = get_bits(&s->gb, 8);
+            w = get_bits(&s->gb, 16) + 1;
+            h = get_bits(&s->gb, 16) + 1;
+            if (get_bits1(&s->gb)) // display size
+                skip_bits(&s->gb, 32);
+        } else {
+            s->refreshrefmask = get_bits(&s->gb, 8);
+            s->refidx[0]      = get_bits(&s->gb, 3);
+            s->signbias[0]    = get_bits1(&s->gb);
+            s->refidx[1]      = get_bits(&s->gb, 3);
+            s->signbias[1]    = get_bits1(&s->gb);
+            s->refidx[2]      = get_bits(&s->gb, 3);
+            s->signbias[2]    = get_bits1(&s->gb);
+            if (!s->refs[s->refidx[0]]->buf[0] ||
+                !s->refs[s->refidx[1]]->buf[0] ||
+                !s->refs[s->refidx[2]]->buf[0]) {
+                av_log(avctx, AV_LOG_ERROR,
+                       "Not all references are available\n");
+                return AVERROR_INVALIDDATA;
+            }
+            if (get_bits1(&s->gb)) {
+                w = s->refs[s->refidx[0]]->width;
+                h = s->refs[s->refidx[0]]->height;
+            } else if (get_bits1(&s->gb)) {
+                w = s->refs[s->refidx[1]]->width;
+                h = s->refs[s->refidx[1]]->height;
+            } else if (get_bits1(&s->gb)) {
+                w = s->refs[s->refidx[2]]->width;
+                h = s->refs[s->refidx[2]]->height;
+            } else {
+                w = get_bits(&s->gb, 16) + 1;
+                h = get_bits(&s->gb, 16) + 1;
+            }
+            if (get_bits1(&s->gb)) // display size
+                skip_bits(&s->gb, 32);
+            s->highprecisionmvs = get_bits1(&s->gb);
+            s->filtermode       = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
+                                  get_bits(&s->gb, 2);
+            s->allowcompinter   = s->signbias[0] != s->signbias[1] ||
+                                  s->signbias[0] != s->signbias[2];
+            if (s->allowcompinter) {
+                if (s->signbias[0] == s->signbias[1]) {
+                    s->fixcompref    = 2;
+                    s->varcompref[0] = 0;
+                    s->varcompref[1] = 1;
+                } else if (s->signbias[0] == s->signbias[2]) {
+                    s->fixcompref    = 1;
+                    s->varcompref[0] = 0;
+                    s->varcompref[1] = 2;
+                } else {
+                    s->fixcompref    = 0;
+                    s->varcompref[0] = 1;
+                    s->varcompref[1] = 2;
+                }
+            }
+        }
+    }
+
+    s->refreshctx   = s->errorres ? 0 : get_bits1(&s->gb);
+    s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
+    s->framectxid   = c = get_bits(&s->gb, 2);
+
+    /* loopfilter header data */
+    s->filter.level = get_bits(&s->gb, 6);
+    sharp           = get_bits(&s->gb, 3);
+    /* If sharpness changed, reinit lim/mblim LUTs. if it didn't change,
+     * keep the old cache values since they are still valid. */
+    if (s->filter.sharpness != sharp)
+        memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
+    s->filter.sharpness = sharp;
+    if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
+        if (get_bits1(&s->gb)) {
+            for (i = 0; i < 4; i++)
+                if (get_bits1(&s->gb))
+                    s->lf_delta.ref[i] = get_bits_with_sign(&s->gb, 6);
+            for (i = 0; i < 2; i++)
+                if (get_bits1(&s->gb))
+                    s->lf_delta.mode[i] = get_bits_with_sign(&s->gb, 6);
+        }
+    } else {
+        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
+    }
+
+    /* quantization header data */
+    s->yac_qi      = get_bits(&s->gb, 8);
+    s->ydc_qdelta  = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
+    s->uvdc_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
+    s->uvac_qdelta = get_bits1(&s->gb) ? get_bits_with_sign(&s->gb, 4) : 0;
+    s->lossless    = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
+                     s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
+
+    /* segmentation header info */
+    if ((s->segmentation.enabled = get_bits1(&s->gb))) {
+        if ((s->segmentation.update_map = get_bits1(&s->gb))) {
+            for (i = 0; i < 7; i++)
+                s->prob.seg[i] = get_bits1(&s->gb) ?
+                                 get_bits(&s->gb, 8) : 255;
+            if ((s->segmentation.temporal = get_bits1(&s->gb)))
+                for (i = 0; i < 3; i++)
+                    s->prob.segpred[i] = get_bits1(&s->gb) ?
+                                         get_bits(&s->gb, 8) : 255;
+        }
+
+        if (get_bits1(&s->gb)) {
+            s->segmentation.absolute_vals = get_bits1(&s->gb);
+            for (i = 0; i < 8; i++) {
+                if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
+                    s->segmentation.feat[i].q_val = get_bits_with_sign(&s->gb, 8);
+                if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
+                    s->segmentation.feat[i].lf_val = get_bits_with_sign(&s->gb, 6);
+                if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
+                    s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
+                s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
+            }
+        }
+    } else {
+        s->segmentation.feat[0].q_enabled    = 0;
+        s->segmentation.feat[0].lf_enabled   = 0;
+        s->segmentation.feat[0].skip_enabled = 0;
+        s->segmentation.feat[0].ref_enabled  = 0;
+    }
+
+    // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
+    for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
+        int qyac, qydc, quvac, quvdc, lflvl, sh;
+
+        if (s->segmentation.feat[i].q_enabled) {
+            if (s->segmentation.absolute_vals)
+                qyac = s->segmentation.feat[i].q_val;
+            else
+                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
+        } else {
+            qyac = s->yac_qi;
+        }
+        qydc  = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
+        quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
+        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
+        qyac  = av_clip_uintp2(qyac, 8);
+
+        s->segmentation.feat[i].qmul[0][0] = ff_vp9_dc_qlookup[qydc];
+        s->segmentation.feat[i].qmul[0][1] = ff_vp9_ac_qlookup[qyac];
+        s->segmentation.feat[i].qmul[1][0] = ff_vp9_dc_qlookup[quvdc];
+        s->segmentation.feat[i].qmul[1][1] = ff_vp9_ac_qlookup[quvac];
+
+        sh = s->filter.level >= 32; // deltas get scaled by 1 << sh; they may be < 0
+        if (s->segmentation.feat[i].lf_enabled) {
+            if (s->segmentation.absolute_vals)
+                lflvl = s->segmentation.feat[i].lf_val;
+            else
+                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
+        } else {
+            lflvl = s->filter.level;
+        }
+        s->segmentation.feat[i].lflvl[0][0] =
+        s->segmentation.feat[i].lflvl[0][1] =
+            av_clip_uintp2(lflvl + s->lf_delta.ref[0] * (1 << sh), 6);
+        for (j = 1; j < 4; j++) {
+            s->segmentation.feat[i].lflvl[j][0] =
+                av_clip_uintp2(lflvl + (s->lf_delta.ref[j] +
+                                        s->lf_delta.mode[0]) * (1 << sh), 6);
+            s->segmentation.feat[i].lflvl[j][1] =
+                av_clip_uintp2(lflvl + (s->lf_delta.ref[j] +
+                                        s->lf_delta.mode[1]) * (1 << sh), 6);
+        }
+    }
+
+    /* tiling info */
+    if ((ret = update_size(avctx, w, h)) < 0) {
+        av_log(avctx, AV_LOG_ERROR,
+               "Failed to initialize decoder for %dx%d\n", w, h);
+        return ret;
+    }
+    for (s->tiling.log2_tile_cols = 0;
+         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
+         s->tiling.log2_tile_cols++) ;
+    for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
+    max = FFMAX(0, max - 1);
+    while (max > s->tiling.log2_tile_cols) {
+        if (get_bits1(&s->gb))
+            s->tiling.log2_tile_cols++;
+        else
+            break;
+    }
+    s->tiling.log2_tile_rows = decode012(&s->gb);
+    s->tiling.tile_rows      = 1 << s->tiling.log2_tile_rows;
+    if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
+        int tile_cols      = 1 << s->tiling.log2_tile_cols;
+        VP56RangeCoder *rc = av_fast_realloc(s->c_b, &s->c_b_size,
+                                             sizeof(VP56RangeCoder) * tile_cols);
+        if (!rc) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Ran out of memory during range coder init\n");
+            return AVERROR(ENOMEM);
+        }
+        // commit only on success so that a failed allocation leaves the old
+        // buffer and its matching tile count in place
+        s->c_b              = rc;
+        s->tiling.tile_cols = tile_cols;
+    }
+
+    if (s->keyframe || s->errorres || s->intraonly) {
+        for (i = 0; i < 4; i++) { // reset all four stored contexts
+            s->prob_ctx[i].p = ff_vp9_default_probs;
+            memcpy(s->prob_ctx[i].coef, ff_vp9_default_coef_probs,
+                   sizeof(ff_vp9_default_coef_probs));
+        }
+    }
+
+    // next 16 bits is size of the rest of the header (arith-coded)
+    size2 = get_bits(&s->gb, 16);
+    data2 = align_get_bits(&s->gb);
+    if (size2 > size - (data2 - data)) {
+        av_log(avctx, AV_LOG_ERROR, "Invalid compressed header size\n");
+        return AVERROR_INVALIDDATA;
+    }
+    ff_vp56_init_range_decoder(&s->c, data2, size2);
+    if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
+        av_log(avctx, AV_LOG_ERROR, "Marker bit was set\n");
+        return AVERROR_INVALIDDATA;
+    }
+
+    if (s->keyframe || s->intraonly)
+        memset(s->counts.coef, 0,
+               sizeof(s->counts.coef) + sizeof(s->counts.eob));
+    else
+        memset(&s->counts, 0, sizeof(s->counts));
+
+    /* FIXME is it faster to not copy here, but do it down in the fw updates
+     * as explicit copies if the fw update is missing (and skip the copy upon
+     * fw update)? */
+    s->prob.p = s->prob_ctx[c].p;
+
+    // txfm updates
+    if (s->lossless) {
+        s->txfmmode = TX_4X4;
+    } else {
+        s->txfmmode = vp8_rac_get_uint(&s->c, 2);
+        if (s->txfmmode == 3)
+            s->txfmmode += vp8_rac_get(&s->c);
+
+        if (s->txfmmode == TX_SWITCHABLE) {
+            for (i = 0; i < 2; i++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
+            for (i = 0; i < 2; i++)
+                for (j = 0; j < 2; j++)
+                    if (vp56_rac_get_prob_branchy(&s->c, 252))
+                        s->prob.p.tx16p[i][j] =
+                            update_prob(&s->c, s->prob.p.tx16p[i][j]);
+            for (i = 0; i < 2; i++)
+                for (j = 0; j < 3; j++)
+                    if (vp56_rac_get_prob_branchy(&s->c, 252))
+                        s->prob.p.tx32p[i][j] =
+                            update_prob(&s->c, s->prob.p.tx32p[i][j]);
+        }
+    }
+
+    // coef updates
+    for (i = 0; i < 4; i++) {
+        uint8_t (*coef_ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
+        if (vp8_rac_get(&s->c)) {
+            for (j = 0; j < 2; j++)
+                for (k = 0; k < 2; k++)
+                    for (l = 0; l < 6; l++)
+                        for (m = 0; m < 6; m++) {
+                            uint8_t *p = s->prob.coef[i][j][k][l][m];
+                            uint8_t *r = coef_ref[j][k][l][m];
+                            if (m >= 3 && l == 0) // dc only has 3 pt
+                                break;
+                            for (n = 0; n < 3; n++) {
+                                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                                    p[n] = update_prob(&s->c, r[n]);
+                                else
+                                    p[n] = r[n];
+                            }
+                            p[3] = 0;
+                        }
+        } else {
+            for (j = 0; j < 2; j++)
+                for (k = 0; k < 2; k++)
+                    for (l = 0; l < 6; l++)
+                        for (m = 0; m < 6; m++) {
+                            uint8_t *p = s->prob.coef[i][j][k][l][m];
+                            uint8_t *r = coef_ref[j][k][l][m];
+                            if (m > 3 && l == 0) // dc only has 3 pt
+                                break;
+                            memcpy(p, r, 3);
+                            p[3] = 0;
+                        }
+        }
+        if (s->txfmmode == i)
+            break;
+    }
+
+    // mode updates
+    for (i = 0; i < 3; i++)
+        if (vp56_rac_get_prob_branchy(&s->c, 252))
+            s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
+    if (!s->keyframe && !s->intraonly) {
+        for (i = 0; i < 7; i++)
+            for (j = 0; j < 3; j++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.mv_mode[i][j] =
+                        update_prob(&s->c, s->prob.p.mv_mode[i][j]);
+
+        if (s->filtermode == FILTER_SWITCHABLE)
+            for (i = 0; i < 4; i++)
+                for (j = 0; j < 2; j++)
+                    if (vp56_rac_get_prob_branchy(&s->c, 252))
+                        s->prob.p.filter[i][j] =
+                            update_prob(&s->c, s->prob.p.filter[i][j]);
+
+        for (i = 0; i < 4; i++)
+            if (vp56_rac_get_prob_branchy(&s->c, 252))
+                s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
+
+        if (s->allowcompinter) {
+            s->comppredmode = vp8_rac_get(&s->c);
+            if (s->comppredmode)
+                s->comppredmode += vp8_rac_get(&s->c);
+            if (s->comppredmode == PRED_SWITCHABLE)
+                for (i = 0; i < 5; i++)
+                    if (vp56_rac_get_prob_branchy(&s->c, 252))
+                        s->prob.p.comp[i] =
+                            update_prob(&s->c, s->prob.p.comp[i]);
+        } else {
+            s->comppredmode = PRED_SINGLEREF;
+        }
+
+        if (s->comppredmode != PRED_COMPREF) {
+            for (i = 0; i < 5; i++) {
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.single_ref[i][0] =
+                        update_prob(&s->c, s->prob.p.single_ref[i][0]);
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.single_ref[i][1] =
+                        update_prob(&s->c, s->prob.p.single_ref[i][1]);
+            }
+        }
+
+        if (s->comppredmode != PRED_SINGLEREF) {
+            for (i = 0; i < 5; i++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.comp_ref[i] =
+                        update_prob(&s->c, s->prob.p.comp_ref[i]);
+        }
+
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < 9; j++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.y_mode[i][j] =
+                        update_prob(&s->c, s->prob.p.y_mode[i][j]);
+
+        for (i = 0; i < 4; i++)
+            for (j = 0; j < 4; j++)
+                for (k = 0; k < 3; k++)
+                    if (vp56_rac_get_prob_branchy(&s->c, 252))
+                        s->prob.p.partition[3 - i][j][k] =
+                            update_prob(&s->c,
+                                        s->prob.p.partition[3 - i][j][k]);
+
+        // mv fields don't use the update_prob subexp model for some reason
+        for (i = 0; i < 3; i++)
+            if (vp56_rac_get_prob_branchy(&s->c, 252))
+                s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+
+        for (i = 0; i < 2; i++) {
+            if (vp56_rac_get_prob_branchy(&s->c, 252))
+                s->prob.p.mv_comp[i].sign =
+                    (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+
+            for (j = 0; j < 10; j++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.mv_comp[i].classes[j] =
+                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+
+            if (vp56_rac_get_prob_branchy(&s->c, 252))
+                s->prob.p.mv_comp[i].class0 =
+                    (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+
+            for (j = 0; j < 10; j++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.mv_comp[i].bits[j] =
+                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+        }
+
+        for (i = 0; i < 2; i++) {
+            for (j = 0; j < 2; j++)
+                for (k = 0; k < 3; k++)
+                    if (vp56_rac_get_prob_branchy(&s->c, 252))
+                        s->prob.p.mv_comp[i].class0_fp[j][k] =
+                            (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+
+            for (j = 0; j < 3; j++)
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.mv_comp[i].fp[j] =
+                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+        }
+
+        if (s->highprecisionmvs) {
+            for (i = 0; i < 2; i++) {
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.mv_comp[i].class0_hp =
+                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+
+                if (vp56_rac_get_prob_branchy(&s->c, 252))
+                    s->prob.p.mv_comp[i].hp =
+                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
+            }
+        }
+    }
+
+    return (data2 - data) + size2;
+}
+
+/* Recursively decode the partition tree of the block at (row, col), level bl. */
+static int decode_subblock(AVCodecContext *avctx, int row, int col,
+                           VP9Filter *lflvl,
+                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
+{
+    VP9Context *s = avctx->priv_data;
+    int c = ((s->above_partition_ctx[col]       >> (3 - bl)) & 1) |
+            (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
+    int ret; // 0 to continue; anything else stops decoding further sub-blocks
+    const uint8_t *p = s->keyframe ? ff_vp9_default_kf_partition_probs[bl][c]
+                                   : s->prob.p.partition[bl][c];
+    enum BlockPartition bp;
+    ptrdiff_t hbs = 4 >> bl; // half the block size, in units of 8 pixels
+
+    if (bl == BL_8X8) { // last level: bp goes to the block decoder, no recursion
+        bp  = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+        ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+    } else if (col + hbs < s->cols) {
+        if (row + hbs < s->rows) { // both halves start inside the frame
+            bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+            switch (bp) {
+            case PARTITION_NONE:
+                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
+                                          bl, bp);
+                break;
+            case PARTITION_H: // top half, then bottom half
+                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
+                                          bl, bp);
+                if (!ret) {
+                    yoff  += hbs * 8 * s->cur_frame->linesize[0];
+                    uvoff += hbs * 4 * s->cur_frame->linesize[1];
+                    ret    = ff_vp9_decode_block(avctx, row + hbs, col, lflvl,
+                                                 yoff, uvoff, bl, bp);
+                }
+                break;
+            case PARTITION_V: // left half, then right half
+                ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
+                                          bl, bp);
+                if (!ret) {
+                    yoff  += hbs * 8;
+                    uvoff += hbs * 4;
+                    ret    = ff_vp9_decode_block(avctx, row, col + hbs, lflvl,
+                                                 yoff, uvoff, bl, bp);
+                }
+                break;
+            case PARTITION_SPLIT: // four quadrants in raster order
+                ret = decode_subblock(avctx, row, col, lflvl,
+                                      yoff, uvoff, bl + 1);
+                if (!ret) {
+                    ret = decode_subblock(avctx, row, col + hbs, lflvl,
+                                          yoff + 8 * hbs, uvoff + 4 * hbs,
+                                          bl + 1);
+                    if (!ret) {
+                        yoff  += hbs * 8 * s->cur_frame->linesize[0];
+                        uvoff += hbs * 4 * s->cur_frame->linesize[1];
+                        ret    = decode_subblock(avctx, row + hbs, col, lflvl,
+                                                 yoff, uvoff, bl + 1);
+                        if (!ret)
+                            ret = decode_subblock(avctx, row + hbs, col + hbs,
+                                                  lflvl, yoff + 8 * hbs,
+                                                  uvoff + 4 * hbs, bl + 1);
+                    }
+                }
+                break;
+            default:
+                av_log(avctx, AV_LOG_ERROR, "Unexpected partition %d.\n", bp);
+                return AVERROR_INVALIDDATA;
+            }
+        } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) { // bottom half off-frame
+            bp  = PARTITION_SPLIT;
+            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            if (!ret)
+                ret = decode_subblock(avctx, row, col + hbs, lflvl,
+                                      yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
+        } else {
+            bp  = PARTITION_H;
+            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
+                                      bl, bp);
+        }
+    } else if (row + hbs < s->rows) { // right half off-frame
+        if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
+            bp  = PARTITION_SPLIT;
+            ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            if (!ret) {
+                yoff  += hbs * 8 * s->cur_frame->linesize[0];
+                uvoff += hbs * 4 * s->cur_frame->linesize[1];
+                ret    = decode_subblock(avctx, row + hbs, col, lflvl,
+                                         yoff, uvoff, bl + 1);
+            }
+        } else {
+            bp  = PARTITION_V;
+            ret = ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff,
+                                      bl, bp);
+        }
+    } else { // right and bottom halves off-frame: split is implied, nothing read
+        bp  = PARTITION_SPLIT;
+        ret = decode_subblock(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+    }
+    s->counts.partition[bl][c][bp]++; // symbol statistics for this level/context
+
+    return ret;
+}
+
+static void loopfilter_subblock(AVCodecContext *avctx, VP9Filter *lflvl,
+                                int row, int col,
+                                ptrdiff_t yoff, ptrdiff_t uvoff)
+{
+    VP9Context *s = avctx->priv_data;
+    uint8_t *dst   = s->cur_frame->data[0] + yoff, *lvl = lflvl->level;
+    ptrdiff_t ls_y = s->cur_frame->linesize[0], ls_uv = s->cur_frame->linesize[1];
+    int y, x, p;
+    /* Loop-filter one 64x64 superblock in place, driven by the masks in lflvl. */
+    /* FIXME: In how far can we interleave the v/h loopfilter calls? E.g.
+     * if you think of them as acting on a 8x8 block max, we can interleave
+     * each v/h within the single x loop, but that only works if we work on
+     * 8 pixel blocks, and we won't always do that (we want at least 16px
+     * to use SSE2 optimizations, perhaps 32 for AVX2). */
+
+    // filter edges between columns, Y plane (e.g. block1 | block2)
+    for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) { // 2 block rows per pass
+        uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
+        uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
+        unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
+        unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
+        unsigned hm  = hm1 | hm2 | hm13 | hm23;
+
+        for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) { // one mask bit per 8 px
+            if (hm1 & x) {
+                int L = *l, H = L >> 4; // L: level; E/I/H are mb_lim/lim/hev_thr
+                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                if (col || x > 1) { // skip the frame's left border
+                    if (hmask1[0] & x) {
+                        if (hmask2[0] & x) {
+                            av_assert2(l[8] == L);
+                            s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
+                        }
+                    } else if (hm2 & x) {
+                        L  = l[8]; // row below: its values go into bits 8-15
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
+                                               [!!(hmask2[1] & x)]
+                                               [0](ptr, ls_y, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[!!(hmask1[1] & x)]
+                                            [0](ptr, ls_y, E, I, H);
+                    }
+                }
+            } else if (hm2 & x) { // only the lower of the two rows has an edge
+                int L = l[8], H = L >> 4;
+                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                if (col || x > 1) {
+                    s->dsp.loop_filter_8[!!(hmask2[1] & x)]
+                                        [0](ptr + 8 * ls_y, ls_y, E, I, H);
+                }
+            }
+            if (hm13 & x) { // inner edge, 4 px into the block (mask index 3)
+                int L = *l, H = L >> 4;
+                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                if (hm23 & x) {
+                    L  = l[8];
+                    H |= (L >> 4) << 8;
+                    E |= s->filter.mblim_lut[L] << 8;
+                    I |= s->filter.lim_lut[L] << 8;
+                    s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
+                } else {
+                    s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
+                }
+            } else if (hm23 & x) {
+                int L = l[8], H = L >> 4;
+                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
+            }
+        }
+    }
+
+    //                                          block1
+    // filter edges between rows, Y plane (e.g. ------)
+    //                                          block2
+    dst = s->cur_frame->data[0] + yoff;
+    lvl = lflvl->level;
+    for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
+        uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
+        unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
+
+        for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) { // 2 columns per pass
+            if (row || y) { // skip the frame's top border
+                if (vm & x) {
+                    int L = *l, H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    if (vmask[0] & x) {
+                        if (vmask[0] & (x << 1)) {
+                            av_assert2(l[1] == L);
+                            s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
+                        }
+                    } else if (vm & (x << 1)) {
+                        L  = l[1]; // column to the right: values go into bits 8-15
+                        H |= (L >> 4) << 8;
+                        E |= s->filter.mblim_lut[L] << 8;
+                        I |= s->filter.lim_lut[L] << 8;
+                        s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
+                                               [!!(vmask[1] & (x << 1))]
+                                               [1](ptr, ls_y, E, I, H);
+                    } else {
+                        s->dsp.loop_filter_8[!!(vmask[1] & x)]
+                                            [1](ptr, ls_y, E, I, H);
+                    }
+                } else if (vm & (x << 1)) {
+                    int L = l[1], H = L >> 4;
+                    int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                    s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
+                                        [1](ptr + 8, ls_y, E, I, H);
+                }
+            }
+            if (vm3 & x) { // inner edge, 4 rows into the block (mask index 3)
+                int L = *l, H = L >> 4;
+                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                if (vm3 & (x << 1)) {
+                    L  = l[1];
+                    H |= (L >> 4) << 8;
+                    E |= s->filter.mblim_lut[L] << 8;
+                    I |= s->filter.lim_lut[L] << 8;
+                    s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
+                } else {
+                    s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
+                }
+            } else if (vm3 & (x << 1)) {
+                int L = l[1], H = L >> 4;
+                int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
+
+                s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
+            }
+        }
+    }
+
+    // same principle but for U/V planes
+    for (p = 0; p < 2; p++) {
+        lvl = lflvl->level;
+        dst = s->cur_frame->data[1 + p] + uvoff;
+        for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
+            uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
+            uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
+            unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
+            unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
+
+            for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) { // 4 chroma px per bit
+                if (col || x > 1) {
+                    if (hm1 & x) {
+                        int L = *l, H = L >> 4;
+                        int E = s->filter.mblim_lut[L];
+                        int I = s->filter.lim_lut[L];
+
+                        if (hmask1[0] & x) {
+                            if (hmask2[0] & x) {
+                                av_assert2(l[16] == L);
+                                s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
+                            } else {
+                                s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
+                            }
+                        } else if (hm2 & x) {
+                            L  = l[16];
+                            H |= (L >> 4) << 8;
+                            E |= s->filter.mblim_lut[L] << 8;
+                            I |= s->filter.lim_lut[L] << 8;
+                            s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
+                                                   [!!(hmask2[1] & x)]
+                                                   [0](ptr, ls_uv, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[!!(hmask1[1] & x)]
+                                                [0](ptr, ls_uv, E, I, H);
+                        }
+                    } else if (hm2 & x) {
+                        int L = l[16], H = L >> 4;
+                        int E = s->filter.mblim_lut[L];
+                        int I = s->filter.lim_lut[L];
+
+                        s->dsp.loop_filter_8[!!(hmask2[1] & x)]
+                                            [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
+                    }
+                }
+                if (x & 0xAA) // after every 2nd step (8 chroma px) move 2 level entries
+                    l += 2;
+            }
+        }
+        lvl = lflvl->level;
+        dst = s->cur_frame->data[1 + p] + uvoff;
+        for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
+            uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
+            unsigned vm = vmask[0] | vmask[1] | vmask[2];
+
+            for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
+                if (row || y) {
+                    if (vm & x) {
+                        int L = *l, H = L >> 4;
+                        int E = s->filter.mblim_lut[L];
+                        int I = s->filter.lim_lut[L];
+
+                        if (vmask[0] & x) {
+                            if (vmask[0] & (x << 2)) {
+                                av_assert2(l[2] == L);
+                                s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
+                            } else {
+                                s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
+                            }
+                        } else if (vm & (x << 2)) {
+                            L  = l[2];
+                            H |= (L >> 4) << 8;
+                            E |= s->filter.mblim_lut[L] << 8;
+                            I |= s->filter.lim_lut[L] << 8;
+                            s->dsp.loop_filter_mix2[!!(vmask[1] &  x)]
+                                                   [!!(vmask[1] & (x << 2))]
+                                                   [1](ptr, ls_uv, E, I, H);
+                        } else {
+                            s->dsp.loop_filter_8[!!(vmask[1] & x)]
+                                                [1](ptr, ls_uv, E, I, H);
+                        }
+                    } else if (vm & (x << 2)) {
+                        int L = l[2], H = L >> 4;
+                        int E = s->filter.mblim_lut[L];
+                        int I = s->filter.lim_lut[L];
+
+                        s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
+                                            [1](ptr + 8, ls_uv, E, I, H);
+                    }
+                }
+            }
+            if (y & 1) // after every 2nd 4-px chroma row skip two rows of levels
+                lvl += 16;
+        }
+    }
+}
+
+static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
+{
+    /* Part idx of a (1 << log2_n)-way split of n superblocks, with both
+     * bounds clamped to n; << 3 turns superblocks into units of 8 pixels. */
+    *start = FFMIN((idx      * n) >> log2_n, n) << 3;
+    *end   = FFMIN(((idx + 1) * n) >> log2_n, n) << 3;
+}
+
+static int vp9_decode_frame(AVCodecContext *avctx, AVFrame *frame,
+                            int *got_frame, const uint8_t *data, int size)
+{
+    VP9Context *s = avctx->priv_data;
+    int ret, tile_row, tile_col, i, ref = -1, row, col;
+    ptrdiff_t yoff = 0, uvoff = 0;
+    /* Decode one coded frame from data[0..size) into frame; 0 or an error. */
+    ret = decode_frame_header(avctx, data, size, &ref);
+    if (ret < 0) {
+        return ret;
+    } else if (!ret) { // nothing to decode: output reference slot 'ref' as is
+        if (!s->refs[ref]->buf[0]) {
+            av_log(avctx, AV_LOG_ERROR,
+                   "Requested reference %d not available\n", ref);
+            return AVERROR_INVALIDDATA;
+        }
+
+        ret = av_frame_ref(frame, s->refs[ref]);
+        if (ret < 0)
+            return ret;
+        *got_frame = 1;
+        return 0;
+    }
+    data += ret; // a positive return is the number of header bytes consumed
+    size -= ret;
+
+    s->cur_frame = frame;
+
+    av_frame_unref(s->cur_frame);
+    if ((ret = ff_get_buffer(avctx, s->cur_frame,
+                             s->refreshrefmask ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
+        return ret;
+    s->cur_frame->key_frame = s->keyframe;
+    s->cur_frame->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
+                                          : AV_PICTURE_TYPE_P;
+
+    // main tile decode loop
+    memset(s->above_partition_ctx, 0, s->cols); // reset all "above" contexts
+    memset(s->above_skip_ctx, 0, s->cols);
+    if (s->keyframe || s->intraonly)
+        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
+    else
+        memset(s->above_mode_ctx, NEARESTMV, s->cols);
+    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
+    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
+    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
+    memset(s->above_segpred_ctx, 0, s->cols);
+    for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
+        set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
+                        tile_row, s->tiling.log2_tile_rows, s->sb_rows);
+        for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
+            int64_t tile_size;
+
+            if (tile_col == s->tiling.tile_cols - 1 &&
+                tile_row == s->tiling.tile_rows - 1) {
+                tile_size = size; // the last tile takes all remaining bytes
+            } else {
+                tile_size = AV_RB32(data); // 32-bit big-endian size prefix
+                data     += 4;
+                size     -= 4;
+            }
+            if (tile_size > size)
+                return AVERROR_INVALIDDATA;
+            ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
+            if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) // marker bit
+                return AVERROR_INVALIDDATA;
+            data += tile_size;
+            size -= tile_size;
+        }
+
+        for (row = s->tiling.tile_row_start;
+             row < s->tiling.tile_row_end;
+             row += 8, yoff += s->cur_frame->linesize[0] * 64,
+             uvoff += s->cur_frame->linesize[1] * 32) { // row/col count 8-px units
+            VP9Filter *lflvl = s->lflvl;
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+            for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
+                set_tile_offset(&s->tiling.tile_col_start,
+                                &s->tiling.tile_col_end,
+                                tile_col, s->tiling.log2_tile_cols, s->sb_cols);
+
+                memset(s->left_partition_ctx, 0, 8); // "left" contexts restart per tile
+                memset(s->left_skip_ctx, 0, 8);
+                if (s->keyframe || s->intraonly)
+                    memset(s->left_mode_ctx, DC_PRED, 16);
+                else
+                    memset(s->left_mode_ctx, NEARESTMV, 8);
+                memset(s->left_y_nnz_ctx, 0, 16);
+                memset(s->left_uv_nnz_ctx, 0, 16);
+                memset(s->left_segpred_ctx, 0, 8);
+
+                memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c)); // resume this tile's coder
+                for (col = s->tiling.tile_col_start;
+                     col < s->tiling.tile_col_end;
+                     col += 8, yoff2 += 64, uvoff2 += 32, lflvl++) {
+                    // FIXME integrate with lf code (i.e. zero after each
+                    // use, similar to invtxfm coefficients, or similar)
+                    memset(lflvl->mask, 0, sizeof(lflvl->mask));
+
+                    if ((ret = decode_subblock(avctx, row, col, lflvl,
+                                               yoff2, uvoff2, BL_64X64)) < 0)
+                        return ret;
+                }
+                memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c)); // save it for the next row
+            }
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0],
+                       s->cur_frame->data[0] + yoff +
+                       63 * s->cur_frame->linesize[0],
+                       8 * s->cols);
+                memcpy(s->intra_pred_data[1],
+                       s->cur_frame->data[1] + uvoff +
+                       31 * s->cur_frame->linesize[1],
+                       4 * s->cols);
+                memcpy(s->intra_pred_data[2],
+                       s->cur_frame->data[2] + uvoff +
+                       31 * s->cur_frame->linesize[2],
+                       4 * s->cols);
+            }
+
+            // loopfilter one row
+            if (s->filter.level) {
+                yoff2  = yoff;
+                uvoff2 = uvoff;
+                lflvl  = s->lflvl;
+                for (col = 0; col < s->cols;
+                     col += 8, yoff2 += 64, uvoff2 += 32, lflvl++)
+                    loopfilter_subblock(avctx, lflvl, row, col, yoff2, uvoff2);
+            }
+        }
+    }
+
+    // bw adaptivity (or in case of parallel decoding mode, fw adaptivity
+    // probability maintenance between frames)
+    if (s->refreshctx) {
+        if (s->parallelmode) {
+            memcpy(s->prob_ctx[s->framectxid].coef, s->prob.coef,
+                   sizeof(s->prob.coef));
+            s->prob_ctx[s->framectxid].p = s->prob.p;
+        } else {
+            ff_vp9_adapt_probs(s);
+        }
+    }
+    FFSWAP(VP9MVRefPair *, s->mv[0], s->mv[1]); // NOTE(review): presumably cur <-> prev MVs
+
+    // ref frame setup
+    for (i = 0; i < 8; i++)
+        if (s->refreshrefmask & (1 << i)) { // bit i set: slot i now refers to this frame
+            av_frame_unref(s->refs[i]);
+            ret = av_frame_ref(s->refs[i], s->cur_frame);
+            if (ret < 0)
+                return ret;
+        }
+
+    if (s->invisible) // decoded only for reference use, nothing is output
+        av_frame_unref(s->cur_frame);
+    else
+        *got_frame = 1;
+
+    return 0;
+}
+
+/* Decode one packet: either a single frame or a superframe of several frames. */
+static int vp9_decode_packet(AVCodecContext *avctx, void *frame,
+                             int *got_frame, AVPacket *avpkt)
+{
+    const uint8_t *data = avpkt->data;
+    int size            = avpkt->size, marker, ret;
+
+    /* Read superframe index - this is a collection of individual frames
+     * that together lead to one visible frame */
+    marker = size > 0 ? data[size - 1] : 0; // empty packet: no byte to inspect
+    if ((marker & 0xe0) == 0xc0) {
+        int nbytes   = 1 + ((marker >> 3) & 0x3); // bytes per frame size (1-4)
+        int n_frames = 1 + (marker & 0x7);        // frames in the index (1-8)
+        int idx_sz   = 2 + n_frames * nbytes;     // marker at both ends + sizes
+
+        if (size >= idx_sz && data[size - idx_sz] == marker) {
+            const uint8_t *idx = data + size + 1 - idx_sz;
+
+            while (n_frames--) {
+                int sz = AV_RL32(idx);
+
+                if (nbytes < 4)
+                    sz &= (1 << (8 * nbytes)) - 1;
+                idx += nbytes;
+
+                if (sz < 0 || sz > size) { // 4-byte sizes >= 2^31 turn negative
+                    av_log(avctx, AV_LOG_ERROR,
+                           "Superframe packet size too big: %d > %d\n",
+                           sz, size);
+                    return AVERROR_INVALIDDATA;
+                }
+
+                ret = vp9_decode_frame(avctx, frame, got_frame, data, sz);
+                if (ret < 0)
+                    return ret;
+                data += sz;
+                size -= sz;
+            }
+            return size; // bytes of the packet not consumed by the frames
+        }
+    }
+
+    /* If we get here, there was no valid superframe index, i.e. this is just
+     * one whole single frame. Decode it as such from the complete input buf. */
+    if ((ret = vp9_decode_frame(avctx, frame, got_frame, data, size)) < 0)
+        return ret;
+    return size;
+}
+
+static av_cold int vp9_decode_free(AVCodecContext *avctx)
+{
+    VP9Context *ctx = avctx->priv_data;
+    AVFrame **slot  = ctx->refs;
+    AVFrame **stop  = slot + FF_ARRAY_ELEMS(ctx->refs);
+
+    while (slot < stop)          // release every reference frame slot in order
+        av_frame_free(slot++);
+    av_freep(&ctx->c_b);         // range coder array (indexed by tile column)
+    av_freep(&ctx->above_partition_ctx);
+
+    return 0;
+}
+
+static av_cold int vp9_decode_init(AVCodecContext *avctx)
+{
+    VP9Context *ctx = avctx->priv_data;
+    int slot = 0;
+
+    avctx->pix_fmt = AV_PIX_FMT_YUV420P; // planar 4:2:0
+    ff_vp9dsp_init(&ctx->dsp);
+    ff_videodsp_init(&ctx->vdsp, 8);
+
+    /* One empty AVFrame per reference slot; on failure release whatever was
+     * allocated so far through the regular close function. */
+    while (slot < FF_ARRAY_ELEMS(ctx->refs)) {
+        AVFrame *fresh = av_frame_alloc();
+        ctx->refs[slot++] = fresh;
+        if (!fresh) {
+            vp9_decode_free(avctx);
+            return AVERROR(ENOMEM);
+        }
+    }
+    ctx->filter.sharpness = -1;
+    return 0;
+}
+
+AVCodec ff_vp9_decoder = { // codec descriptor of the VP9 decoder
+    .name           = "vp9",
+    .long_name      = NULL_IF_CONFIG_SMALL("Google VP9"),
+    .type           = AVMEDIA_TYPE_VIDEO,
+    .id             = AV_CODEC_ID_VP9,
+    .priv_data_size = sizeof(VP9Context),  // storage behind avctx->priv_data
+    .init           = vp9_decode_init,
+    .decode         = vp9_decode_packet,   // handles superframes, then frames
+    .flush          = vp9_decode_flush,    // defined outside this section
+    .close          = vp9_decode_free,     // also called when init fails
+    .capabilities   = CODEC_CAP_DR1,
+};
diff --git a/libavcodec/vp9.h b/libavcodec/vp9.h
new file mode 100644
index 0000000000..0a6c6eed42
--- /dev/null
+++ b/libavcodec/vp9.h
@@ -0,0 +1,419 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9_H
+#define AVCODEC_VP9_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/internal.h"
+
+#include "avcodec.h"
+#include "vp56.h"
+
+enum TxfmMode {
+    TX_4X4,
+    TX_8X8,
+    TX_16X16,
+    TX_32X32,
+    N_TXFM_SIZES,                 // = 4, number of fixed transform sizes
+    TX_SWITCHABLE = N_TXFM_SIZES, // shares the value 4 with N_TXFM_SIZES
+    N_TXFM_MODES                  // = 5, the sizes plus TX_SWITCHABLE
+};
+
+enum TxfmType { // pair of 1-D transforms; second index of VP9DSPContext.itxfm_add
+    DCT_DCT,
+    DCT_ADST,
+    ADST_DCT,
+    ADST_ADST,
+    N_TXFM_TYPES // = 4
+};
+
+enum IntraPredMode { // second index of VP9DSPContext.intra_pred
+    VERT_PRED,
+    HOR_PRED,
+    DC_PRED,
+    DIAG_DOWN_LEFT_PRED,
+    DIAG_DOWN_RIGHT_PRED,
+    VERT_RIGHT_PRED,
+    HOR_DOWN_PRED,
+    VERT_LEFT_PRED,
+    HOR_UP_PRED,
+    TM_VP8_PRED,        // = 9; enum InterPredMode starts right after, at 10
+    LEFT_DC_PRED,       // = 10; from here on the values numerically overlap
+    TOP_DC_PRED,        //       enum InterPredMode (10..13)
+    DC_128_PRED,
+    DC_127_PRED,
+    DC_129_PRED,
+    N_INTRA_PRED_MODES  // = 15
+};
+
+enum FilterMode { // first four values follow dimension 2 of VP9DSPContext.mc
+    FILTER_8TAP_SMOOTH,
+    FILTER_8TAP_REGULAR,
+    FILTER_8TAP_SHARP,
+    FILTER_BILINEAR,
+    FILTER_SWITCHABLE, // = 4; has no entry of its own in VP9DSPContext.mc
+};
+
+enum BlockPartition { // how decode_subblock() (vp9.c) divides a block
+    PARTITION_NONE,    // [ ] <-.
+    PARTITION_H,       // [-]   |
+    PARTITION_V,       // [|]   |
+    PARTITION_SPLIT,   // [+] --'
+};
+
+enum InterPredMode { // numbered to continue after TM_VP8_PRED (9)
+    NEARESTMV = 10,
+    NEARMV    = 11,
+    ZEROMV    = 12,
+    NEWMV     = 13,
+};
+
+enum MVJoint { // NOTE(review): presumably which MV components are non-zero — confirm
+    MV_JOINT_ZERO,
+    MV_JOINT_H,
+    MV_JOINT_V,
+    MV_JOINT_HV,
+};
+
+typedef struct ProbContext { // probability set; see s->prob.p and s->prob_ctx[].p
+    uint8_t y_mode[4][9];
+    uint8_t uv_mode[10][9];
+    uint8_t filter[4][2];
+    uint8_t mv_mode[7][3];
+    uint8_t intra[4];
+    uint8_t comp[5];
+    uint8_t single_ref[5][2];
+    uint8_t comp_ref[5];
+    uint8_t tx32p[2][3];
+    uint8_t tx16p[2][2];
+    uint8_t tx8p[2];
+    uint8_t skip[3];
+    uint8_t mv_joint[3];
+    struct {
+        uint8_t sign;
+        uint8_t classes[10];
+        uint8_t class0;
+        uint8_t bits[10];
+        uint8_t class0_fp[2][3];
+        uint8_t fp[3];
+        uint8_t class0_hp;      // the frame header updates class0_hp and hp
+        uint8_t hp;             // only when s->highprecisionmvs is set
+    } mv_comp[2];               // NOTE(review): presumably one set per MV component
+    uint8_t partition[4][4][3]; // [block level][above/left context][probability]
+} ProbContext;
+
+typedef void (*vp9_mc_func)(uint8_t *dst, const uint8_t *ref,
+                            ptrdiff_t dst_stride,
+                            ptrdiff_t ref_stride, // element type of VP9DSPContext.mc
+                            int h, int mx, int my); // NOTE(review): mx/my look like subpel x/y
+
+typedef struct VP9DSPContext {
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32
+     * dimension 2: intra prediction modes
+     *
+     * dst/left/top is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * top[-1] is top/left; top[4,7] is top-right for 4x4
+     */
+    // FIXME(rbultje) maybe replace left/top pointers with HAVE_TOP/
+    // HAVE_LEFT/HAVE_TOPRIGHT flags instead, and then handle it in-place?
+    // also needs to fit in with what h264/vp8/etc do
+    void (*intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst,
+                                                         ptrdiff_t stride,
+                                                         const uint8_t *left,
+                                                         const uint8_t *top);
+
+    /*
+     * dimension 1: 0=4x4, 1=8x8, 2=16x16, 3=32x32, 4=lossless (3-4=dct only)
+     * dimension 2: 0=dct/dct, 1=dct/adst, 2=adst/dct, 3=adst/adst
+     *
+     * dst is aligned by transform-size (i.e. 4, 8, 16 or 32 pixels)
+     * stride is aligned by 16 pixels
+     * block is 16-byte aligned
+     * eob indicates the position (+1) of the last non-zero coefficient,
+     * in scan-order. This can be used to write faster versions, e.g. a
+     * dc-only 4x4/8x8/16x16/32x32, or a 4x4-only (eob<10) 8x8/16x16/32x32,
+     * etc.
+     */
+    // FIXME also write idct_add_block() versions for whole (inter) pred
+    // blocks, so we can do 2 4x4s at once
+    void (*itxfm_add[N_TXFM_SIZES + 1][N_TXFM_TYPES])(uint8_t *dst,
+                                                      ptrdiff_t stride,
+                                                      int16_t *block, int eob);
+
+    /*
+     * dimension 1: width of filter (0=4, 1=8, 2=16)
+     * dimension 2: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by 8
+     */
+    void (*loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride,
+                                int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * The width of filter is assumed to be 16; dst/stride are aligned by 16
+     */
+    void (*loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride,
+                              int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1/2: width of filter (0=4, 1=8) for each filter half
+     * dimension 3: 0=col-edge filter (h), 1=row-edge filter (v)
+     *
+     * dst/stride are aligned by operation size
+     * this basically calls loop_filter_8[d1][d3](), followed by
+     * loop_filter_8[d2][d3]() on the next 8 pixels
+     * mb_lim/lim/hev_thr contain two values in the lowest two bytes of the
+     * integer.
+     */
+    // FIXME perhaps a mix4 that operates on 32px (for AVX2)
+    void (*loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride,
+                                      int mb_lim, int lim, int hev_thr);
+
+    /*
+     * dimension 1: hsize (0: 64, 1: 32, 2: 16, 3: 8, 4: 4)
+     * dimension 2: filter type (0: smooth, 1: regular, 2: sharp, 3: bilin)
+     * dimension 3: averaging type (0: put, 1: avg)
+     * dimension 4: x subpel interpolation (0: none, 1: 8tap/bilin)
+     * dimension 5: y subpel interpolation (0: none, 1: 8tap/bilin)
+     *
+     * dst/stride are aligned by hsize
+     */
+    vp9_mc_func mc[5][4][2][2][2];
+} VP9DSPContext;
+
+enum CompPredMode { // NOTE(review): presumably single vs. compound reference use — confirm
+    PRED_SINGLEREF,
+    PRED_COMPREF,
+    PRED_SWITCHABLE,
+};
+
+typedef struct VP9MVRefPair { // element type of the s->mv[0] / s->mv[1] buffers
+    VP56mv mv[2];
+    int8_t ref[2]; // NOTE(review): presumably the reference used by mv[i] — confirm
+} VP9MVRefPair;
+
+typedef struct VP9Filter { // loop filter data of one 64x64 superblock
+    uint8_t level[8 * 8]; // filter level per 8x8 luma block, 8 entries per row
+    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
+                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
+} VP9Filter;
+
+enum BlockLevel { // recursion depth of decode_subblock(), which passes bl + 1 down
+    BL_64X64, // = 0, the level every superblock starts at
+    BL_32X32,
+    BL_16X16,
+    BL_8X8,   // = 3, decode_subblock() does not recurse below this level
+};
+
+enum BlockSize { // block dimensions, ordered from largest to smallest
+    BS_64x64,
+    BS_64x32,
+    BS_32x64,
+    BS_32x32,
+    BS_32x16,
+    BS_16x32,
+    BS_16x16,
+    BS_16x8,
+    BS_8x16,
+    BS_8x8,
+    BS_8x4,
+    BS_4x8,
+    BS_4x4,
+    N_BS_SIZES, // = 13
+};
+
+typedef struct VP9Block { // state of one block; VP9Context embeds one as 'b'
+    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
+    enum FilterMode filter;
+    VP56mv mv[4 /* b_idx */][2 /* ref */];
+    enum BlockSize bs;
+    enum TxfmMode tx, uvtx; // NOTE(review): presumably luma / chroma transform size
+
+    int row, row7, col, col7; // NOTE(review): row7/col7 presumably row & 7, col & 7
+    uint8_t *dst[3];
+    ptrdiff_t y_stride, uv_stride;
+} VP9Block;
+
+typedef struct VP9Context { // decoder-wide state handed to the block, mv and probability routines
+    VP9DSPContext dsp;
+    VideoDSPContext vdsp;
+    GetBitContext gb;
+    VP56RangeCoder c; // range coder that decode_mode() reads block symbols from
+    VP56RangeCoder *c_b;
+    unsigned c_b_size;
+    VP9Block b;
+
+    // bitstream header
+    uint8_t profile;
+    uint8_t keyframe, last_keyframe;
+    uint8_t invisible;
+    uint8_t use_last_frame_mvs;
+    uint8_t errorres;
+    uint8_t colorspace;
+    uint8_t fullrange;
+    uint8_t intraonly;
+    uint8_t resetctx;
+    uint8_t refreshrefmask;
+    uint8_t highprecisionmvs;
+    enum FilterMode filtermode;
+    uint8_t allowcompinter;
+    uint8_t fixcompref; // the fixed reference of a compound pair (stored into b->ref[] by decode_mode())
+    uint8_t refreshctx;
+    uint8_t parallelmode;
+    uint8_t framectxid;
+    uint8_t refidx[3];
+    uint8_t signbias[3]; // signbias[fixcompref] selects which b->ref[] slot holds the fixed reference
+    uint8_t varcompref[2]; // the two candidate variable references; a coded bit picks one
+    AVFrame *refs[8];
+    AVFrame *cur_frame;
+
+    struct {
+        uint8_t level;
+        int8_t sharpness;
+        uint8_t lim_lut[64];
+        uint8_t mblim_lut[64];
+    } filter;
+    struct {
+        uint8_t enabled;
+        int8_t mode[2];
+        int8_t ref[4];
+    } lf_delta;
+    uint8_t yac_qi;
+    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
+    uint8_t lossless;
+    struct {
+        uint8_t enabled;
+        uint8_t temporal;
+        uint8_t absolute_vals;
+        uint8_t update_map;
+        struct {
+            uint8_t q_enabled;
+            uint8_t lf_enabled;
+            uint8_t ref_enabled;
+            uint8_t skip_enabled;
+            uint8_t ref_val; // 0 marks the segment intra; otherwise ref_val - 1 is used as b->ref[0]
+            int16_t q_val;
+            int8_t lf_val;
+            int16_t qmul[2][2];
+            uint8_t lflvl[4][2];
+        } feat[8];
+    } segmentation;
+    struct {
+        unsigned log2_tile_cols, log2_tile_rows;
+        unsigned tile_cols, tile_rows;
+        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    } tiling;
+    unsigned sb_cols, sb_rows, rows, cols; // NOTE(review): decode_mode() indexing suggests rows/cols count 8x8 blocks and sb_* count 64x64 blocks - confirm
+    struct {
+        ProbContext p;
+        uint8_t coef[4][2][2][6][6][3];
+    } prob_ctx[4];
+    struct {
+        ProbContext p;
+        uint8_t coef[4][2][2][6][6][11];
+        uint8_t seg[7];
+        uint8_t segpred[3];
+    } prob;
+    struct { // symbol counters, incremented as each symbol is decoded
+        unsigned y_mode[4][10];
+        unsigned uv_mode[10][10];
+        unsigned filter[4][3];
+        unsigned mv_mode[7][4];
+        unsigned intra[4][2];
+        unsigned comp[5][2];
+        unsigned single_ref[5][2][2];
+        unsigned comp_ref[5][2];
+        unsigned tx32p[2][4];
+        unsigned tx16p[2][3];
+        unsigned tx8p[2][2];
+        unsigned skip[3][2];
+        unsigned mv_joint[4];
+        struct {
+            unsigned sign[2];
+            unsigned classes[11];
+            unsigned class0[2];
+            unsigned bits[10][2];
+            unsigned class0_fp[2][4];
+            unsigned fp[4];
+            unsigned class0_hp[2];
+            unsigned hp[2];
+        } mv_comp[2];
+        unsigned partition[4][4][4];
+        unsigned coef[4][2][2][6][6][3];
+        unsigned eob[4][2][2][6][6][2];
+    } counts;
+    enum TxfmMode txfmmode;
+    enum CompPredMode comppredmode;
+
+    // contextual (left/above) cache
+    uint8_t left_partition_ctx[8], *above_partition_ctx;
+    uint8_t left_mode_ctx[16], *above_mode_ctx; // two entries per row7/col unit (indexed with row7 << 1 and col * 2)
+    // FIXME maybe merge some of the below in a flags field?
+    uint8_t left_y_nnz_ctx[16], *above_y_nnz_ctx;
+    uint8_t left_uv_nnz_ctx[2][8], *above_uv_nnz_ctx[2];
+    uint8_t left_skip_ctx[8], *above_skip_ctx; // 1bit
+    uint8_t left_txfm_ctx[8], *above_txfm_ctx; // 2bit
+    uint8_t left_segpred_ctx[8], *above_segpred_ctx; // 1bit
+    uint8_t left_intra_ctx[8], *above_intra_ctx; // 1bit
+    uint8_t left_comp_ctx[8], *above_comp_ctx; // 1bit
+    uint8_t left_ref_ctx[8], *above_ref_ctx; // 2bit
+    uint8_t left_filter_ctx[8], *above_filter_ctx;
+    VP56mv left_mv_ctx[16][2], (*above_mv_ctx)[2];
+
+    // whole-frame cache
+    uint8_t *intra_pred_data[3];
+    uint8_t *segmentation_map; // one segment id per (row, col) unit, row stride 8 * sb_cols
+    VP9MVRefPair *mv[2];
+    VP9Filter *lflvl;
+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71 * 80];
+
+    // block reconstruction intermediates
+    DECLARE_ALIGNED(32, int16_t, block)[4096];
+    DECLARE_ALIGNED(32, int16_t, uvblock)[2][1024];
+    uint8_t eob[256];
+    uint8_t uveob[2][64];
+    VP56mv min_mv, max_mv;
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32 * 32];
+} VP9Context;
+
+void ff_vp9dsp_init(VP9DSPContext *dsp);
+
+void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
+
+void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb); // sb: sub-block index 0..3, or -1 when called once for the whole block
+
+void ff_vp9_adapt_probs(VP9Context *s);
+
+int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
+                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
+                        enum BlockLevel bl, enum BlockPartition bp);
+
+#endif /* AVCODEC_VP9_H */
diff --git a/libavcodec/vp9block.c b/libavcodec/vp9block.c
new file mode 100644
index 0000000000..e6865934c4
--- /dev/null
+++ b/libavcodec/vp9block.c
@@ -0,0 +1,1684 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+
+#include "avcodec.h"
+#include "get_bits.h"
+#include "internal.h"
+#include "videodsp.h"
+#include "vp56.h"
+#include "vp9.h"
+#include "vp9data.h"
+
+static const uint8_t bwh_tab[2][N_BS_SIZES][2] = { // [unit][enum BlockSize][0 = width, 1 = height]
+    { // unit 0: dimensions in 4-pixel units
+        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
+        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
+    },  { // unit 1: dimensions in 8-pixel units, never less than 1
+        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
+        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+    }
+};
+
+// parse per-block mode info: segment id, skip, intra/inter, tx size, prediction modes, refs, filter and mvs
+static void decode_mode(VP9Context *s, VP9Block *const b)
+{
+    static const uint8_t left_ctx[N_BS_SIZES] = {
+        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
+    };
+    static const uint8_t above_ctx[N_BS_SIZES] = {
+        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
+    };
+    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
+        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
+        TX_16X16, TX_8X8,   TX_8X8,   TX_8X8,   TX_4X4,   TX_4X4,  TX_4X4
+    };
+    int row = b->row, col = b->col, row7 = b->row7;
+    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
+    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
+    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]);
+    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
+    int y;
+
+    if (!s->segmentation.enabled) {
+        b->seg_id = 0;
+    } else if (s->keyframe || s->intraonly) {
+        b->seg_id = s->segmentation.update_map ?
+                    vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree, s->prob.seg) : 0;
+    } else if (!s->segmentation.update_map ||
+               (s->segmentation.temporal &&
+                vp56_rac_get_prob_branchy(&s->c,
+                                          s->prob.segpred[s->above_segpred_ctx[col] +
+                                                          s->left_segpred_ctx[row7]]))) {
+        int pred = 8, x;
+
+        for (y = 0; y < h4; y++)
+            for (x = 0; x < w4; x++)
+                pred = FFMIN(pred,
+                             s->segmentation_map[(y + row) * 8 * s->sb_cols + x + col]);
+        b->seg_id = pred;
+
+        memset(&s->above_segpred_ctx[col], 1, w4);
+        memset(&s->left_segpred_ctx[row7], 1, h4);
+    } else {
+        b->seg_id = vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree,
+                                     s->prob.seg);
+
+        memset(&s->above_segpred_ctx[col], 0, w4);
+        memset(&s->left_segpred_ctx[row7], 0, h4);
+    }
+    if ((s->segmentation.enabled && s->segmentation.update_map) || s->keyframe) {
+        for (y = 0; y < h4; y++)
+            memset(&s->segmentation_map[(y + row) * 8 * s->sb_cols + col],
+                   b->seg_id, w4);
+    }
+
+    b->skip = s->segmentation.enabled &&
+              s->segmentation.feat[b->seg_id].skip_enabled;
+    if (!b->skip) {
+        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
+        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
+        s->counts.skip[c][b->skip]++;
+    }
+
+    if (s->keyframe || s->intraonly) {
+        b->intra = 1;
+    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
+        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
+    } else {
+        int c, bit;
+
+        if (have_a && have_l) {
+            c  = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
+            c += (c == 2);
+        } else {
+            c = have_a ? 2 * s->above_intra_ctx[col] :
+                have_l ? 2 * s->left_intra_ctx[row7] : 0;
+        }
+        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
+        s->counts.intra[c][bit]++;
+        b->intra = !bit;
+    }
+
+    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
+        int c;
+        if (have_a) {
+            if (have_l) {
+                c = (s->above_skip_ctx[col] ? max_tx :
+                     s->above_txfm_ctx[col]) +
+                    (s->left_skip_ctx[row7] ? max_tx :
+                     s->left_txfm_ctx[row7]) > max_tx;
+            } else {
+                c = s->above_skip_ctx[col] ? 1 :
+                    (s->above_txfm_ctx[col] * 2 > max_tx);
+            }
+        } else if (have_l) {
+            c = s->left_skip_ctx[row7] ? 1 :
+                (s->left_txfm_ctx[row7] * 2 > max_tx);
+        } else {
+            c = 1;
+        }
+        switch (max_tx) {
+        case TX_32X32:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
+            if (b->tx) {
+                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
+                if (b->tx == 2)
+                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
+            }
+            s->counts.tx32p[c][b->tx]++;
+            break;
+        case TX_16X16:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
+            if (b->tx)
+                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
+            s->counts.tx16p[c][b->tx]++;
+            break;
+        case TX_8X8:
+            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
+            s->counts.tx8p[c][b->tx]++;
+            break;
+        case TX_4X4:
+            b->tx = TX_4X4;
+            break;
+        }
+    } else {
+        b->tx = FFMIN(max_tx, s->txfmmode);
+    }
+
+    if (s->keyframe || s->intraonly) {
+        uint8_t *a = &s->above_mode_ctx[col * 2];
+        uint8_t *l = &s->left_mode_ctx[(row7) << 1];
+
+        b->comp = 0;
+        if (b->bs > BS_8x8) {
+            // FIXME the memory storage intermediates here aren't really
+            // necessary, they're just there to make the code slightly
+            // simpler for now
+            b->mode[0] =
+            a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                          ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                              ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
+                l[0]       =
+                a[1]       = b->mode[1];
+            } else {
+                l[0]       =
+                a[1]       =
+                b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] =
+                a[0]       = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                              ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                                  ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
+                    l[1]       =
+                    a[1]       = b->mode[3];
+                } else {
+                    l[1]       =
+                    a[1]       =
+                    b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                l[1]       =
+                a[1]       =
+                b->mode[3] = b->mode[1];
+            }
+        } else {
+            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                          ff_vp9_default_kf_ymode_probs[*a][*l]);
+            b->mode[3] =
+            b->mode[2] =
+            b->mode[1] = b->mode[0];
+            // FIXME this can probably be optimized
+            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
+            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
+        }
+        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                     ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
+    } else if (b->intra) {
+        b->comp = 0;
+        if (b->bs > BS_8x8) {
+            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                          s->prob.p.y_mode[0]);
+            s->counts.y_mode[0][b->mode[0]]++;
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                              s->prob.p.y_mode[0]);
+                s->counts.y_mode[0][b->mode[1]]++;
+            } else {
+                b->mode[1] = b->mode[0];
+            }
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                              s->prob.p.y_mode[0]);
+                s->counts.y_mode[0][b->mode[2]]++;
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                                  s->prob.p.y_mode[0]);
+                    s->counts.y_mode[0][b->mode[3]]++;
+                } else {
+                    b->mode[3] = b->mode[2];
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                b->mode[3] = b->mode[1];
+            }
+        } else {
+            static const uint8_t size_group[10] = {
+                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
+            };
+            int sz = size_group[b->bs];
+
+            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                          s->prob.p.y_mode[sz]);
+            b->mode[1] =
+            b->mode[2] =
+            b->mode[3] = b->mode[0];
+            s->counts.y_mode[sz][b->mode[3]]++;
+        }
+        b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
+                                     s->prob.p.uv_mode[b->mode[3]]);
+        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
+    } else {
+        static const uint8_t inter_mode_ctx_lut[14][14] = {
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
+            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
+        };
+
+        if (s->segmentation.feat[b->seg_id].ref_enabled) {
+            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
+            b->comp   = 0;
+            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
+        } else {
+            // read comp_pred flag
+            if (s->comppredmode != PRED_SWITCHABLE) {
+                b->comp = s->comppredmode == PRED_COMPREF;
+            } else {
+                int c;
+
+                // FIXME add intra as ref=0xff (or -1) to make these easier?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
+                            c = 4;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 2 + (s->left_intra_ctx[row7] ||
+                                     s->left_ref_ctx[row7] == s->fixcompref);
+                        } else if (s->left_comp_ctx[row7]) {
+                            c = 2 + (s->above_intra_ctx[col] ||
+                                     s->above_ref_ctx[col] == s->fixcompref);
+                        } else {
+                            c = (!s->above_intra_ctx[col] &&
+                                 s->above_ref_ctx[col] == s->fixcompref) ^
+                                (!s->left_intra_ctx[row7] &&
+                                 s->left_ref_ctx[row & 7] == s->fixcompref);
+                        }
+                    } else {
+                        c = s->above_comp_ctx[col] ? 3 :
+                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
+                    }
+                } else if (have_l) {
+                    c = s->left_comp_ctx[row7] ? 3 :
+                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
+                } else {
+                    c = 1;
+                }
+                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
+                s->counts.comp[c][b->comp]++;
+            }
+
+            // read actual references
+            // FIXME probably cache a few variables here to prevent repetitive
+            // memory accesses below
+            if (b->comp) { /* two references */
+                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
+
+                b->ref[fix_idx] = s->fixcompref;
+                // FIXME can this codeblob be replaced by some sort of LUT?
+                if (have_a) {
+                    if (have_l) {
+                        if (s->above_intra_ctx[col]) {
+                            if (s->left_intra_ctx[row7]) {
+                                c = 2;
+                            } else {
+                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                            }
+                        } else if (s->left_intra_ctx[row7]) {
+                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        } else {
+                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
+
+                            if (refl == refa && refa == s->varcompref[1]) {
+                                c = 0;
+                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
+                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
+                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
+                                    c = 4;
+                                } else {
+                                    c = (refa == refl) ? 3 : 1;
+                                }
+                            } else if (!s->left_comp_ctx[row7]) {
+                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refl == s->varcompref[1] &&
+                                         refa != s->varcompref[1]) ? 2 : 4;
+                                }
+                            } else if (!s->above_comp_ctx[col]) {
+                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
+                                    c = 1;
+                                } else {
+                                    c = (refa == s->varcompref[1] &&
+                                         refl != s->varcompref[1]) ? 2 : 4;
+                                }
+                            } else {
+                                c = (refl == refa) ? 4 : 2;
+                            }
+                        }
+                    } else {
+                        if (s->above_intra_ctx[col]) {
+                            c = 2;
+                        } else if (s->above_comp_ctx[col]) {
+                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        } else {
+                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
+                        }
+                    }
+                } else if (have_l) {
+                    if (s->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (s->left_comp_ctx[row7]) {
+                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                    } else {
+                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
+                b->ref[var_idx] = s->varcompref[bit];
+                s->counts.comp_ref[c][bit]++;
+            } else { /* single reference */
+                int bit, c;
+
+                if (have_a && !s->above_intra_ctx[col]) {
+                    if (have_l && !s->left_intra_ctx[row7]) {
+                        if (s->left_comp_ctx[row7]) {
+                            if (s->above_comp_ctx[col]) {
+                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
+                                         !s->above_ref_ctx[col]);
+                            } else {
+                                c = (3 * !s->above_ref_ctx[col]) +
+                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
+                            }
+                        } else if (s->above_comp_ctx[col]) {
+                            c = (3 * !s->left_ref_ctx[row7]) +
+                                (!s->fixcompref || !s->above_ref_ctx[col]);
+                        } else {
+                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
+                        }
+                    } else if (s->above_intra_ctx[col]) {
+                        c = 2;
+                    } else if (s->above_comp_ctx[col]) {
+                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
+                    } else {
+                        c = 4 * (!s->above_ref_ctx[col]);
+                    }
+                } else if (have_l && !s->left_intra_ctx[row7]) {
+                    if (s->left_intra_ctx[row7]) {
+                        c = 2;
+                    } else if (s->left_comp_ctx[row7]) {
+                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
+                    } else {
+                        c = 4 * (!s->left_ref_ctx[row7]);
+                    }
+                } else {
+                    c = 2;
+                }
+                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
+                s->counts.single_ref[c][0][bit]++;
+                if (!bit) {
+                    b->ref[0] = 0;
+                } else {
+                    // FIXME can this codeblob be replaced by some sort of LUT?
+                    if (have_a) {
+                        if (have_l) {
+                            if (s->left_intra_ctx[row7]) {
+                                if (s->above_intra_ctx[col]) {
+                                    c = 2;
+                                } else if (s->above_comp_ctx[col]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else if (!s->above_ref_ctx[col]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->above_intra_ctx[col]) {
+                                if (s->left_intra_ctx[row7]) {
+                                    c = 2;
+                                } else if (s->left_comp_ctx[row7]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                } else if (!s->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (s->above_comp_ctx[col]) {
+                                if (s->left_comp_ctx[row7]) {
+                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
+                                        c = 3 * (s->fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                    } else {
+                                        c = 2;
+                                    }
+                                } else if (!s->left_ref_ctx[row7]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->above_ref_ctx[col] == 1);
+                                } else {
+                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
+                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                                }
+                            } else if (s->left_comp_ctx[row7]) {
+                                if (!s->above_ref_ctx[col]) {
+                                    c = 1 + 2 * (s->fixcompref == 1 ||
+                                                 s->left_ref_ctx[row7] == 1);
+                                } else {
+                                    c = 3 * (s->above_ref_ctx[col] == 1) +
+                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->above_ref_ctx[col]) {
+                                if (!s->left_ref_ctx[row7]) {
+                                    c = 3;
+                                } else {
+                                    c = 4 * (s->left_ref_ctx[row7] == 1);
+                                }
+                            } else if (!s->left_ref_ctx[row7]) {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 2 * (s->left_ref_ctx[row7] == 1) +
+                                    2 * (s->above_ref_ctx[col] == 1);
+                            }
+                        } else {
+                            if (s->above_intra_ctx[col] ||
+                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
+                                c = 2;
+                            } else if (s->above_comp_ctx[col]) {
+                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
+                            } else {
+                                c = 4 * (s->above_ref_ctx[col] == 1);
+                            }
+                        }
+                    } else if (have_l) {
+                        if (s->left_intra_ctx[row7] ||
+                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
+                            c = 2;
+                        } else if (s->left_comp_ctx[row7]) {
+                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
+                        } else {
+                            c = 4 * (s->left_ref_ctx[row7] == 1);
+                        }
+                    } else {
+                        c = 2;
+                    }
+                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
+                    s->counts.single_ref[c][1][bit]++;
+                    b->ref[0] = 1 + bit;
+                }
+            }
+        }
+
+        if (b->bs <= BS_8x8) {
+            if (s->segmentation.feat[b->seg_id].skip_enabled) {
+                b->mode[0] =
+                b->mode[1] =
+                b->mode[2] =
+                b->mode[3] = ZEROMV;
+            } else {
+                static const uint8_t off[10] = {
+                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
+                };
+
+                // FIXME this needs to use the LUT tables from find_ref_mvs
+                // because not all are -1,0/0,-1
+                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
+                                          [s->left_mode_ctx[row7 + off[b->bs]]];
+
+                b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                b->mode[1] =
+                b->mode[2] =
+                b->mode[3] = b->mode[0];
+                s->counts.mv_mode[c][b->mode[0] - 10]++;
+            }
+        }
+
+        if (s->filtermode == FILTER_SWITCHABLE) {
+            int c;
+
+            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
+                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
+                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
+                        s->left_filter_ctx[row7] : 3;
+                } else {
+                    c = s->above_filter_ctx[col];
+                }
+            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
+                c = s->left_filter_ctx[row7];
+            } else {
+                c = 3;
+            }
+
+            b->filter = vp8_rac_get_tree(&s->c, ff_vp9_filter_tree,
+                                         s->prob.p.filter[c]);
+            s->counts.filter[c][b->filter]++;
+        } else {
+            b->filter = s->filtermode;
+        }
+
+        if (b->bs > BS_8x8) {
+            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
+
+            b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                                          s->prob.p.mv_mode[c]);
+            s->counts.mv_mode[c][b->mode[0] - 10]++;
+            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], 0);
+
+            if (b->bs != BS_8x4) {
+                b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                s->counts.mv_mode[c][b->mode[1] - 10]++;
+                ff_vp9_fill_mv(s, b->mv[1], b->mode[1], 1);
+            } else {
+                b->mode[1] = b->mode[0];
+                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            }
+
+            if (b->bs != BS_4x8) {
+                b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                                              s->prob.p.mv_mode[c]);
+                s->counts.mv_mode[c][b->mode[2] - 10]++;
+                ff_vp9_fill_mv(s, b->mv[2], b->mode[2], 2);
+
+                if (b->bs != BS_8x4) {
+                    b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
+                                                  s->prob.p.mv_mode[c]);
+                    s->counts.mv_mode[c][b->mode[3] - 10]++;
+                    ff_vp9_fill_mv(s, b->mv[3], b->mode[3], 3);
+                } else {
+                    b->mode[3] = b->mode[2];
+                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
+                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
+                }
+            } else {
+                b->mode[2] = b->mode[0];
+                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+                b->mode[3] = b->mode[1];
+                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
+                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
+            }
+        } else {
+            ff_vp9_fill_mv(s, b->mv[0], b->mode[0], -1);
+            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
+            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
+            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
+        }
+    }
+
+    // FIXME this can probably be optimized
+    memset(&s->above_skip_ctx[col], b->skip, w4);
+    memset(&s->left_skip_ctx[row7], b->skip, h4);
+    memset(&s->above_txfm_ctx[col], b->tx, w4);
+    memset(&s->left_txfm_ctx[row7], b->tx, h4);
+    memset(&s->above_partition_ctx[col], above_ctx[b->bs], w4);
+    memset(&s->left_partition_ctx[row7], left_ctx[b->bs], h4);
+    if (!s->keyframe && !s->intraonly) {
+        memset(&s->above_intra_ctx[col], b->intra, w4);
+        memset(&s->left_intra_ctx[row7], b->intra, h4);
+        memset(&s->above_comp_ctx[col], b->comp, w4);
+        memset(&s->left_comp_ctx[row7], b->comp, h4);
+        memset(&s->above_mode_ctx[col], b->mode[3], w4);
+        memset(&s->left_mode_ctx[row7], b->mode[3], h4);
+        if (s->filtermode == FILTER_SWITCHABLE && !b->intra) {
+            memset(&s->above_filter_ctx[col], b->filter, w4);
+            memset(&s->left_filter_ctx[row7], b->filter, h4);
+            b->filter = ff_vp9_filter_lut[b->filter];
+        }
+        if (b->bs > BS_8x8) {
+            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
+            AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
+            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
+            AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
+            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
+            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
+        } else {
+            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
+
+            for (n = 0; n < w4 * 2; n++) {
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
+                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
+            }
+            for (n = 0; n < h4 * 2; n++) {
+                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
+                AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
+            }
+        }
+
+        if (!b->intra) { // FIXME write 0xff or -1 if intra, so we can use this
+                         // as a direct check in above branches
+            int vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
+
+            memset(&s->above_ref_ctx[col], vref, w4);
+            memset(&s->left_ref_ctx[row7], vref, h4);
+        }
+    }
+
+    // FIXME kinda ugly
+    for (y = 0; y < h4; y++) {
+        int x, o = (row + y) * s->sb_cols * 8 + col;
+
+        if (b->intra) {
+            for (x = 0; x < w4; x++) {
+                s->mv[0][o + x].ref[0] =
+                s->mv[0][o + x].ref[1] = -1;
+            }
+        } else if (b->comp) {
+            for (x = 0; x < w4; x++) {
+                s->mv[0][o + x].ref[0] = b->ref[0];
+                s->mv[0][o + x].ref[1] = b->ref[1];
+                AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
+                AV_COPY32(&s->mv[0][o + x].mv[1], &b->mv[3][1]);
+            }
+        } else {
+            for (x = 0; x < w4; x++) {
+                s->mv[0][o + x].ref[0] = b->ref[0];
+                s->mv[0][o + x].ref[1] = -1;
+                AV_COPY32(&s->mv[0][o + x].mv[0], &b->mv[3][0]);
+            }
+        }
+    }
+}
+
+/*
+ * Decode the coefficient tokens of one transform block from range coder c.
+ *
+ * Positions are visited in scan order. Every nonzero value gets a sign read
+ * with vp8_rac_get(), is multiplied by qmul[0] at the first position or by
+ * qmul[1] at any later one (and halved for TX_32X32), and is stored at
+ * coef[scan[i]]. The eob[] and cnt[] counters are incremented for each
+ * end-of-block and token decision, indexed by the current band and the
+ * neighbour context nnz; cnt[..][0], [1] and [2] count zero, one and
+ * larger-than-one tokens respectively.
+ *
+ * Returns the number of scan positions consumed: the position at which the
+ * end-of-block flag was read, or n_coeffs if none was read.
+ *
+ * FIXME remove tx argument, and merge cnt/eob arguments?
+ */
+static int decode_block_coeffs(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
+                               enum TxfmMode tx, unsigned (*cnt)[6][3],
+                               unsigned (*eob)[6][2], uint8_t(*p)[6][11],
+                               int nnz, const int16_t *scan,
+                               const int16_t(*nb)[2],
+                               const int16_t *band_counts, const int16_t *qmul)
+{
+    int i = 0, band = 0, band_left = band_counts[band];
+    uint8_t *tp = p[0][nnz]; // probabilities for band 0 and the caller's nnz
+    uint8_t cache[1024];     // magnitude class (0..5) per coefficient position
+
+    do {
+        int val, rc;
+
+        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
+        eob[band][nnz][val]++;
+        if (!val)
+            break;
+
+skip_eob: // re-entered after a zero token, without reading another eob flag
+        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
+            cnt[band][nnz][0]++;
+            if (!--band_left) // band exhausted: move on to the next one
+                band_left = band_counts[++band];
+            cache[scan[i]] = 0;
+            nnz            = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+            tp             = p[band][nnz]; // nnz: rounded-up mean of two cached classes
+            if (++i == n_coeffs)
+                break;  //invalid input; blocks should end with EOB
+            goto skip_eob;
+        }
+
+        rc = scan[i]; // position of this coefficient inside coef[] and cache[]
+        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
+            cnt[band][nnz][1]++;
+            val       = 1;
+            cache[rc] = 1;
+        } else {
+            // fill in p[3-10] (model fill) - only once per frame for each pos
+            if (!tp[3]) // a zero tp[3] is taken to mean "not filled in yet"
+                memcpy(&tp[3], ff_vp9_model_pareto8[tp[2]], 8);
+
+            cnt[band][nnz][2]++;
+            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
+                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
+                    cache[rc] = val = 2;
+                } else {
+                    val       = 3 + vp56_rac_get_prob(c, tp[5]);
+                    cache[rc] = 3;
+                }
+            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
+                cache[rc] = 4;
+                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
+                    val  =  vp56_rac_get_prob(c, 159) + 5;
+                } else {
+                    val  = (vp56_rac_get_prob(c, 165) << 1) + 7;
+                    val +=  vp56_rac_get_prob(c, 145);
+                }
+            } else { // cat 3-6
+                cache[rc] = 5;
+                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
+                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
+                        val  = (vp56_rac_get_prob(c, 173) << 2) + 11;
+                        val += (vp56_rac_get_prob(c, 148) << 1);
+                        val +=  vp56_rac_get_prob(c, 140);
+                    } else {
+                        val  = (vp56_rac_get_prob(c, 176) << 3) + 19;
+                        val += (vp56_rac_get_prob(c, 155) << 2);
+                        val += (vp56_rac_get_prob(c, 140) << 1);
+                        val +=  vp56_rac_get_prob(c, 135);
+                    }
+                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
+                    val  = (vp56_rac_get_prob(c, 180) << 4) + 35;
+                    val += (vp56_rac_get_prob(c, 157) << 3);
+                    val += (vp56_rac_get_prob(c, 141) << 2);
+                    val += (vp56_rac_get_prob(c, 134) << 1);
+                    val +=  vp56_rac_get_prob(c, 130);
+                } else { // largest category: 14 extra bits on top of the base 67
+                    val  = (vp56_rac_get_prob(c, 254) << 13) + 67;
+                    val += (vp56_rac_get_prob(c, 254) << 12);
+                    val += (vp56_rac_get_prob(c, 254) << 11);
+                    val += (vp56_rac_get_prob(c, 252) << 10);
+                    val += (vp56_rac_get_prob(c, 249) << 9);
+                    val += (vp56_rac_get_prob(c, 243) << 8);
+                    val += (vp56_rac_get_prob(c, 230) << 7);
+                    val += (vp56_rac_get_prob(c, 196) << 6);
+                    val += (vp56_rac_get_prob(c, 177) << 5);
+                    val += (vp56_rac_get_prob(c, 153) << 4);
+                    val += (vp56_rac_get_prob(c, 140) << 3);
+                    val += (vp56_rac_get_prob(c, 133) << 2);
+                    val += (vp56_rac_get_prob(c, 130) << 1);
+                    val +=  vp56_rac_get_prob(c, 129);
+                }
+            }
+        }
+        if (!--band_left) // band exhausted: move on to the next one
+            band_left = band_counts[++band];
+        if (tx == TX_32X32) // FIXME slow
+            coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
+        else // qmul[0] applies to i == 0 only, qmul[1] to every later position
+            coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
+        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
+        tp  = p[band][nnz]; // probabilities for the next position
+    } while (++i < n_coeffs);
+
+    return i;
+}
+
+/**
+ * OR the nnz context entries of every step-sized group into the group's
+ * first entry, for all groups starting below end.
+ */
+static void vp9_nnz_ctx_merge(uint8_t *ctx, int end, int step)
+{
+    int base, k;
+
+    for (base = 0; base < end; base += step) {
+        uint8_t merged = ctx[base];
+
+        for (k = 1; k < step; k++)
+            merged |= ctx[base + k];
+        ctx[base] = merged;
+    }
+}
+
+/**
+ * Copy the first nnz context entry of every step-sized group to the rest
+ * of that group, without writing at or past end.
+ */
+static void vp9_nnz_ctx_spread(uint8_t *ctx, int end, int step)
+{
+    int base;
+
+    for (base = 0; base < end; base += step)
+        memset(&ctx[base + 1], ctx[base], FFMIN(end - base - 1, step - 1));
+}
+
+/**
+ * Decode the coefficients of every transform block of the current block
+ * (s->b): first all luma blocks, then the U plane, then the V plane, each
+ * in raster order and clipped to the s->cols / s->rows bounds.
+ *
+ * For each transform block the value returned by decode_block_coeffs() is
+ * stored in s->eob[] / s->uveob[][] and its "is nonzero" flag is written
+ * to the above/left nnz context arrays.
+ *
+ * @return 0 on success, or a negative value propagated from
+ *         decode_block_coeffs()
+ */
+static int decode_coeffs(AVCodecContext *avctx)
+{
+    static const int16_t band_counts[4][8] = {
+        { 1, 2, 3, 4,  3,   16 - 13, 0 },
+        { 1, 2, 3, 4, 11,   64 - 21, 0 },
+        { 1, 2, 3, 4, 11,  256 - 21, 0 },
+        { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+    };
+    VP9Context *s     = avctx->priv_data;
+    VP9Block *const b = &s->b;
+    const int row = b->row, col = b->col;
+    const int is_inter     = !b->intra;
+    const int y_step1d     = 1 << b->tx;
+    const int y_step       = 1 << (b->tx * 2);
+    const int uv_step1d    = 1 << b->uvtx;
+    const int uv_step      = 1 << (b->uvtx * 2);
+    const int scan_tx      = 4 * s->lossless + b->tx;
+    // only 4x4 transforms inside sub-8x8 blocks have per-sub-block modes
+    const int per_sub_mode = b->tx == TX_4X4 && b->bs > BS_8x8;
+    int16_t (*qmul)[2]        = s->segmentation.feat[b->seg_id].qmul;
+    const int16_t **yscans    = ff_vp9_scans[scan_tx];
+    const int16_t (**ynbs)[2] = ff_vp9_scans_nb[scan_tx];
+    const int16_t *uvscan     = ff_vp9_scans[b->uvtx][DCT_DCT];
+    const int16_t (*uvnb)[2]  = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
+    uint8_t (*probs)[6][11];
+    unsigned (*tok_cnt)[6][3];
+    unsigned (*eob_cnt)[6][2];
+    uint8_t *above, *left;
+    int end_x = FFMIN(2 * (s->cols - col), bwh_tab[1][b->bs][0] << 1);
+    int end_y = FFMIN(2 * (s->rows - row), bwh_tab[1][b->bs][1] << 1);
+    int blk, plane, x, y, ret;
+
+    /* y tokens */
+    probs   = s->prob.coef[b->tx][0 /* y */][is_inter];
+    tok_cnt = s->counts.coef[b->tx][0 /* y */][is_inter];
+    eob_cnt = s->counts.eob[b->tx][0 /* y */][is_inter];
+    above   = &s->above_y_nnz_ctx[col * 2];
+    left    = &s->left_y_nnz_ctx[(row & 7) << 1];
+
+    if (b->tx > TX_4X4) { // FIXME slow
+        vp9_nnz_ctx_merge(left,  end_y, y_step1d);
+        vp9_nnz_ctx_merge(above, end_x, y_step1d);
+    }
+
+    blk = 0;
+    for (y = 0; y < end_y; y += y_step1d) {
+        for (x = 0; x < end_x; x += y_step1d) {
+            int mode_idx       = per_sub_mode ? blk : 0;
+            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[mode_idx]];
+            int nnz            = above[x] + left[y];
+
+            ret = decode_block_coeffs(&s->c, s->block + 16 * blk, 16 * y_step,
+                                      b->tx, tok_cnt, eob_cnt, probs, nnz,
+                                      yscans[txtp], ynbs[txtp],
+                                      band_counts[b->tx], qmul[0]);
+            if (ret < 0)
+                return ret;
+
+            above[x] = left[y] = !!ret;
+            if (b->tx > TX_8X8)
+                AV_WN16A(&s->eob[blk], ret);
+            else
+                s->eob[blk] = ret;
+
+            blk += y_step;
+        }
+    }
+
+    if (b->tx > TX_4X4) { // FIXME slow
+        vp9_nnz_ctx_spread(left,  end_y, y_step1d);
+        vp9_nnz_ctx_spread(above, end_x, y_step1d);
+    }
+
+    /* uv tokens: half the luma extent in both directions */
+    probs   = s->prob.coef[b->uvtx][1 /* uv */][is_inter];
+    tok_cnt = s->counts.coef[b->uvtx][1 /* uv */][is_inter];
+    eob_cnt = s->counts.eob[b->uvtx][1 /* uv */][is_inter];
+    end_x >>= 1;
+    end_y >>= 1;
+
+    for (plane = 0; plane < 2; plane++) {
+        above = &s->above_uv_nnz_ctx[plane][col];
+        left  = &s->left_uv_nnz_ctx[plane][row & 7];
+
+        if (b->uvtx > TX_4X4) { // FIXME slow
+            vp9_nnz_ctx_merge(left,  end_y, uv_step1d);
+            vp9_nnz_ctx_merge(above, end_x, uv_step1d);
+        }
+
+        blk = 0;
+        for (y = 0; y < end_y; y += uv_step1d) {
+            for (x = 0; x < end_x; x += uv_step1d) {
+                int nnz = above[x] + left[y];
+
+                ret = decode_block_coeffs(&s->c, s->uvblock[plane] + 16 * blk,
+                                          16 * uv_step, b->uvtx, tok_cnt,
+                                          eob_cnt, probs, nnz, uvscan, uvnb,
+                                          band_counts[b->uvtx], qmul[1]);
+                if (ret < 0)
+                    return ret;
+
+                above[x] = left[y] = !!ret;
+                if (b->uvtx > TX_8X8)
+                    AV_WN16A(&s->uveob[plane][blk], ret);
+                else
+                    s->uveob[plane][blk] = ret;
+
+                blk += uv_step;
+            }
+        }
+
+        if (b->uvtx > TX_4X4) { // FIXME slow
+            vp9_nnz_ctx_spread(left,  end_y, uv_step1d);
+            vp9_nnz_ctx_spread(above, end_x, uv_step1d);
+        }
+    }
+
+    return 0;
+}
+/*
+ * Substitute an intra prediction mode according to which neighbouring
+ * edges are available, and prepare the top (*a) and left (l) edge pixels
+ * that the substituted mode needs.
+ *
+ * mode must be one of the first 10 modes (the range mode_conv[] covers).
+ * dst_edge/stride_edge and dst_inner/stride_inner are two views of the
+ * destination; which one is read depends on x and y (see below). col/row
+ * locate the block, x/y the transform block inside it, w is the block
+ * width compared against x, tx the transform size and p the plane index
+ * (0 selects the wider luma extent in the n_px_have computations).
+ *
+ * If the top edge can be used in place, *a is redirected to it. Otherwise
+ * the buffer *a pointed to on entry is filled: 127 when there is no row
+ * above, and the last available pixel repeated when fewer than the needed
+ * pixels exist. (*a)[-1] is written for modes that need the top-left
+ * pixel, so that buffer must have room in front of *a. A missing left
+ * column is filled with 129.
+ *
+ * Returns the mode to use for prediction, which may be one of the
+ * LEFT_DC/TOP_DC/DC_127/DC_128/DC_129 substitutes.
+ */
+static av_always_inline int check_intra_mode(VP9Context *s, int mode,
+                                             uint8_t **a,
+                                             uint8_t *dst_edge,
+                                             ptrdiff_t stride_edge,
+                                             uint8_t *dst_inner,
+                                             ptrdiff_t stride_inner,
+                                             uint8_t *l, int col, int x, int w,
+                                             int row, int y, enum TxfmMode tx,
+                                             int p)
+{
+    int have_top   = row > 0 || y > 0;
+    int have_left  = col > s->tiling.tile_col_start || x > 0;
+    int have_right = x < w - 1;
+    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
+        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
+                                   { DC_127_PRED,          VERT_PRED            } },
+        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
+                                   { HOR_PRED,             HOR_PRED             } },
+        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
+                                   { LEFT_DC_PRED,         DC_PRED              } },
+        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
+                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
+        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
+                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
+        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
+                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
+        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
+                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
+        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
+                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
+        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
+                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
+        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
+                                   { HOR_PRED,             TM_VP8_PRED          } },
+    };
+    static const struct { // which edge pixels each (substituted) mode reads
+        uint8_t needs_left:1;
+        uint8_t needs_top:1;
+        uint8_t needs_topleft:1;
+        uint8_t needs_topright:1;
+    } edges[N_INTRA_PRED_MODES] = {
+        [VERT_PRED]            = { .needs_top  = 1 },
+        [HOR_PRED]             = { .needs_left = 1 },
+        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
+        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
+        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
+        [HOR_UP_PRED]          = { .needs_left = 1 },
+        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
+                                   .needs_topleft = 1 },
+        [LEFT_DC_PRED]         = { .needs_left = 1 },
+        [TOP_DC_PRED]          = { .needs_top  = 1 },
+        [DC_128_PRED]          = { 0 },
+        [DC_127_PRED]          = { 0 },
+        [DC_129_PRED]          = { 0 }
+    };
+
+    av_assert2(mode >= 0 && mode < 10); // mode_conv[] has exactly 10 rows
+    mode = mode_conv[mode][have_left][have_top];
+    if (edges[mode].needs_top) {
+        uint8_t *top = NULL, *topleft = NULL;
+        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
+        int n_px_need_tr = 0;
+
+        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
+            n_px_need_tr = 4; // 4 more pixels to the right of the top edge
+
+        // if top of sb64-row, use s->intra_pred_data[] instead of
+        // dst[-stride] for intra prediction (it contains pre- instead of
+        // post-loopfilter data)
+        if (have_top) {
+            top = !(row & 7) && !y ?
+                  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
+                  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
+            if (have_left) // topleft[-1] is the pixel left of the row above
+                topleft = !(row & 7) && !y ?
+                          s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
+                          y == 0 || x == 0 ? &dst_edge[-stride_edge] :
+                          &dst_inner[-stride_inner];
+        }
+
+        if (have_top &&
+            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
+            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
+            n_px_need + n_px_need_tr <= n_px_have) {
+            *a = top; // everything needed is readable in place: no copy
+        } else {
+            if (have_top) {
+                if (n_px_need <= n_px_have) {
+                    memcpy(*a, top, n_px_need);
+                } else {
+                    memcpy(*a, top, n_px_have); // then pad with the last pixel
+                    memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
+                           n_px_need - n_px_have);
+                }
+            } else {
+                memset(*a, 127, n_px_need); // no row above: constant 127
+            }
+            if (edges[mode].needs_topleft) {
+                if (have_left && have_top)
+                    (*a)[-1] = topleft[-1];
+                else
+                    (*a)[-1] = have_top ? 129 : 127;
+            }
+            if (tx == TX_4X4 && edges[mode].needs_topright) {
+                if (have_top && have_right &&
+                    n_px_need + n_px_need_tr <= n_px_have) {
+                    memcpy(&(*a)[4], &top[4], 4);
+                } else {
+                    memset(&(*a)[4], (*a)[3], 4); // repeat the last top pixel
+                }
+            }
+        }
+    }
+    if (edges[mode].needs_left) {
+        if (have_left) {
+            int i;
+            int n_px_need = 4 << tx;
+            int n_px_have = (((s->rows - row) << !p) - y) * 4;
+            uint8_t *dst     = x == 0 ? dst_edge : dst_inner;
+            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
+
+            if (n_px_need <= n_px_have) {
+                for (i = 0; i < n_px_need; i++)
+                    l[i] = dst[i * stride - 1]; // pixel left of row i
+            } else {
+                for (i = 0; i < n_px_have; i++)
+                    l[i] = dst[i * stride - 1];
+                memset(&l[i], l[i - 1], n_px_need - n_px_have); // pad with last pixel
+            }
+        } else {
+            memset(l, 129, 4 << tx); // no left column: constant 129
+        }
+    }
+
+    return mode;
+}
+
+/**
+ * Intra-predict and reconstruct the current block (s->b), one transform
+ * block at a time: luma first, then the U and V planes.
+ *
+ * For every transform block the mode is passed through check_intra_mode(),
+ * the matching s->dsp.intra_pred function writes the prediction to b->dst[],
+ * and, when the stored eob is nonzero, s->dsp.itxfm_add adds the residual.
+ *
+ * @param y_off  byte offset of the block inside s->cur_frame->data[0]
+ * @param uv_off byte offset of the block inside s->cur_frame->data[1] and [2]
+ */
+static void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
+{
+    VP9Context *s     = avctx->priv_data;
+    VP9Block *const b = &s->b;
+    const int row = b->row, col = b->col;
+    const int y_step1d     = 1 << b->tx;
+    const int y_step       = 1 << (b->tx * 2);
+    const int uv_step1d    = 1 << b->uvtx;
+    const int uv_step      = 1 << (b->uvtx * 2);
+    const int y_itx        = 4 * s->lossless + b->tx;
+    const int uv_itx       = b->uvtx + 4 * s->lossless;
+    // only 4x4 transforms inside sub-8x8 blocks have per-sub-block modes
+    const int per_sub_mode = b->bs > BS_8x8 && b->tx == TX_4X4;
+    int w4    = bwh_tab[1][b->bs][0] << 1;
+    int h4    = bwh_tab[1][b->bs][1] << 1;
+    int end_x = FFMIN(2 * (s->cols - col), w4);
+    int end_y = FFMIN(2 * (s->rows - row), h4);
+    uint8_t *dst_row, *edge_row;
+    int n, x, y, p;
+
+    // Y
+    dst_row  = b->dst[0];
+    edge_row = s->cur_frame->data[0] + y_off;
+    n        = 0;
+    for (y = 0; y < end_y; y += y_step1d) {
+        for (x = 0; x < end_x; x += y_step1d) {
+            uint8_t *out  = dst_row  + 4 * x;
+            uint8_t *edge = edge_row + 4 * x;
+            int mode      = b->mode[per_sub_mode ? y * 2 + x : 0];
+            LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
+            uint8_t *a = &a_buf[16], l[32];
+            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
+            int eob;
+
+            if (b->tx > TX_8X8)
+                eob = AV_RN16A(&s->eob[n]);
+            else
+                eob = s->eob[n];
+
+            mode = check_intra_mode(s, mode, &a, edge,
+                                    s->cur_frame->linesize[0],
+                                    out, b->y_stride, l,
+                                    col, x, w4, row, y, b->tx, 0);
+            s->dsp.intra_pred[b->tx][mode](out, b->y_stride, l, a);
+            if (eob)
+                s->dsp.itxfm_add[y_itx][txtp](out, b->y_stride,
+                                              s->block + 16 * n, eob);
+
+            n += y_step;
+        }
+        edge_row += 4 * s->cur_frame->linesize[0] * y_step1d;
+        dst_row  += 4 * b->y_stride * y_step1d;
+    }
+
+    // U/V
+    w4    >>= 1;
+    end_x >>= 1;
+    end_y >>= 1;
+    for (p = 0; p < 2; p++) {
+        dst_row  = b->dst[1 + p];
+        edge_row = s->cur_frame->data[1 + p] + uv_off;
+        n        = 0;
+        for (y = 0; y < end_y; y += uv_step1d) {
+            for (x = 0; x < end_x; x += uv_step1d) {
+                uint8_t *out  = dst_row  + 4 * x;
+                uint8_t *edge = edge_row + 4 * x;
+                int mode      = b->uvmode;
+                LOCAL_ALIGNED_16(uint8_t, a_buf, [48]);
+                uint8_t *a = &a_buf[16], l[32];
+                int eob;
+
+                if (b->uvtx > TX_8X8)
+                    eob = AV_RN16A(&s->uveob[p][n]);
+                else
+                    eob = s->uveob[p][n];
+
+                mode = check_intra_mode(s, mode, &a, edge,
+                                        s->cur_frame->linesize[1],
+                                        out, b->uv_stride, l,
+                                        col, x, w4, row, y, b->uvtx, p + 1);
+                s->dsp.intra_pred[b->uvtx][mode](out, b->uv_stride, l, a);
+                if (eob)
+                    s->dsp.itxfm_add[uv_itx][DCT_DCT](out, b->uv_stride,
+                                                      s->uvblock[p] + 16 * n,
+                                                      eob);
+
+                n += uv_step;
+            }
+            edge_row += 4 * uv_step1d * s->cur_frame->linesize[1];
+            dst_row  += 4 * uv_step1d * b->uv_stride;
+        }
+    }
+}
+
+/**
+ * Motion-compensate one luma block from ref into dst.
+ *
+ * The motion vector is split into a whole-pixel part (mv >> 3), which is
+ * added to the x/y position, and a fractional part (mv & 7). When the
+ * filter footprint (3 pixels before and 4 after the block in each
+ * direction that has a fractional part) does not fit inside the w x h
+ * reference, the source is first copied into s->edge_emu_buffer by
+ * s->vdsp.emulated_edge_mc() using a stride of 80.
+ *
+ * mc is indexed by [has fractional x][has fractional y]; the fractions are
+ * passed to it doubled.
+ */
+static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
+                                         uint8_t *dst, ptrdiff_t dst_stride,
+                                         const uint8_t *ref,
+                                         ptrdiff_t ref_stride,
+                                         ptrdiff_t y, ptrdiff_t x,
+                                         const VP56mv *mv,
+                                         int bw, int bh, int w, int h)
+{
+    const int frac_x = mv->x & 7, frac_y = mv->y & 7;
+    const int sub_x  = !!frac_x,  sub_y  = !!frac_y;
+    int outside;
+
+    // move to the whole-pixel position addressed by the vector
+    y   += mv->y >> 3;
+    x   += mv->x >> 3;
+    ref += y * ref_stride + x;
+
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    outside = x < sub_x * 3 || y < sub_y * 3 ||
+              x + sub_x * 4 > w - bw || y + sub_y * 4 > h - bh;
+    if (outside) {
+        const uint8_t *src_start = ref - sub_y * 3 * ref_stride - sub_x * 3;
+
+        s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src_start,
+                                 80, ref_stride,
+                                 bw + sub_x * 7, bh + sub_y * 7,
+                                 x - sub_x * 3, y - sub_y * 3, w, h);
+        ref_stride = 80;
+        ref        = s->edge_emu_buffer + sub_y * 3 * 80 + sub_x * 3;
+    }
+
+    mc[sub_x][sub_y](dst, ref, dst_stride, ref_stride, bh,
+                     frac_x << 1, frac_y << 1);
+}
+
+/**
+ * Motion-compensate one chroma block pair (U, then V) with a shared vector.
+ *
+ * The motion vector is split into a whole-pixel part (mv >> 4), which is
+ * added to the x/y position, and a fractional part (mv & 15) that is
+ * passed to mc unchanged. When the filter footprint (3 pixels before and
+ * 4 after the block in each direction that has a fractional part) does not
+ * fit inside the w x h reference, each plane is first copied into
+ * s->edge_emu_buffer by s->vdsp.emulated_edge_mc() using a stride of 80.
+ * The buffer is shared, so every plane is filtered before the next one is
+ * copied.
+ *
+ * mc is indexed by [has fractional x][has fractional y].
+ */
+static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func(*mc)[2],
+                                           uint8_t *dst_u, uint8_t *dst_v,
+                                           ptrdiff_t dst_stride,
+                                           const uint8_t *ref_u,
+                                           ptrdiff_t src_stride_u,
+                                           const uint8_t *ref_v,
+                                           ptrdiff_t src_stride_v,
+                                           ptrdiff_t y, ptrdiff_t x,
+                                           const VP56mv *mv,
+                                           int bw, int bh, int w, int h)
+{
+    const int frac_x = mv->x & 15, frac_y = mv->y & 15;
+    const int sub_x  = !!frac_x,   sub_y  = !!frac_y;
+    uint8_t *const dsts[2]       = { dst_u, dst_v };
+    const uint8_t *const refs[2] = { ref_u, ref_v };
+    const ptrdiff_t strides[2]   = { src_stride_u, src_stride_v };
+    int outside, plane;
+
+    // move to the whole-pixel position addressed by the vector
+    y += mv->y >> 4;
+    x += mv->x >> 4;
+
+    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
+    outside = x < sub_x * 3 || y < sub_y * 3 ||
+              x + sub_x * 4 > w - bw || y + sub_y * 4 > h - bh;
+
+    for (plane = 0; plane < 2; plane++) {
+        ptrdiff_t src_stride = strides[plane];
+        const uint8_t *src   = refs[plane] + y * src_stride + x;
+
+        if (outside) {
+            s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
+                                     src - sub_y * 3 * src_stride - sub_x * 3,
+                                     80,
+                                     src_stride,
+                                     bw + sub_x * 7, bh + sub_y * 7,
+                                     x - sub_x * 3, y - sub_y * 3, w, h);
+            src        = s->edge_emu_buffer + sub_y * 3 * 80 + sub_x * 3;
+            src_stride = 80;
+        }
+
+        mc[sub_x][sub_y](dsts[plane], src, dst_stride, src_stride, bh,
+                         frac_x, frac_y);
+    }
+}
+
+static int inter_recon(AVCodecContext *avctx)
+{
+    static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
+        { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
+        { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
+    };
+    VP9Context *s = avctx->priv_data;
+    VP9Block *const b = &s->b;
+    int row = b->row, col = b->col;
+    AVFrame *ref1 = s->refs[s->refidx[b->ref[0]]];
+    AVFrame *ref2 = b->comp ? s->refs[s->refidx[b->ref[1]]] : NULL;
+    int w = avctx->width, h = avctx->height;
+    ptrdiff_t ls_y = b->y_stride, ls_uv = b->uv_stride;
+
+    if (!ref1->data[0] || (b->comp && !ref2->data[0]))
+        return AVERROR_INVALIDDATA;
+
+    // y inter pred
+    if (b->bs > BS_8x8) {
+        if (b->bs == BS_8x4) {
+            mc_luma_dir(s, s->dsp.mc[3][b->filter][0], b->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        row << 3, col << 3, &b->mv[0][0], 8, 4, w, h);
+            mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
+                        b->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w, h);
+
+            if (b->comp) {
+                mc_luma_dir(s, s->dsp.mc[3][b->filter][1], b->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            row << 3, col << 3, &b->mv[0][1], 8, 4, w, h);
+                mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
+                            b->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w, h);
+            }
+        } else if (b->bs == BS_4x8) {
+            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        row << 3, col << 3, &b->mv[0][0], 4, 8, w, h);
+            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w, h);
+
+            if (b->comp) {
+                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            row << 3, col << 3, &b->mv[0][1], 4, 8, w, h);
+                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w, h);
+            }
+        } else {
+            av_assert2(b->bs == BS_4x4);
+
+            // FIXME if two horizontally adjacent blocks have the same MV,
+            // do a w8 instead of a w4 call
+            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0], ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        row << 3, col << 3, &b->mv[0][0], 4, 4, w, h);
+            mc_luma_dir(s, s->dsp.mc[4][b->filter][0], b->dst[0] + 4, ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w, h);
+            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
+                        b->dst[0] + 4 * ls_y, ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w, h);
+            mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
+                        b->dst[0] + 4 * ls_y + 4, ls_y,
+                        ref1->data[0], ref1->linesize[0],
+                        (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w, h);
+
+            if (b->comp) {
+                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0], ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            row << 3, col << 3, &b->mv[0][1], 4, 4, w, h);
+                mc_luma_dir(s, s->dsp.mc[4][b->filter][1], b->dst[0] + 4, ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w, h);
+                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
+                            b->dst[0] + 4 * ls_y, ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w, h);
+                mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
+                            b->dst[0] + 4 * ls_y + 4, ls_y,
+                            ref2->data[0], ref2->linesize[0],
+                            (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w, h);
+            }
+        }
+    } else {
+        int bwl = bwlog_tab[0][b->bs];
+        int bw  = bwh_tab[0][b->bs][0] * 4;
+        int bh  = bwh_tab[0][b->bs][1] * 4;
+
+        mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], b->dst[0], ls_y,
+                    ref1->data[0], ref1->linesize[0],
+                    row << 3, col << 3, &b->mv[0][0], bw, bh, w, h);
+
+        if (b->comp)
+            mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], b->dst[0], ls_y,
+                        ref2->data[0], ref2->linesize[0],
+                        row << 3, col << 3, &b->mv[0][1], bw, bh, w, h);
+    }
+
+    // uv inter pred
+    {
+        int bwl = bwlog_tab[1][b->bs];
+        int bw  = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
+        VP56mv mvuv;
+
+        w = (w + 1) >> 1;
+        h = (h + 1) >> 1;
+        if (b->bs > BS_8x8) {
+            mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x +
+                                 b->mv[2][0].x + b->mv[3][0].x, 4);
+            mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y +
+                                 b->mv[2][0].y + b->mv[3][0].y, 4);
+        } else {
+            mvuv = b->mv[0][0];
+        }
+
+        mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
+                      b->dst[1], b->dst[2], ls_uv,
+                      ref1->data[1], ref1->linesize[1],
+                      ref1->data[2], ref1->linesize[2],
+                      row << 2, col << 2, &mvuv, bw, bh, w, h);
+
+        if (b->comp) {
+            if (b->bs > BS_8x8) {
+                mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x +
+                                     b->mv[2][1].x + b->mv[3][1].x, 4);
+                mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y +
+                                     b->mv[2][1].y + b->mv[3][1].y, 4);
+            } else {
+                mvuv = b->mv[0][1];
+            }
+            mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
+                          b->dst[1], b->dst[2], ls_uv,
+                          ref2->data[1], ref2->linesize[1],
+                          ref2->data[2], ref2->linesize[2],
+                          row << 2, col << 2, &mvuv, bw, bh, w, h);
+        }
+    }
+
+    if (!b->skip) {
+        /* mostly copied from intra_recon() */
+
+        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
+        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
+        int end_x = FFMIN(2 * (s->cols - col), w4);
+        int end_y = FFMIN(2 * (s->rows - row), h4);
+        int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
+        int uvstep1d = 1 << b->uvtx, p;
+        uint8_t *dst = b->dst[0];
+
+        // y itxfm add
+        for (n = 0, y = 0; y < end_y; y += step1d) {
+            uint8_t *ptr = dst;
+            for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
+                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
+
+                if (eob)
+                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, b->y_stride,
+                                                  s->block + 16 * n, eob);
+            }
+            dst += 4 * b->y_stride * step1d;
+        }
+
+        // uv itxfm add
+        h4    >>= 1;
+        w4    >>= 1;
+        end_x >>= 1;
+        end_y >>= 1;
+        step    = 1 << (b->uvtx * 2);
+        for (p = 0; p < 2; p++) {
+            dst = b->dst[p + 1];
+            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
+                uint8_t *ptr = dst;
+                for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
+                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n])
+                                               : s->uveob[p][n];
+                    if (eob)
+                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, b->uv_stride,
+                                                        s->uvblock[p] + 16 * n, eob);
+                }
+                dst += 4 * uvstep1d * b->uv_stride;
+            }
+        }
+    }
+    return 0;
+}
+
+static av_always_inline void mask_edges(VP9Filter *lflvl, int is_uv,
+                                        int row_and_7, int col_and_7,
+                                        int w, int h, int col_end, int row_end,
+                                        enum TxfmMode tx, int skip_inter)
+{
+    // Accumulate (OR) this block's loopfilter edge bits into lflvl->mask[is_uv].
+    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
+    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
+    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
+    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
+    // The intended behaviour of the vp9 loopfilter is to work on 8-pixel
+    // edges. This means that for UV, we work on two subsampled blocks at
+    // a time, and we only use the topleft block's mode information to set
+    // things like block strength. Thus, for any block size smaller than
+    // 16x16, ignore the odd portion of the block.
+    if (tx == TX_4X4 && is_uv) {
+        if (h == 1) {
+            if (row_and_7 & 1)
+                return;
+            if (!row_end)
+                h += 1;
+        }
+        if (w == 1) {
+            if (col_and_7 & 1)
+                return;
+            if (!col_end)
+                w += 1;
+        }
+    }
+
+    if (tx == TX_4X4 && !skip_inter) {
+        int t = 1 << col_and_7, m_col = (t << w) - t, y; // m_col: w bits from col_and_7 up
+        int m_col_odd = (t << (w - 1)) - t; // m_col without its top bit
+
+        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
+        if (is_uv) {
+            int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
+
+            for (y = row_and_7; y < h + row_and_7; y++) {
+                int col_mask_id = 2 - !(y & 7); // 1 when (y & 7) == 0, else 2
+
+                lflvl->mask[is_uv][0][y][1] |= m_row_8;
+                lflvl->mask[is_uv][0][y][2] |= m_row_4;
+                // for odd lines, if the odd col is not being filtered,
+                // skip odd row also:
+                // .---. <-- a
+                // |   |
+                // |___| <-- b
+                // ^   ^
+                // c   d
+                //
+                // if a/c are even row/col and b/d are odd, and d is skipped,
+                // e.g. right edge of size-66x66.webm, then skip b also (bug)
+                if ((col_end & 1) && (y & 1)) {
+                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
+                } else {
+                    lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
+                }
+            }
+        } else {
+            int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
+
+            for (y = row_and_7; y < h + row_and_7; y++) {
+                int col_mask_id = 2 - !(y & 3); // 1 when (y & 3) == 0, else 2
+
+                lflvl->mask[is_uv][0][y][1]           |= m_row_8; // row edge
+                lflvl->mask[is_uv][0][y][2]           |= m_row_4;
+                lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
+                lflvl->mask[is_uv][0][y][3]           |= m_col;
+                lflvl->mask[is_uv][1][y][3]           |= m_col;
+            }
+        }
+    } else {
+        int y, t = 1 << col_and_7, m_col = (t << w) - t;
+
+        if (!skip_inter) {
+            int mask_id = (tx == TX_8X8);
+            int l2 = tx + is_uv - 1, step1d = 1 << l2;
+            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 }; // every 1st/2nd/4th/8th bit
+            int m_row = m_col & masks[l2];
+
+            // at odd UV col/row edges with tx16/tx32 loopfilter edges, force
+            // the 8wd loopfilter to prevent going off the visible edge.
+            if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) { // true only for odd w
+                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
+                int m_row_8  = m_row - m_row_16;
+
+                for (y = row_and_7; y < h + row_and_7; y++) {
+                    lflvl->mask[is_uv][0][y][0] |= m_row_16;
+                    lflvl->mask[is_uv][0][y][1] |= m_row_8;
+                }
+            } else {
+                for (y = row_and_7; y < h + row_and_7; y++)
+                    lflvl->mask[is_uv][0][y][mask_id] |= m_row;
+            }
+
+            if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) { // true only for odd h
+                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
+                    lflvl->mask[is_uv][1][y][0] |= m_col;
+                if (y - row_and_7 == h - 1)
+                    lflvl->mask[is_uv][1][y][1] |= m_col;
+            } else {
+                for (y = row_and_7; y < h + row_and_7; y += step1d)
+                    lflvl->mask[is_uv][1][y][mask_id] |= m_col;
+            }
+        } else if (tx != TX_4X4) { // skip_inter set: only first row / first column bits
+            int mask_id;
+
+            mask_id = (tx == TX_8X8) || (is_uv && h == 1);
+            lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
+            mask_id = (tx == TX_8X8) || (is_uv && w == 1);
+            for (y = row_and_7; y < h + row_and_7; y++)
+                lflvl->mask[is_uv][0][y][mask_id] |= t;
+        } else if (is_uv) { // skip_inter set, 4x4 transform, chroma
+            int t8 = t & 0x01, t4 = t - t8;
+
+            for (y = row_and_7; y < h + row_and_7; y++) {
+                lflvl->mask[is_uv][0][y][2] |= t4;
+                lflvl->mask[is_uv][0][y][1] |= t8;
+            }
+            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
+        } else { // skip_inter set, 4x4 transform, luma
+            int t8 = t & 0x11, t4 = t - t8;
+
+            for (y = row_and_7; y < h + row_and_7; y++) {
+                lflvl->mask[is_uv][0][y][2] |= t4;
+                lflvl->mask[is_uv][0][y][1] |= t8;
+            }
+            lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
+        }
+    }
+}
+
+int ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
+                        VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
+                        enum BlockLevel bl, enum BlockPartition bp)
+{
+    VP9Context *s = avctx->priv_data;
+    VP9Block *const b = &s->b;
+    enum BlockSize bs = bl * 3 + bp;
+    int ret, y, w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
+    int emu[2];
+    // decode mode info and coefficients, reconstruct, then set loopfilter data
+    b->row  = row;
+    b->row7 = row & 7;
+    b->col  = col;
+    b->col7 = col & 7;
+
+    s->min_mv.x = -(128 + col * 64);
+    s->min_mv.y = -(128 + row * 64);
+    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
+    s->max_mv.y = 128 + (s->rows - row - h4) * 64;
+
+    b->bs = bs;
+    decode_mode(s, b);
+    b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
+
+    if (!b->skip) {
+        if ((ret = decode_coeffs(avctx)) < 0)
+            return ret;
+    } else {
+        int pl;
+
+        memset(&s->above_y_nnz_ctx[col * 2], 0, w4 * 2);
+        memset(&s->left_y_nnz_ctx[(row & 7) << 1], 0, h4 * 2);
+        for (pl = 0; pl < 2; pl++) {
+            memset(&s->above_uv_nnz_ctx[pl][col], 0, w4);
+            memset(&s->left_uv_nnz_ctx[pl][row & 7], 0, h4);
+        }
+    }
+
+    /* Use emulated overhangs if the stride of the target buffer can't hold
+     * them. This allows supporting emu-edge and so on even if we have large
+     * block overhangs. */
+    emu[0] = (col + w4) * 8 > s->cur_frame->linesize[0] ||
+             (row + h4) > s->rows + 2 * !(avctx->flags & CODEC_FLAG_EMU_EDGE);
+    emu[1] = (col + w4) * 4 > s->cur_frame->linesize[1] ||
+             (row + h4) > s->rows + 2 * !(avctx->flags & CODEC_FLAG_EMU_EDGE);
+    if (emu[0]) {
+        b->dst[0]   = s->tmp_y;
+        b->y_stride = 64; // s->tmp_y is addressed with a 64-byte stride
+    } else {
+        b->dst[0]   = s->cur_frame->data[0] + yoff;
+        b->y_stride = s->cur_frame->linesize[0];
+    }
+    if (emu[1]) {
+        b->dst[1]    = s->tmp_uv[0];
+        b->dst[2]    = s->tmp_uv[1];
+        b->uv_stride = 32; // s->tmp_uv[] is addressed with a 32-byte stride
+    } else {
+        b->dst[1]    = s->cur_frame->data[1] + uvoff;
+        b->dst[2]    = s->cur_frame->data[2] + uvoff;
+        b->uv_stride = s->cur_frame->linesize[1];
+    }
+    if (b->intra) {
+        intra_recon(avctx, yoff, uvoff);
+    } else {
+        if ((ret = inter_recon(avctx)) < 0)
+            return ret;
+    }
+    if (emu[0]) { // transfer the in-frame part of tmp_y to the frame
+        int w = FFMIN(s->cols - col, w4) * 8;
+        int h = FFMIN(s->rows - row, h4) * 8;
+        int n, o = 0;
+
+        for (n = 0; o < w; n++) {
+            int bw = 64 >> n; // 64, 32, 16, ...: one call per set bit of w
+
+            av_assert2(n <= 4);
+            if (w & bw) {
+                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[0] + yoff + o,
+                                         s->tmp_y + o,
+                                         s->cur_frame->linesize[0],
+                                         64, h, 0, 0);
+                o += bw;
+            }
+        }
+    }
+    if (emu[1]) { // same for both chroma planes from tmp_uv[]
+        int w = FFMIN(s->cols - col, w4) * 4;
+        int h = FFMIN(s->rows - row, h4) * 4;
+        int n, o = 0;
+
+        for (n = 1; o < w; n++) {
+            int bw = 64 >> n; // 32, 16, 8, ...: one call per set bit of w
+
+            av_assert2(n <= 4);
+            if (w & bw) {
+                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[1] + uvoff + o,
+                                         s->tmp_uv[0] + o,
+                                         s->cur_frame->linesize[1],
+                                         32, h, 0, 0);
+                s->dsp.mc[n][0][0][0][0](s->cur_frame->data[2] + uvoff + o,
+                                         s->tmp_uv[1] + o,
+                                         s->cur_frame->linesize[2],
+                                         32, h, 0, 0);
+                o += bw;
+            }
+        }
+    }
+
+    // pick filter level and find edges to apply filter to
+    if (s->filter.level &&
+        (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
+                                                    [b->mode[3] != ZEROMV]) > 0) {
+        int x_end = FFMIN(s->cols - col, w4);
+        int y_end = FFMIN(s->rows - row, h4);
+        int skip_inter = !b->intra && b->skip;
+
+        for (y = 0; y < h4; y++)
+            memset(&lflvl->level[((row & 7) + y) * 8 + (col & 7)], lvl, w4);
+        mask_edges(lflvl, 0, row & 7, col & 7, x_end, y_end, 0, 0, b->tx, skip_inter);
+        mask_edges(lflvl, 1, row & 7, col & 7, x_end, y_end,
+                   s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
+                   s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
+                   b->uvtx, skip_inter);
+
+        if (!s->filter.lim_lut[lvl]) { // entry still zero: fill the LUTs for lvl
+            int sharp = s->filter.sharpness;
+            int limit = lvl;
+
+            if (sharp > 0) {
+                limit >>= (sharp + 3) >> 2;
+                limit   = FFMIN(limit, 9 - sharp);
+            }
+            limit = FFMAX(limit, 1);
+
+            s->filter.lim_lut[lvl]   = limit;
+            s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
+        }
+    }
+
+    return 0;
+}
diff --git a/libavcodec/vp9data.c b/libavcodec/vp9data.c
new file mode 100644
index 0000000000..374fa8bb8c
--- /dev/null
+++ b/libavcodec/vp9data.c
@@ -0,0 +1,2133 @@
+/*
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp9.h"
+#include "vp9data.h"
+
+const int8_t ff_vp9_partition_tree[3][2] = { // > 0: next row, else negated leaf
+    { -PARTITION_NONE,                1 }, // '0'
+    {    -PARTITION_H,                2 }, // '10'
+    {    -PARTITION_V, -PARTITION_SPLIT }, // '110', '111'
+};
+
+const uint8_t ff_vp9_default_kf_partition_probs[4][4][3] = { // [size level][a/l ctx][prob]
+    { /* 64x64 -> 32x32 */
+        { 174,  35,  49 } /* a/l both not split */,
+        {  68,  11,  27 } /* a split, l not split */,
+        {  57,  15,   9 } /* l split, a not split */,
+        {  12,   3,   3 } /* a/l both split */
+    }, { /* 32x32 -> 16x16 */
+        { 150,  40,  39 } /* a/l both not split */,
+        {  78,  12,  26 } /* a split, l not split */,
+        {  67,  33,  11 } /* l split, a not split */,
+        {  24,   7,   5 } /* a/l both split */,
+    }, { /* 16x16 -> 8x8 */
+        { 149,  53,  53 } /* a/l both not split */,
+        {  94,  20,  48 } /* a split, l not split */,
+        {  83,  53,  24 } /* l split, a not split */,
+        {  52,  18,  18 } /* a/l both split */,
+    }, { /* 8x8 -> 4x4 */
+        { 158,  97,  94 } /* a/l both not split */,
+        {  93,  24,  99 } /* a split, l not split */,
+        {  85, 119,  44 } /* l split, a not split */,
+        {  62,  59,  67 } /* a/l both split */,
+    },
+};
+
+const int8_t ff_vp9_segmentation_tree[7][2] = { // > 0: next row, else negated leaf
+    {  1,  2 }, // root
+    {  3,  4 }, // after '0'
+    {  5,  6 }, // after '1'
+    { -0, -1 }, // '00x'
+    { -2, -3 }, // '01x'
+    { -4, -5 }, // '10x'
+    { -6, -7 }, // '11x'
+};
+
+const int8_t ff_vp9_intramode_tree[9][2] = { // > 0: next row, else negated leaf
+    {              -DC_PRED,                1 }, // '0'
+    {          -TM_VP8_PRED,                2 }, // '10'
+    {            -VERT_PRED,                3 }, // '110'
+    {                     4,                6 }, // after '111'
+    {             -HOR_PRED,                5 }, // '11100'
+    { -DIAG_DOWN_RIGHT_PRED, -VERT_RIGHT_PRED }, // '11101x'
+    {  -DIAG_DOWN_LEFT_PRED,                7 }, // '11110'
+    {       -VERT_LEFT_PRED,                8 }, // '111110'
+    {        -HOR_DOWN_PRED,     -HOR_UP_PRED }, // '111111x'
+};
+
+const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9] = { // [above][left][prob]
+    { /* above = v */
+        {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
+        {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
+        {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
+        {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
+        {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
+        {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
+        {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
+        {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
+        {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
+        {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
+    }, { /* above = h */
+        {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
+        {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
+        {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
+        {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
+        {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
+        {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
+        {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
+        {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
+        {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
+        {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
+    }, { /* above = dc */
+        {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
+        {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
+        { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
+        {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
+        {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
+        {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
+        {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
+        {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
+        {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
+        {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
+    }, { /* above = d45 */
+        {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
+        {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
+        { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
+        {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
+        {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
+        {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
+        {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
+        {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
+        {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
+        {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
+    }, { /* above = d135 */
+        {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
+        {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
+        {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
+        {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
+        {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
+        {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
+        {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
+        {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
+        {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
+        {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
+    }, { /* above = d117 */
+        {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
+        {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
+        {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
+        {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
+        {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
+        {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
+        {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
+        {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
+        {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
+        {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
+    }, { /* above = d153 */
+        {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
+        {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
+        {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
+        {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
+        {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
+        {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
+        {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
+        {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
+        {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
+        {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
+    }, { /* above = d63 */
+        {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
+        {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
+        {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
+        {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
+        {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
+        {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
+        {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
+        {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
+        {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
+        {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
+    }, { /* above = d27 */
+        {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
+        {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
+        {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
+        {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
+        {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
+        {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
+        {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
+        {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
+        {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
+        {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
+    }, { /* above = tm */
+        {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
+        {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
+        {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
+        {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
+        {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
+        {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
+        {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
+        {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
+        {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
+        {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
+    }
+};
+
+const uint8_t ff_vp9_default_kf_uvmode_probs[10][9] = { // [y mode][prob]
+    { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
+    { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
+    { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
+    { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
+    { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
+    { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
+    { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
+    { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
+    { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
+    { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
+};
+
+const int8_t ff_vp9_inter_mode_tree[3][2] = { // > 0: next row, else negated leaf
+    {    -ZEROMV,      1 }, // '0'
+    { -NEARESTMV,      2 }, // '10'
+    {    -NEARMV, -NEWMV }, // '11x'
+};
+
+const int8_t ff_vp9_filter_tree[2][2] = { // > 0: next row, else negated leaf
+    { -0,  1 },  // '0'
+    { -1, -2 },  // '1x'
+};
+
+const enum FilterMode ff_vp9_filter_lut[3] = { // NOTE(review): index is presumably the ff_vp9_filter_tree leaf - confirm
+    FILTER_8TAP_REGULAR,
+    FILTER_8TAP_SMOOTH,
+    FILTER_8TAP_SHARP,
+};
+
+const int16_t ff_vp9_dc_qlookup[256] = { // 256 non-decreasing entries, 4..1336
+       4,    8,    8,    9,   10,   11,   12,   12,
+      13,   14,   15,   16,   17,   18,   19,   19,
+      20,   21,   22,   23,   24,   25,   26,   26,
+      27,   28,   29,   30,   31,   32,   32,   33,
+      34,   35,   36,   37,   38,   38,   39,   40,
+      41,   42,   43,   43,   44,   45,   46,   47,
+      48,   48,   49,   50,   51,   52,   53,   53,
+      54,   55,   56,   57,   57,   58,   59,   60,
+      61,   62,   62,   63,   64,   65,   66,   66,
+      67,   68,   69,   70,   70,   71,   72,   73,
+      74,   74,   75,   76,   77,   78,   78,   79,
+      80,   81,   81,   82,   83,   84,   85,   85,
+      87,   88,   90,   92,   93,   95,   96,   98,
+      99,  101,  102,  104,  105,  107,  108,  110,
+     111,  113,  114,  116,  117,  118,  120,  121,
+     123,  125,  127,  129,  131,  134,  136,  138,
+     140,  142,  144,  146,  148,  150,  152,  154,
+     156,  158,  161,  164,  166,  169,  172,  174,
+     177,  180,  182,  185,  187,  190,  192,  195,
+     199,  202,  205,  208,  211,  214,  217,  220,
+     223,  226,  230,  233,  237,  240,  243,  247,
+     250,  253,  257,  261,  265,  269,  272,  276,
+     280,  284,  288,  292,  296,  300,  304,  309,
+     313,  317,  322,  326,  330,  335,  340,  344,
+     349,  354,  359,  364,  369,  374,  379,  384,
+     389,  395,  400,  406,  411,  417,  423,  429,
+     435,  441,  447,  454,  461,  467,  475,  482,
+     489,  497,  505,  513,  522,  530,  539,  549,
+     559,  569,  579,  590,  602,  614,  626,  640,
+     654,  668,  684,  700,  717,  736,  755,  775,
+     796,  819,  843,  869,  896,  925,  955,  988,
+    1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336,
+};
+
+const int16_t ff_vp9_ac_qlookup[256] = { // 256 strictly increasing entries, 4..1828
+       4,    8,    9,   10,   11,   12,   13,   14,
+      15,   16,   17,   18,   19,   20,   21,   22,
+      23,   24,   25,   26,   27,   28,   29,   30,
+      31,   32,   33,   34,   35,   36,   37,   38,
+      39,   40,   41,   42,   43,   44,   45,   46,
+      47,   48,   49,   50,   51,   52,   53,   54,
+      55,   56,   57,   58,   59,   60,   61,   62,
+      63,   64,   65,   66,   67,   68,   69,   70,
+      71,   72,   73,   74,   75,   76,   77,   78,
+      79,   80,   81,   82,   83,   84,   85,   86,
+      87,   88,   89,   90,   91,   92,   93,   94,
+      95,   96,   97,   98,   99,  100,  101,  102,
+     104,  106,  108,  110,  112,  114,  116,  118,
+     120,  122,  124,  126,  128,  130,  132,  134,
+     136,  138,  140,  142,  144,  146,  148,  150,
+     152,  155,  158,  161,  164,  167,  170,  173,
+     176,  179,  182,  185,  188,  191,  194,  197,
+     200,  203,  207,  211,  215,  219,  223,  227,
+     231,  235,  239,  243,  247,  251,  255,  260,
+     265,  270,  275,  280,  285,  290,  295,  300,
+     305,  311,  317,  323,  329,  335,  341,  347,
+     353,  359,  366,  373,  380,  387,  394,  401,
+     408,  416,  424,  432,  440,  448,  456,  465,
+     474,  483,  492,  501,  510,  520,  530,  540,
+     550,  560,  571,  582,  593,  604,  615,  627,
+     639,  651,  663,  676,  689,  702,  715,  729,
+     743,  757,  771,  786,  801,  816,  832,  848,
+     864,  881,  898,  915,  933,  951,  969,  988,
+    1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151,
+    1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+    1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567,
+    1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+const enum TxfmType ff_vp9_intra_txfm_type[14] = { // indexed by prediction mode
+    [VERT_PRED]            = ADST_DCT,
+    [HOR_PRED]             = DCT_ADST,
+    [DC_PRED]              = DCT_DCT,
+    [DIAG_DOWN_LEFT_PRED]  = DCT_DCT,
+    [DIAG_DOWN_RIGHT_PRED] = ADST_ADST,
+    [VERT_RIGHT_PRED]      = ADST_DCT,
+    [HOR_DOWN_PRED]        = DCT_ADST,
+    [VERT_LEFT_PRED]       = ADST_DCT,
+    [HOR_UP_PRED]          = DCT_ADST,
+    [TM_VP8_PRED]          = ADST_ADST,
+    [NEARESTMV]            = DCT_DCT, // the four inter modes all map to DCT_DCT
+    [NEARMV]               = DCT_DCT,
+    [ZEROMV]               = DCT_DCT,
+    [NEWMV]                = DCT_DCT,
+};
+
+const int16_t ff_vp9_default_scan_4x4[16] = { // permutation of 0..15; listed in ff_vp9_scans[0][0], [0][3] and all of row [4]
+     0,  1,  4,  5,
+     2,  8,  3,  6,
+    12,  9,  7, 10,
+    13, 11, 14, 15,
+};
+
+const int16_t ff_vp9_col_scan_4x4[16] = { // permutation of 0..15; listed in ff_vp9_scans[0][1]
+     0,  1,  2,  4,
+     3,  5,  6,  8,
+     7,  9, 10, 12,
+    13, 11, 14, 15,
+};
+
+const int16_t ff_vp9_row_scan_4x4[16] = { // permutation of 0..15; listed in ff_vp9_scans[0][2]
+     0,  4,  1,  8,
+     5, 12,  9,  2,
+     6, 13,  3, 10,
+     7, 14, 11, 15,
+};
+
+const int16_t ff_vp9_default_scan_8x8[64] = { // permutation of 0..63; listed in ff_vp9_scans[1][0] and [1][3]
+     0,  1,  8,  2,  9, 16, 10,  3,
+    17, 24, 18, 11,  4, 25, 32, 19,
+    12, 26,  5, 33, 20, 27, 40, 13,
+    34,  6, 41, 28, 21, 35, 42, 48,
+    14,  7, 36, 29, 43, 56, 49, 22,
+    15, 37, 50, 44, 57, 30, 23, 51,
+    45, 58, 38, 31, 52, 59, 39, 46,
+    53, 60, 47, 54, 61, 55, 62, 63,
+};
+
+const int16_t ff_vp9_col_scan_8x8[64] = { // permutation of 0..63; listed in ff_vp9_scans[1][1]
+     0,  1,  2,  8,  3,  9,  4, 10,
+    16,  5, 11, 17, 12, 18,  6, 24,
+    19, 13, 25,  7, 26, 20, 32, 14,
+    27, 21, 33, 28, 34, 15, 22, 35,
+    40, 29, 41, 36, 23, 30, 42, 37,
+    48, 43, 31, 44, 49, 38, 50, 56,
+    45, 39, 51, 57, 52, 46, 58, 53,
+    59, 47, 60, 54, 61, 55, 62, 63,
+};
+
+const int16_t ff_vp9_row_scan_8x8[64] = { // permutation of 0..63; listed in ff_vp9_scans[1][2]
+     0,  8, 16,  1,  9, 24,  2, 17,
+    32, 10, 25,  3, 40, 18, 11, 33,
+    26, 19,  4, 48, 41, 34, 12, 27,
+    56, 20,  5, 42, 35, 13, 49, 28,
+     6, 21, 43, 36, 14, 50, 29, 57,
+     7, 44, 22, 37, 51, 15, 58, 30,
+    23, 45, 52, 38, 59, 31, 46, 53,
+    39, 60, 47, 61, 54, 62, 55, 63,
+};
+
+const int16_t ff_vp9_default_scan_16x16[256] = { // listed in ff_vp9_scans[2][0] and [2][3]; NOTE(review): expected to be a permutation of 0..255, not re-checked
+      0,   1,  16,   2,  17,  32,   3,  18,  33,  48,   4,  34,  19,  49,  20,   5,
+     35,  64,  50,  36,  65,  21,   6,  51,  80,  66,  37,  22,  52,   7,  81,  67,
+     38,  82,  53,  23,  96,  68,   8,  83,  97,  54,  39,  69, 112,  24,  98,  84,
+     70,  55,   9,  40,  85,  99, 113, 128,  25, 114, 100,  71,  86,  56,  10,  41,
+    115, 101, 129, 116,  72,  87,  26, 130, 144, 102,  57,  11,  42, 117, 131, 145,
+     88, 103,  27,  73, 132, 118, 146,  58, 160,  12,  43, 133, 147, 104,  89, 119,
+    161,  74, 148, 134,  28, 162,  59,  13, 176, 120, 149,  90, 135, 105, 163,  44,
+     75, 177, 164,  29, 150, 121, 136, 178, 165,  14, 106,  60,  91, 151,  45, 179,
+    192, 137, 166, 122,  76, 180, 152,  30,  61,  15, 107, 167, 181, 193,  92, 208,
+     46, 138, 123, 153, 194,  77, 168, 182,  31, 195, 209, 183, 108, 139,  62, 154,
+     47, 196,  93, 169, 210, 197, 224, 124, 184, 211,  78, 109, 170, 155,  63, 198,
+    212, 185, 225, 240, 140,  94, 199, 125,  79, 213, 226, 171, 186, 156, 214, 200,
+    110, 227, 141,  95, 241, 215, 228, 201, 126, 242, 187, 172, 157, 229, 111, 216,
+    243, 142, 202, 230, 127, 217, 244, 173, 188, 231, 158, 203, 143, 245, 218, 232,
+    189, 246, 159, 174, 233, 247, 219, 204, 175, 190, 248, 234, 205, 220, 249, 191,
+    235, 221, 250, 206, 222, 251, 236, 207, 237, 223, 252, 238, 253, 239, 254, 255,
+};
+
+const int16_t ff_vp9_col_scan_16x16[256] = { // listed in ff_vp9_scans[2][1]; NOTE(review): expected to be a permutation of 0..255, not re-checked
+      0,   1,   2,   3,  16,   4,  17,   5,  18,   6,  19,  32,  20,   7,  33,  21,
+     34,   8,  35,  22,  48,  36,   9,  49,  23,  50,  37,  10,  38,  51,  24,  64,
+     52,  11,  65,  39,  25,  53,  66,  54,  40,  67,  12,  80,  26,  68,  55,  81,
+     41,  69,  13,  27,  82,  56,  70,  83,  42,  14,  84,  96,  71,  28,  57,  85,
+     97,  15,  72,  98,  43,  86,  58,  99,  29,  87, 100, 112,  73,  44, 101,  59,
+     30, 113,  88, 114,  74, 128, 102,  45,  31, 115,  60, 103,  89, 116,  75, 129,
+    117,  46, 104,  90,  61, 130, 118, 131, 132, 105,  76,  47, 119, 144,  91,  62,
+    133, 106, 145, 120, 146, 134,  77, 147, 121,  92, 135, 148,  63, 107, 136, 122,
+     93, 149, 160,  78, 150, 137, 108, 161, 162, 151, 123,  79, 138, 163, 152,  94,
+    164, 109, 165, 153, 124, 139, 176, 166,  95, 177, 167, 110, 154, 178, 125, 179,
+    140, 168, 155, 111, 180, 192, 181, 169, 141, 126, 182, 193, 194, 156, 183, 170,
+    195, 127, 142, 196, 184, 208, 197, 157, 171, 143, 185, 198, 209, 199, 210, 172,
+    158, 186, 211, 224, 212, 200, 240, 159, 213, 225, 187, 201, 173, 226, 214, 215,
+    227, 202, 228, 188, 241, 216, 174, 229, 242, 203, 243, 217, 230, 175, 189, 244,
+    231, 204, 218, 232, 245, 219, 246, 190, 233, 205, 191, 247, 234, 248, 220, 206,
+    249, 235, 221, 207, 250, 236, 222, 251, 223, 237, 238, 252, 239, 253, 254, 255,
+};
+
+const int16_t ff_vp9_row_scan_16x16[256] = { // listed in ff_vp9_scans[2][2]; NOTE(review): expected to be a permutation of 0..255, not re-checked
+      0,  16,  32,   1,  48,  17,  64,  33,   2,  80,  18,  49,  96,  34,   3,  65,
+     19, 112,  50,  81,  35,   4, 128,  66,  20,  97,  51,  82,   5, 144,  36,  67,
+    113,  98,  21,  52, 160,  83, 129,  37,  68,   6, 114, 176,  99,  53,  22,  84,
+    145,  38,  69, 130,   7, 115, 192, 100,  54,  23,  85, 161, 146, 131,  39,  70,
+    208, 116,   8, 101, 177,  55,  86,  24, 162, 147, 132,  71, 224, 117,  40, 102,
+      9, 148,  56,  87, 193, 163, 240, 133, 178,  25, 118,  72,  41, 103, 164,  10,
+    149,  88, 134, 209, 179,  57, 119, 194,  26,  73, 165, 150, 104,  42, 135,  11,
+    180, 120,  89, 225, 195,  58,  27, 210, 151, 181, 166,  74,  43, 105,  12, 136,
+     90,  59, 241, 121,  28, 196, 167, 211, 152,  44, 182, 137,  75,  13, 226, 106,
+    122,  60, 197,  91, 168,  29, 183, 153,  14,  76, 212, 138,  45, 107,  15, 198,
+     92, 227, 169,  30, 123, 154,  61, 242, 184, 213, 139,  46,  77,  31, 108, 170,
+    199, 185, 124, 228,  93, 155, 214,  62, 140, 243,  78,  47, 200, 109, 186, 171,
+    201,  94,  63, 215, 229, 156,  79, 125, 141, 110, 216, 187, 172, 244, 202, 230,
+    217,  95, 157, 126, 245, 111, 142, 231, 188, 127, 158, 218, 173, 232, 246, 233,
+    203, 143, 247, 174, 189, 159, 219, 204, 248, 234, 249, 175, 190, 220, 205, 250,
+    235, 191, 221, 251, 236, 206, 252, 222, 207, 237, 223, 253, 238, 254, 239, 255,
+};
+
+const int16_t ff_vp9_default_scan_32x32[1024] = { // fills all four slots of ff_vp9_scans[3]; NOTE(review): expected to be a permutation of 0..1023, not re-checked
+       0,    1,   32,    2,   33,   64,    3,   34,   65,    4,   96,   35,   66,    5,   36,   97,
+      67,  128,   98,   68,   37,    6,  129,   99,    7,  160,   69,   38,  130,  100,  161,  131,
+      39,   70,    8,  101,  162,  132,  192,   71,   40,    9,  102,  163,  133,  193,   72,  224,
+     103,   41,  164,   10,  194,  134,  165,   73,  104,  135,  225,   42,  195,   11,  256,  166,
+     226,  196,   74,  105,  136,   43,   12,  167,  197,  227,  257,   75,  106,  137,  228,   44,
+     198,  168,  258,  288,   13,  229,   76,  107,  199,  138,  259,  169,  289,   45,  230,  260,
+     200,  108,   14,  170,  139,  320,  290,   77,  231,  261,   46,  201,  140,  291,  109,  232,
+     321,  262,  171,   78,  292,   15,  322,  202,  263,  352,  172,  293,  233,  141,  323,  110,
+      47,  203,  264,  234,  294,  353,  324,   16,   79,  204,  265,  295,  325,  173,  354,  142,
+     235,  384,   48,  296,  111,  266,  355,  326,   80,   17,  205,  236,  174,  356,  385,  327,
+     143,  297,  267,  357,  386,  112,   49,  328,  298,  206,  416,  237,  358,  387,   81,  175,
+      18,  329,  359,  388,  299,  330,  389,  113,  417,  238,  360,   50,  207,  418,  390,  331,
+      19,  448,  361,   82,  419,  391,  239,   51,  362,  420,  114,  449,  480,  421,   83,  363,
+     450,  422,  512,  451,  423,  115,  452,  481,  453,  482,  454,  544,  483,  455,  513,  484,
+     514,  485,  515,  486,  545,  576,  487,  546,  547,  608,  577,  578,  579,  609,  610,  611,
+      20,  144,  268,  392,  516,  640,   21,   52,  145,  176,  269,  300,  393,  424,  517,  548,
+     641,  672,   22,   53,   84,  146,  177,  208,  270,  301,  332,  394,  425,  456,  518,  549,
+     580,  642,  673,  704,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
+     395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,   55,   86,  117,  179,
+     210,  241,  303,  334,  365,  427,  458,  489,  551,  582,  613,  675,  706,  737,   87,  118,
+     211,  242,  335,  366,  459,  490,  583,  614,  707,  738,  119,  243,  367,  491,  615,  739,
+      24,  148,  272,  396,  520,  644,  768,   25,   56,  149,  180,  273,  304,  397,  428,  521,
+     552,  645,  676,  769,  800,   26,   57,   88,  150,  181,  212,  274,  305,  336,  398,  429,
+     460,  522,  553,  584,  646,  677,  708,  770,  801,  832,   27,   58,   89,  120,  151,  182,
+     213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,
+     709,  740,  771,  802,  833,  864,   59,   90,  121,  183,  214,  245,  307,  338,  369,  431,
+     462,  493,  555,  586,  617,  679,  710,  741,  803,  834,  865,   91,  122,  215,  246,  339,
+     370,  463,  494,  587,  618,  711,  742,  835,  866,  123,  247,  371,  495,  619,  743,  867,
+      28,  152,  276,  400,  524,  648,  772,  896,   29,   60,  153,  184,  277,  308,  401,  432,
+     525,  556,  649,  680,  773,  804,  897,  928,   30,   61,   92,  154,  185,  216,  278,  309,
+     340,  402,  433,  464,  526,  557,  588,  650,  681,  712,  774,  805,  836,  898,  929,  960,
+      31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
+     527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,
+      63,   94,  125,  187,  218,  249,  311,  342,  373,  435,  466,  497,  559,  590,  621,  683,
+     714,  745,  807,  838,  869,  931,  962,  993,   95,  126,  219,  250,  343,  374,  467,  498,
+     591,  622,  715,  746,  839,  870,  963,  994,  127,  251,  375,  499,  623,  747,  871,  995,
+     156,  280,  404,  528,  652,  776,  900,  157,  188,  281,  312,  405,  436,  529,  560,  653,
+     684,  777,  808,  901,  932,  158,  189,  220,  282,  313,  344,  406,  437,  468,  530,  561,
+     592,  654,  685,  716,  778,  809,  840,  902,  933,  964,  159,  190,  221,  252,  283,  314,
+     345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
+     841,  872,  903,  934,  965,  996,  191,  222,  253,  315,  346,  377,  439,  470,  501,  563,
+     594,  625,  687,  718,  749,  811,  842,  873,  935,  966,  997,  223,  254,  347,  378,  471,
+     502,  595,  626,  719,  750,  843,  874,  967,  998,  255,  379,  503,  627,  751,  875,  999,
+     284,  408,  532,  656,  780,  904,  285,  316,  409,  440,  533,  564,  657,  688,  781,  812,
+     905,  936,  286,  317,  348,  410,  441,  472,  534,  565,  596,  658,  689,  720,  782,  813,
+     844,  906,  937,  968,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
+     659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000,  319,  350,  381,  443,
+     474,  505,  567,  598,  629,  691,  722,  753,  815,  846,  877,  939,  970, 1001,  351,  382,
+     475,  506,  599,  630,  723,  754,  847,  878,  971, 1002,  383,  507,  631,  755,  879, 1003,
+     412,  536,  660,  784,  908,  413,  444,  537,  568,  661,  692,  785,  816,  909,  940,  414,
+     445,  476,  538,  569,  600,  662,  693,  724,  786,  817,  848,  910,  941,  972,  415,  446,
+     477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
+     973, 1004,  447,  478,  509,  571,  602,  633,  695,  726,  757,  819,  850,  881,  943,  974,
+    1005,  479,  510,  603,  634,  727,  758,  851,  882,  975, 1006,  511,  635,  759,  883, 1007,
+     540,  664,  788,  912,  541,  572,  665,  696,  789,  820,  913,  944,  542,  573,  604,  666,
+     697,  728,  790,  821,  852,  914,  945,  976,  543,  574,  605,  636,  667,  698,  729,  760,
+     791,  822,  853,  884,  915,  946,  977, 1008,  575,  606,  637,  699,  730,  761,  823,  854,
+     885,  947,  978, 1009,  607,  638,  731,  762,  855,  886,  979, 1010,  639,  763,  887, 1011,
+     668,  792,  916,  669,  700,  793,  824,  917,  948,  670,  701,  732,  794,  825,  856,  918,
+     949,  980,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012,  703,  734,
+     765,  827,  858,  889,  951,  982, 1013,  735,  766,  859,  890,  983, 1014,  767,  891, 1015,
+     796,  920,  797,  828,  921,  952,  798,  829,  860,  922,  953,  984,  799,  830,  861,  892,
+     923,  954,  985, 1016,  831,  862,  893,  955,  986, 1017,  863,  894,  987, 1018,  895, 1019,
+     924,  925,  956,  926,  957,  988,  927,  958,  989, 1020,  959,  990, 1021,  991, 1022, 1023,
+};
+
+const int16_t *ff_vp9_scans[5][4] = { // rows: 4x4, 8x8, 16x16, 32x32, lossless; columns: default, col, row, default
+    { // 4x4
+        ff_vp9_default_scan_4x4, ff_vp9_col_scan_4x4,
+        ff_vp9_row_scan_4x4, ff_vp9_default_scan_4x4
+    }, { // 8x8
+        ff_vp9_default_scan_8x8, ff_vp9_col_scan_8x8,
+        ff_vp9_row_scan_8x8, ff_vp9_default_scan_8x8
+    }, { // 16x16
+        ff_vp9_default_scan_16x16, ff_vp9_col_scan_16x16,
+        ff_vp9_row_scan_16x16, ff_vp9_default_scan_16x16
+    }, { // 32x32: the single 32x32 scan in every column
+        ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32,
+        ff_vp9_default_scan_32x32, ff_vp9_default_scan_32x32
+    }, { // lossless: the 4x4 default scan in every column
+        ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4,
+        ff_vp9_default_scan_4x4, ff_vp9_default_scan_4x4
+    }
+}; // NOTE(review): only the int16_t targets are const; the pointer slots themselves are writable
+
+const int16_t ff_vp9_default_scan_4x4_nb[16][2] = { // entry i = { c - 1, c - 4 } for c = ff_vp9_default_scan_4x4[i + 1]; a lone neighbour is repeated
+    {  0,  0 }, {  0,  0 }, {  4,  1 }, {  1,  1 },
+    {  4,  4 }, {  2,  2 }, {  5,  2 }, {  8,  8 },
+    {  8,  5 }, {  6,  3 }, {  9,  6 }, { 12,  9 },
+    { 10,  7 }, { 13, 10 }, { 14, 11 }, {  0,  0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_col_scan_4x4_nb[16][2] = { // entry i = one neighbour of c = ff_vp9_col_scan_4x4[i + 1], stored twice: c - 1, or c - 4 when c % 4 == 0
+    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
+    {  4,  4 }, {  5,  5 }, {  4,  4 }, {  6,  6 },
+    {  8,  8 }, {  9,  9 }, {  8,  8 }, { 12, 12 },
+    { 10, 10 }, { 13, 13 }, { 14, 14 }, {  0,  0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_row_scan_4x4_nb[16][2] = { // entry i = one neighbour of c = ff_vp9_row_scan_4x4[i + 1], stored twice: c - 4, or c - 1 when c < 4
+    {  0,  0 }, {  0,  0 }, {  4,  4 }, {  1,  1 },
+    {  8,  8 }, {  5,  5 }, {  1,  1 }, {  2,  2 },
+    {  9,  9 }, {  2,  2 }, {  6,  6 }, {  3,  3 },
+    { 10, 10 }, {  7,  7 }, { 11, 11 }, {  0,  0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_default_scan_8x8_nb[64][2] = { // entry i = { c - 1, c - 8 } for c = ff_vp9_default_scan_8x8[i + 1]; a lone neighbour is repeated
+    {  0,  0 }, {  0,  0 }, {  1,  1 }, {  8,  1 },
+    {  8,  8 }, {  9,  2 }, {  2,  2 }, { 16,  9 },
+    { 16, 16 }, { 17, 10 }, { 10,  3 }, {  3,  3 },
+    { 24, 17 }, { 24, 24 }, { 18, 11 }, { 11,  4 },
+    { 25, 18 }, {  4,  4 }, { 32, 25 }, { 19, 12 },
+    { 26, 19 }, { 32, 32 }, { 12,  5 }, { 33, 26 },
+    {  5,  5 }, { 40, 33 }, { 27, 20 }, { 20, 13 },
+    { 34, 27 }, { 41, 34 }, { 40, 40 }, { 13,  6 },
+    {  6,  6 }, { 35, 28 }, { 28, 21 }, { 42, 35 },
+    { 48, 48 }, { 48, 41 }, { 21, 14 }, { 14,  7 },
+    { 36, 29 }, { 49, 42 }, { 43, 36 }, { 56, 49 },
+    { 29, 22 }, { 22, 15 }, { 50, 43 }, { 44, 37 },
+    { 57, 50 }, { 37, 30 }, { 30, 23 }, { 51, 44 },
+    { 58, 51 }, { 38, 31 }, { 45, 38 }, { 52, 45 },
+    { 59, 52 }, { 46, 39 }, { 53, 46 }, { 60, 53 },
+    { 54, 47 }, { 61, 54 }, { 62, 55 }, {  0,  0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_col_scan_8x8_nb[64][2] = { // entry i = one neighbour of ff_vp9_col_scan_8x8[i + 1], stored twice (c - 1 or c - 8 on the entries spot-checked)
+    {  0,  0 }, {  1,  1 }, {  0,  0 }, {  2,  2 },
+    {  8,  8 }, {  3,  3 }, {  9,  9 }, {  8,  8 },
+    {  4,  4 }, { 10, 10 }, { 16, 16 }, { 11, 11 },
+    { 17, 17 }, {  5,  5 }, { 16, 16 }, { 18, 18 },
+    { 12, 12 }, { 24, 24 }, {  6,  6 }, { 25, 25 },
+    { 19, 19 }, { 24, 24 }, { 13, 13 }, { 26, 26 },
+    { 20, 20 }, { 32, 32 }, { 27, 27 }, { 33, 33 },
+    { 14, 14 }, { 21, 21 }, { 34, 34 }, { 32, 32 },
+    { 28, 28 }, { 40, 40 }, { 35, 35 }, { 22, 22 },
+    { 29, 29 }, { 41, 41 }, { 36, 36 }, { 40, 40 },
+    { 42, 42 }, { 30, 30 }, { 43, 43 }, { 48, 48 },
+    { 37, 37 }, { 49, 49 }, { 48, 48 }, { 44, 44 },
+    { 38, 38 }, { 50, 50 }, { 56, 56 }, { 51, 51 },
+    { 45, 45 }, { 57, 57 }, { 52, 52 }, { 58, 58 },
+    { 46, 46 }, { 59, 59 }, { 53, 53 }, { 60, 60 },
+    { 54, 54 }, { 61, 61 }, { 62, 62 }, {  0,  0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_row_scan_8x8_nb[64][2] = { // entry i = one neighbour of ff_vp9_row_scan_8x8[i + 1], stored twice (c - 8 or c - 1 on the entries spot-checked)
+    {  0,  0 }, {  8,  8 }, {  0,  0 }, {  1,  1 },
+    { 16, 16 }, {  1,  1 }, {  9,  9 }, { 24, 24 },
+    {  2,  2 }, { 17, 17 }, {  2,  2 }, { 32, 32 },
+    { 10, 10 }, {  3,  3 }, { 25, 25 }, { 18, 18 },
+    { 11, 11 }, {  3,  3 }, { 40, 40 }, { 33, 33 },
+    { 26, 26 }, {  4,  4 }, { 19, 19 }, { 48, 48 },
+    { 12, 12 }, {  4,  4 }, { 34, 34 }, { 27, 27 },
+    {  5,  5 }, { 41, 41 }, { 20, 20 }, {  5,  5 },
+    { 13, 13 }, { 35, 35 }, { 28, 28 }, {  6,  6 },
+    { 42, 42 }, { 21, 21 }, { 49, 49 }, {  6,  6 },
+    { 36, 36 }, { 14, 14 }, { 29, 29 }, { 43, 43 },
+    {  7,  7 }, { 50, 50 }, { 22, 22 }, { 15, 15 },
+    { 37, 37 }, { 44, 44 }, { 30, 30 }, { 51, 51 },
+    { 23, 23 }, { 38, 38 }, { 45, 45 }, { 31, 31 },
+    { 52, 52 }, { 39, 39 }, { 53, 53 }, { 46, 46 },
+    { 54, 54 }, { 47, 47 }, { 55, 55 }, {  0,  0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_default_scan_16x16_nb[256][2] = { // entry i = { c - 1, c - 16 } for c = ff_vp9_default_scan_16x16[i + 1] (spot-checked); a lone neighbour is repeated
+    {   0,   0 }, {   0,   0 }, {   1,   1 }, {  16,   1 },
+    {  16,  16 }, {   2,   2 }, {  17,   2 }, {  32,  17 },
+    {  32,  32 }, {   3,   3 }, {  33,  18 }, {  18,   3 },
+    {  48,  33 }, {  19,   4 }, {   4,   4 }, {  34,  19 },
+    {  48,  48 }, {  49,  34 }, {  35,  20 }, {  64,  49 },
+    {  20,   5 }, {   5,   5 }, {  50,  35 }, {  64,  64 },
+    {  65,  50 }, {  36,  21 }, {  21,   6 }, {  51,  36 },
+    {   6,   6 }, {  80,  65 }, {  66,  51 }, {  37,  22 },
+    {  81,  66 }, {  52,  37 }, {  22,   7 }, {  80,  80 },
+    {  67,  52 }, {   7,   7 }, {  82,  67 }, {  96,  81 },
+    {  53,  38 }, {  38,  23 }, {  68,  53 }, {  96,  96 },
+    {  23,   8 }, {  97,  82 }, {  83,  68 }, {  69,  54 },
+    {  54,  39 }, {   8,   8 }, {  39,  24 }, {  84,  69 },
+    {  98,  83 }, { 112,  97 }, { 112, 112 }, {  24,   9 },
+    { 113,  98 }, {  99,  84 }, {  70,  55 }, {  85,  70 },
+    {  55,  40 }, {   9,   9 }, {  40,  25 }, { 114,  99 },
+    { 100,  85 }, { 128, 113 }, { 115, 100 }, {  71,  56 },
+    {  86,  71 }, {  25,  10 }, { 129, 114 }, { 128, 128 },
+    { 101,  86 }, {  56,  41 }, {  10,  10 }, {  41,  26 },
+    { 116, 101 }, { 130, 115 }, { 144, 129 }, {  87,  72 },
+    { 102,  87 }, {  26,  11 }, {  72,  57 }, { 131, 116 },
+    { 117, 102 }, { 145, 130 }, {  57,  42 }, { 144, 144 },
+    {  11,  11 }, {  42,  27 }, { 132, 117 }, { 146, 131 },
+    { 103,  88 }, {  88,  73 }, { 118, 103 }, { 160, 145 },
+    {  73,  58 }, { 147, 132 }, { 133, 118 }, {  27,  12 },
+    { 161, 146 }, {  58,  43 }, {  12,  12 }, { 160, 160 },
+    { 119, 104 }, { 148, 133 }, {  89,  74 }, { 134, 119 },
+    { 104,  89 }, { 162, 147 }, {  43,  28 }, {  74,  59 },
+    { 176, 161 }, { 163, 148 }, {  28,  13 }, { 149, 134 },
+    { 120, 105 }, { 135, 120 }, { 177, 162 }, { 164, 149 },
+    {  13,  13 }, { 105,  90 }, {  59,  44 }, {  90,  75 },
+    { 150, 135 }, {  44,  29 }, { 178, 163 }, { 176, 176 },
+    { 136, 121 }, { 165, 150 }, { 121, 106 }, {  75,  60 },
+    { 179, 164 }, { 151, 136 }, {  29,  14 }, {  60,  45 },
+    {  14,  14 }, { 106,  91 }, { 166, 151 }, { 180, 165 },
+    { 192, 177 }, {  91,  76 }, { 192, 192 }, {  45,  30 },
+    { 137, 122 }, { 122, 107 }, { 152, 137 }, { 193, 178 },
+    {  76,  61 }, { 167, 152 }, { 181, 166 }, {  30,  15 },
+    { 194, 179 }, { 208, 193 }, { 182, 167 }, { 107,  92 },
+    { 138, 123 }, {  61,  46 }, { 153, 138 }, {  46,  31 },
+    { 195, 180 }, {  92,  77 }, { 168, 153 }, { 209, 194 },
+    { 196, 181 }, { 208, 208 }, { 123, 108 }, { 183, 168 },
+    { 210, 195 }, {  77,  62 }, { 108,  93 }, { 169, 154 },
+    { 154, 139 }, {  62,  47 }, { 197, 182 }, { 211, 196 },
+    { 184, 169 }, { 224, 209 }, { 224, 224 }, { 139, 124 },
+    {  93,  78 }, { 198, 183 }, { 124, 109 }, {  78,  63 },
+    { 212, 197 }, { 225, 210 }, { 170, 155 }, { 185, 170 },
+    { 155, 140 }, { 213, 198 }, { 199, 184 }, { 109,  94 },
+    { 226, 211 }, { 140, 125 }, {  94,  79 }, { 240, 225 },
+    { 214, 199 }, { 227, 212 }, { 200, 185 }, { 125, 110 },
+    { 241, 226 }, { 186, 171 }, { 171, 156 }, { 156, 141 },
+    { 228, 213 }, { 110,  95 }, { 215, 200 }, { 242, 227 },
+    { 141, 126 }, { 201, 186 }, { 229, 214 }, { 126, 111 },
+    { 216, 201 }, { 243, 228 }, { 172, 157 }, { 187, 172 },
+    { 230, 215 }, { 157, 142 }, { 202, 187 }, { 142, 127 },
+    { 244, 229 }, { 217, 202 }, { 231, 216 }, { 188, 173 },
+    { 245, 230 }, { 158, 143 }, { 173, 158 }, { 232, 217 },
+    { 246, 231 }, { 218, 203 }, { 203, 188 }, { 174, 159 },
+    { 189, 174 }, { 247, 232 }, { 233, 218 }, { 204, 189 },
+    { 219, 204 }, { 248, 233 }, { 190, 175 }, { 234, 219 },
+    { 220, 205 }, { 249, 234 }, { 205, 190 }, { 221, 206 },
+    { 250, 235 }, { 235, 220 }, { 206, 191 }, { 236, 221 },
+    { 222, 207 }, { 251, 236 }, { 237, 222 }, { 252, 237 },
+    { 238, 223 }, { 253, 238 }, { 254, 239 }, {   0,   0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_col_scan_16x16_nb[256][2] = { // entry i = one neighbour of ff_vp9_col_scan_16x16[i + 1], stored twice (c - 1 or c - 16 on the entries spot-checked)
+    {   0,   0 }, {   1,   1 }, {   2,   2 }, {   0,   0 },
+    {   3,   3 }, {  16,  16 }, {   4,   4 }, {  17,  17 },
+    {   5,   5 }, {  18,  18 }, {  16,  16 }, {  19,  19 },
+    {   6,   6 }, {  32,  32 }, {  20,  20 }, {  33,  33 },
+    {   7,   7 }, {  34,  34 }, {  21,  21 }, {  32,  32 },
+    {  35,  35 }, {   8,   8 }, {  48,  48 }, {  22,  22 },
+    {  49,  49 }, {  36,  36 }, {   9,   9 }, {  37,  37 },
+    {  50,  50 }, {  23,  23 }, {  48,  48 }, {  51,  51 },
+    {  10,  10 }, {  64,  64 }, {  38,  38 }, {  24,  24 },
+    {  52,  52 }, {  65,  65 }, {  53,  53 }, {  39,  39 },
+    {  66,  66 }, {  11,  11 }, {  64,  64 }, {  25,  25 },
+    {  67,  67 }, {  54,  54 }, {  80,  80 }, {  40,  40 },
+    {  68,  68 }, {  12,  12 }, {  26,  26 }, {  81,  81 },
+    {  55,  55 }, {  69,  69 }, {  82,  82 }, {  41,  41 },
+    {  13,  13 }, {  83,  83 }, {  80,  80 }, {  70,  70 },
+    {  27,  27 }, {  56,  56 }, {  84,  84 }, {  96,  96 },
+    {  14,  14 }, {  71,  71 }, {  97,  97 }, {  42,  42 },
+    {  85,  85 }, {  57,  57 }, {  98,  98 }, {  28,  28 },
+    {  86,  86 }, {  99,  99 }, {  96,  96 }, {  72,  72 },
+    {  43,  43 }, { 100, 100 }, {  58,  58 }, {  29,  29 },
+    { 112, 112 }, {  87,  87 }, { 113, 113 }, {  73,  73 },
+    { 112, 112 }, { 101, 101 }, {  44,  44 }, {  30,  30 },
+    { 114, 114 }, {  59,  59 }, { 102, 102 }, {  88,  88 },
+    { 115, 115 }, {  74,  74 }, { 128, 128 }, { 116, 116 },
+    {  45,  45 }, { 103, 103 }, {  89,  89 }, {  60,  60 },
+    { 129, 129 }, { 117, 117 }, { 130, 130 }, { 131, 131 },
+    { 104, 104 }, {  75,  75 }, {  46,  46 }, { 118, 118 },
+    { 128, 128 }, {  90,  90 }, {  61,  61 }, { 132, 132 },
+    { 105, 105 }, { 144, 144 }, { 119, 119 }, { 145, 145 },
+    { 133, 133 }, {  76,  76 }, { 146, 146 }, { 120, 120 },
+    {  91,  91 }, { 134, 134 }, { 147, 147 }, {  62,  62 },
+    { 106, 106 }, { 135, 135 }, { 121, 121 }, {  92,  92 },
+    { 148, 148 }, { 144, 144 }, {  77,  77 }, { 149, 149 },
+    { 136, 136 }, { 107, 107 }, { 160, 160 }, { 161, 161 },
+    { 150, 150 }, { 122, 122 }, {  78,  78 }, { 137, 137 },
+    { 162, 162 }, { 151, 151 }, {  93,  93 }, { 163, 163 },
+    { 108, 108 }, { 164, 164 }, { 152, 152 }, { 123, 123 },
+    { 138, 138 }, { 160, 160 }, { 165, 165 }, {  94,  94 },
+    { 176, 176 }, { 166, 166 }, { 109, 109 }, { 153, 153 },
+    { 177, 177 }, { 124, 124 }, { 178, 178 }, { 139, 139 },
+    { 167, 167 }, { 154, 154 }, { 110, 110 }, { 179, 179 },
+    { 176, 176 }, { 180, 180 }, { 168, 168 }, { 140, 140 },
+    { 125, 125 }, { 181, 181 }, { 192, 192 }, { 193, 193 },
+    { 155, 155 }, { 182, 182 }, { 169, 169 }, { 194, 194 },
+    { 126, 126 }, { 141, 141 }, { 195, 195 }, { 183, 183 },
+    { 192, 192 }, { 196, 196 }, { 156, 156 }, { 170, 170 },
+    { 142, 142 }, { 184, 184 }, { 197, 197 }, { 208, 208 },
+    { 198, 198 }, { 209, 209 }, { 171, 171 }, { 157, 157 },
+    { 185, 185 }, { 210, 210 }, { 208, 208 }, { 211, 211 },
+    { 199, 199 }, { 224, 224 }, { 158, 158 }, { 212, 212 },
+    { 224, 224 }, { 186, 186 }, { 200, 200 }, { 172, 172 },
+    { 225, 225 }, { 213, 213 }, { 214, 214 }, { 226, 226 },
+    { 201, 201 }, { 227, 227 }, { 187, 187 }, { 240, 240 },
+    { 215, 215 }, { 173, 173 }, { 228, 228 }, { 241, 241 },
+    { 202, 202 }, { 242, 242 }, { 216, 216 }, { 229, 229 },
+    { 174, 174 }, { 188, 188 }, { 243, 243 }, { 230, 230 },
+    { 203, 203 }, { 217, 217 }, { 231, 231 }, { 244, 244 },
+    { 218, 218 }, { 245, 245 }, { 189, 189 }, { 232, 232 },
+    { 204, 204 }, { 190, 190 }, { 246, 246 }, { 233, 233 },
+    { 247, 247 }, { 219, 219 }, { 205, 205 }, { 248, 248 },
+    { 234, 234 }, { 220, 220 }, { 206, 206 }, { 249, 249 },
+    { 235, 235 }, { 221, 221 }, { 250, 250 }, { 222, 222 },
+    { 236, 236 }, { 237, 237 }, { 251, 251 }, { 238, 238 },
+    { 252, 252 }, { 253, 253 }, { 254, 254 }, {   0,   0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_row_scan_16x16_nb[256][2] = { // entry i = one neighbour of ff_vp9_row_scan_16x16[i + 1], stored twice (c - 16 or c - 1 on the entries spot-checked)
+    {   0,   0 }, {  16,  16 }, {   0,   0 }, {  32,  32 },
+    {   1,   1 }, {  48,  48 }, {  17,  17 }, {   1,   1 },
+    {  64,  64 }, {   2,   2 }, {  33,  33 }, {  80,  80 },
+    {  18,  18 }, {   2,   2 }, {  49,  49 }, {   3,   3 },
+    {  96,  96 }, {  34,  34 }, {  65,  65 }, {  19,  19 },
+    {   3,   3 }, { 112, 112 }, {  50,  50 }, {   4,   4 },
+    {  81,  81 }, {  35,  35 }, {  66,  66 }, {   4,   4 },
+    { 128, 128 }, {  20,  20 }, {  51,  51 }, {  97,  97 },
+    {  82,  82 }, {   5,   5 }, {  36,  36 }, { 144, 144 },
+    {  67,  67 }, { 113, 113 }, {  21,  21 }, {  52,  52 },
+    {   5,   5 }, {  98,  98 }, { 160, 160 }, {  83,  83 },
+    {  37,  37 }, {   6,   6 }, {  68,  68 }, { 129, 129 },
+    {  22,  22 }, {  53,  53 }, { 114, 114 }, {   6,   6 },
+    {  99,  99 }, { 176, 176 }, {  84,  84 }, {  38,  38 },
+    {   7,   7 }, {  69,  69 }, { 145, 145 }, { 130, 130 },
+    { 115, 115 }, {  23,  23 }, {  54,  54 }, { 192, 192 },
+    { 100, 100 }, {   7,   7 }, {  85,  85 }, { 161, 161 },
+    {  39,  39 }, {  70,  70 }, {   8,   8 }, { 146, 146 },
+    { 131, 131 }, { 116, 116 }, {  55,  55 }, { 208, 208 },
+    { 101, 101 }, {  24,  24 }, {  86,  86 }, {   8,   8 },
+    { 132, 132 }, {  40,  40 }, {  71,  71 }, { 177, 177 },
+    { 147, 147 }, { 224, 224 }, { 117, 117 }, { 162, 162 },
+    {   9,   9 }, { 102, 102 }, {  56,  56 }, {  25,  25 },
+    {  87,  87 }, { 148, 148 }, {   9,   9 }, { 133, 133 },
+    {  72,  72 }, { 118, 118 }, { 193, 193 }, { 163, 163 },
+    {  41,  41 }, { 103, 103 }, { 178, 178 }, {  10,  10 },
+    {  57,  57 }, { 149, 149 }, { 134, 134 }, {  88,  88 },
+    {  26,  26 }, { 119, 119 }, {  10,  10 }, { 164, 164 },
+    { 104, 104 }, {  73,  73 }, { 209, 209 }, { 179, 179 },
+    {  42,  42 }, {  11,  11 }, { 194, 194 }, { 135, 135 },
+    { 165, 165 }, { 150, 150 }, {  58,  58 }, {  27,  27 },
+    {  89,  89 }, {  11,  11 }, { 120, 120 }, {  74,  74 },
+    {  43,  43 }, { 225, 225 }, { 105, 105 }, {  12,  12 },
+    { 180, 180 }, { 151, 151 }, { 195, 195 }, { 136, 136 },
+    {  28,  28 }, { 166, 166 }, { 121, 121 }, {  59,  59 },
+    {  12,  12 }, { 210, 210 }, {  90,  90 }, { 106, 106 },
+    {  44,  44 }, { 181, 181 }, {  75,  75 }, { 152, 152 },
+    {  13,  13 }, { 167, 167 }, { 137, 137 }, {  13,  13 },
+    {  60,  60 }, { 196, 196 }, { 122, 122 }, {  29,  29 },
+    {  91,  91 }, {  14,  14 }, { 182, 182 }, {  76,  76 },
+    { 211, 211 }, { 153, 153 }, {  14,  14 }, { 107, 107 },
+    { 138, 138 }, {  45,  45 }, { 226, 226 }, { 168, 168 },
+    { 197, 197 }, { 123, 123 }, {  30,  30 }, {  61,  61 },
+    {  15,  15 }, {  92,  92 }, { 154, 154 }, { 183, 183 },
+    { 169, 169 }, { 108, 108 }, { 212, 212 }, {  77,  77 },
+    { 139, 139 }, { 198, 198 }, {  46,  46 }, { 124, 124 },
+    { 227, 227 }, {  62,  62 }, {  31,  31 }, { 184, 184 },
+    {  93,  93 }, { 170, 170 }, { 155, 155 }, { 185, 185 },
+    {  78,  78 }, {  47,  47 }, { 199, 199 }, { 213, 213 },
+    { 140, 140 }, {  63,  63 }, { 109, 109 }, { 125, 125 },
+    {  94,  94 }, { 200, 200 }, { 171, 171 }, { 156, 156 },
+    { 228, 228 }, { 186, 186 }, { 214, 214 }, { 201, 201 },
+    {  79,  79 }, { 141, 141 }, { 110, 110 }, { 229, 229 },
+    {  95,  95 }, { 126, 126 }, { 215, 215 }, { 172, 172 },
+    { 111, 111 }, { 142, 142 }, { 202, 202 }, { 157, 157 },
+    { 216, 216 }, { 230, 230 }, { 217, 217 }, { 187, 187 },
+    { 127, 127 }, { 231, 231 }, { 158, 158 }, { 173, 173 },
+    { 143, 143 }, { 203, 203 }, { 188, 188 }, { 232, 232 },
+    { 218, 218 }, { 233, 233 }, { 159, 159 }, { 174, 174 },
+    { 204, 204 }, { 189, 189 }, { 234, 234 }, { 219, 219 },
+    { 175, 175 }, { 205, 205 }, { 235, 235 }, { 220, 220 },
+    { 190, 190 }, { 236, 236 }, { 206, 206 }, { 191, 191 },
+    { 221, 221 }, { 207, 207 }, { 237, 237 }, { 222, 222 },
+    { 238, 238 }, { 223, 223 }, { 239, 239 }, {   0,   0 }, // final pair is filler: nothing follows the last scan entry
+};
+
+const int16_t ff_vp9_default_scan_32x32_nb[1024][2] = {
+    {    0,    0 }, {    0,    0 }, {    1,    1 }, {   32,    1 },
+    {   32,   32 }, {    2,    2 }, {   33,    2 }, {   64,   33 },
+    {    3,    3 }, {   64,   64 }, {   34,    3 }, {   65,   34 },
+    {    4,    4 }, {   35,    4 }, {   96,   65 }, {   66,   35 },
+    {   96,   96 }, {   97,   66 }, {   67,   36 }, {   36,    5 },
+    {    5,    5 }, {  128,   97 }, {   98,   67 }, {    6,    6 },
+    {  128,  128 }, {   68,   37 }, {   37,    6 }, {  129,   98 },
+    {   99,   68 }, {  160,  129 }, {  130,   99 }, {   38,    7 },
+    {   69,   38 }, {    7,    7 }, {  100,   69 }, {  161,  130 },
+    {  131,  100 }, {  160,  160 }, {   70,   39 }, {   39,    8 },
+    {    8,    8 }, {  101,   70 }, {  162,  131 }, {  132,  101 },
+    {  192,  161 }, {   71,   40 }, {  192,  192 }, {  102,   71 },
+    {   40,    9 }, {  163,  132 }, {    9,    9 }, {  193,  162 },
+    {  133,  102 }, {  164,  133 }, {   72,   41 }, {  103,   72 },
+    {  134,  103 }, {  224,  193 }, {   41,   10 }, {  194,  163 },
+    {   10,   10 }, {  224,  224 }, {  165,  134 }, {  225,  194 },
+    {  195,  164 }, {   73,   42 }, {  104,   73 }, {  135,  104 },
+    {   42,   11 }, {   11,   11 }, {  166,  135 }, {  196,  165 },
+    {  226,  195 }, {  256,  225 }, {   74,   43 }, {  105,   74 },
+    {  136,  105 }, {  227,  196 }, {   43,   12 }, {  197,  166 },
+    {  167,  136 }, {  257,  226 }, {  256,  256 }, {   12,   12 },
+    {  228,  197 }, {   75,   44 }, {  106,   75 }, {  198,  167 },
+    {  137,  106 }, {  258,  227 }, {  168,  137 }, {  288,  257 },
+    {   44,   13 }, {  229,  198 }, {  259,  228 }, {  199,  168 },
+    {  107,   76 }, {   13,   13 }, {  169,  138 }, {  138,  107 },
+    {  288,  288 }, {  289,  258 }, {   76,   45 }, {  230,  199 },
+    {  260,  229 }, {   45,   14 }, {  200,  169 }, {  139,  108 },
+    {  290,  259 }, {  108,   77 }, {  231,  200 }, {  320,  289 },
+    {  261,  230 }, {  170,  139 }, {   77,   46 }, {  291,  260 },
+    {   14,   14 }, {  321,  290 }, {  201,  170 }, {  262,  231 },
+    {  320,  320 }, {  171,  140 }, {  292,  261 }, {  232,  201 },
+    {  140,  109 }, {  322,  291 }, {  109,   78 }, {   46,   15 },
+    {  202,  171 }, {  263,  232 }, {  233,  202 }, {  293,  262 },
+    {  352,  321 }, {  323,  292 }, {   15,   15 }, {   78,   47 },
+    {  203,  172 }, {  264,  233 }, {  294,  263 }, {  324,  293 },
+    {  172,  141 }, {  353,  322 }, {  141,  110 }, {  234,  203 },
+    {  352,  352 }, {   47,   16 }, {  295,  264 }, {  110,   79 },
+    {  265,  234 }, {  354,  323 }, {  325,  294 }, {   79,   48 },
+    {   16,   16 }, {  204,  173 }, {  235,  204 }, {  173,  142 },
+    {  355,  324 }, {  384,  353 }, {  326,  295 }, {  142,  111 },
+    {  296,  265 }, {  266,  235 }, {  356,  325 }, {  385,  354 },
+    {  111,   80 }, {   48,   17 }, {  327,  296 }, {  297,  266 },
+    {  205,  174 }, {  384,  384 }, {  236,  205 }, {  357,  326 },
+    {  386,  355 }, {   80,   49 }, {  174,  143 }, {   17,   17 },
+    {  328,  297 }, {  358,  327 }, {  387,  356 }, {  298,  267 },
+    {  329,  298 }, {  388,  357 }, {  112,   81 }, {  416,  385 },
+    {  237,  206 }, {  359,  328 }, {   49,   18 }, {  206,  175 },
+    {  417,  386 }, {  389,  358 }, {  330,  299 }, {   18,   18 },
+    {  416,  416 }, {  360,  329 }, {   81,   50 }, {  418,  387 },
+    {  390,  359 }, {  238,  207 }, {   50,   19 }, {  361,  330 },
+    {  419,  388 }, {  113,   82 }, {  448,  417 }, {  448,  448 },
+    {  420,  389 }, {   82,   51 }, {  362,  331 }, {  449,  418 },
+    {  421,  390 }, {  480,  480 }, {  450,  419 }, {  422,  391 },
+    {  114,   83 }, {  451,  420 }, {  480,  449 }, {  452,  421 },
+    {  481,  450 }, {  453,  422 }, {  512,  512 }, {  482,  451 },
+    {  454,  423 }, {  512,  481 }, {  483,  452 }, {  513,  482 },
+    {  484,  453 }, {  514,  483 }, {  485,  454 }, {  544,  513 },
+    {  544,  544 }, {  486,  455 }, {  545,  514 }, {  546,  515 },
+    {  576,  576 }, {  576,  545 }, {  577,  546 }, {  578,  547 },
+    {  608,  577 }, {  609,  578 }, {  610,  579 }, {   19,   19 },
+    {  143,  112 }, {  267,  236 }, {  391,  360 }, {  515,  484 },
+    {  608,  608 }, {   20,   20 }, {   51,   20 }, {  144,  113 },
+    {  175,  144 }, {  268,  237 }, {  299,  268 }, {  392,  361 },
+    {  423,  392 }, {  516,  485 }, {  547,  516 }, {  640,  609 },
+    {  640,  640 }, {   21,   21 }, {   52,   21 }, {   83,   52 },
+    {  145,  114 }, {  176,  145 }, {  207,  176 }, {  269,  238 },
+    {  300,  269 }, {  331,  300 }, {  393,  362 }, {  424,  393 },
+    {  455,  424 }, {  517,  486 }, {  548,  517 }, {  579,  548 },
+    {  641,  610 }, {  672,  641 }, {  672,  672 }, {   22,   22 },
+    {   53,   22 }, {   84,   53 }, {  115,   84 }, {  146,  115 },
+    {  177,  146 }, {  208,  177 }, {  239,  208 }, {  270,  239 },
+    {  301,  270 }, {  332,  301 }, {  363,  332 }, {  394,  363 },
+    {  425,  394 }, {  456,  425 }, {  487,  456 }, {  518,  487 },
+    {  549,  518 }, {  580,  549 }, {  611,  580 }, {  642,  611 },
+    {  673,  642 }, {  704,  673 }, {  704,  704 }, {   54,   23 },
+    {   85,   54 }, {  116,   85 }, {  178,  147 }, {  209,  178 },
+    {  240,  209 }, {  302,  271 }, {  333,  302 }, {  364,  333 },
+    {  426,  395 }, {  457,  426 }, {  488,  457 }, {  550,  519 },
+    {  581,  550 }, {  612,  581 }, {  674,  643 }, {  705,  674 },
+    {  736,  705 }, {   86,   55 }, {  117,   86 }, {  210,  179 },
+    {  241,  210 }, {  334,  303 }, {  365,  334 }, {  458,  427 },
+    {  489,  458 }, {  582,  551 }, {  613,  582 }, {  706,  675 },
+    {  737,  706 }, {  118,   87 }, {  242,  211 }, {  366,  335 },
+    {  490,  459 }, {  614,  583 }, {  738,  707 }, {   23,   23 },
+    {  147,  116 }, {  271,  240 }, {  395,  364 }, {  519,  488 },
+    {  643,  612 }, {  736,  736 }, {   24,   24 }, {   55,   24 },
+    {  148,  117 }, {  179,  148 }, {  272,  241 }, {  303,  272 },
+    {  396,  365 }, {  427,  396 }, {  520,  489 }, {  551,  520 },
+    {  644,  613 }, {  675,  644 }, {  768,  737 }, {  768,  768 },
+    {   25,   25 }, {   56,   25 }, {   87,   56 }, {  149,  118 },
+    {  180,  149 }, {  211,  180 }, {  273,  242 }, {  304,  273 },
+    {  335,  304 }, {  397,  366 }, {  428,  397 }, {  459,  428 },
+    {  521,  490 }, {  552,  521 }, {  583,  552 }, {  645,  614 },
+    {  676,  645 }, {  707,  676 }, {  769,  738 }, {  800,  769 },
+    {  800,  800 }, {   26,   26 }, {   57,   26 }, {   88,   57 },
+    {  119,   88 }, {  150,  119 }, {  181,  150 }, {  212,  181 },
+    {  243,  212 }, {  274,  243 }, {  305,  274 }, {  336,  305 },
+    {  367,  336 }, {  398,  367 }, {  429,  398 }, {  460,  429 },
+    {  491,  460 }, {  522,  491 }, {  553,  522 }, {  584,  553 },
+    {  615,  584 }, {  646,  615 }, {  677,  646 }, {  708,  677 },
+    {  739,  708 }, {  770,  739 }, {  801,  770 }, {  832,  801 },
+    {  832,  832 }, {   58,   27 }, {   89,   58 }, {  120,   89 },
+    {  182,  151 }, {  213,  182 }, {  244,  213 }, {  306,  275 },
+    {  337,  306 }, {  368,  337 }, {  430,  399 }, {  461,  430 },
+    {  492,  461 }, {  554,  523 }, {  585,  554 }, {  616,  585 },
+    {  678,  647 }, {  709,  678 }, {  740,  709 }, {  802,  771 },
+    {  833,  802 }, {  864,  833 }, {   90,   59 }, {  121,   90 },
+    {  214,  183 }, {  245,  214 }, {  338,  307 }, {  369,  338 },
+    {  462,  431 }, {  493,  462 }, {  586,  555 }, {  617,  586 },
+    {  710,  679 }, {  741,  710 }, {  834,  803 }, {  865,  834 },
+    {  122,   91 }, {  246,  215 }, {  370,  339 }, {  494,  463 },
+    {  618,  587 }, {  742,  711 }, {  866,  835 }, {   27,   27 },
+    {  151,  120 }, {  275,  244 }, {  399,  368 }, {  523,  492 },
+    {  647,  616 }, {  771,  740 }, {  864,  864 }, {   28,   28 },
+    {   59,   28 }, {  152,  121 }, {  183,  152 }, {  276,  245 },
+    {  307,  276 }, {  400,  369 }, {  431,  400 }, {  524,  493 },
+    {  555,  524 }, {  648,  617 }, {  679,  648 }, {  772,  741 },
+    {  803,  772 }, {  896,  865 }, {  896,  896 }, {   29,   29 },
+    {   60,   29 }, {   91,   60 }, {  153,  122 }, {  184,  153 },
+    {  215,  184 }, {  277,  246 }, {  308,  277 }, {  339,  308 },
+    {  401,  370 }, {  432,  401 }, {  463,  432 }, {  525,  494 },
+    {  556,  525 }, {  587,  556 }, {  649,  618 }, {  680,  649 },
+    {  711,  680 }, {  773,  742 }, {  804,  773 }, {  835,  804 },
+    {  897,  866 }, {  928,  897 }, {  928,  928 }, {   30,   30 },
+    {   61,   30 }, {   92,   61 }, {  123,   92 }, {  154,  123 },
+    {  185,  154 }, {  216,  185 }, {  247,  216 }, {  278,  247 },
+    {  309,  278 }, {  340,  309 }, {  371,  340 }, {  402,  371 },
+    {  433,  402 }, {  464,  433 }, {  495,  464 }, {  526,  495 },
+    {  557,  526 }, {  588,  557 }, {  619,  588 }, {  650,  619 },
+    {  681,  650 }, {  712,  681 }, {  743,  712 }, {  774,  743 },
+    {  805,  774 }, {  836,  805 }, {  867,  836 }, {  898,  867 },
+    {  929,  898 }, {  960,  929 }, {  960,  960 }, {   62,   31 },
+    {   93,   62 }, {  124,   93 }, {  186,  155 }, {  217,  186 },
+    {  248,  217 }, {  310,  279 }, {  341,  310 }, {  372,  341 },
+    {  434,  403 }, {  465,  434 }, {  496,  465 }, {  558,  527 },
+    {  589,  558 }, {  620,  589 }, {  682,  651 }, {  713,  682 },
+    {  744,  713 }, {  806,  775 }, {  837,  806 }, {  868,  837 },
+    {  930,  899 }, {  961,  930 }, {  992,  961 }, {   94,   63 },
+    {  125,   94 }, {  218,  187 }, {  249,  218 }, {  342,  311 },
+    {  373,  342 }, {  466,  435 }, {  497,  466 }, {  590,  559 },
+    {  621,  590 }, {  714,  683 }, {  745,  714 }, {  838,  807 },
+    {  869,  838 }, {  962,  931 }, {  993,  962 }, {  126,   95 },
+    {  250,  219 }, {  374,  343 }, {  498,  467 }, {  622,  591 },
+    {  746,  715 }, {  870,  839 }, {  994,  963 }, {  155,  124 },
+    {  279,  248 }, {  403,  372 }, {  527,  496 }, {  651,  620 },
+    {  775,  744 }, {  899,  868 }, {  156,  125 }, {  187,  156 },
+    {  280,  249 }, {  311,  280 }, {  404,  373 }, {  435,  404 },
+    {  528,  497 }, {  559,  528 }, {  652,  621 }, {  683,  652 },
+    {  776,  745 }, {  807,  776 }, {  900,  869 }, {  931,  900 },
+    {  157,  126 }, {  188,  157 }, {  219,  188 }, {  281,  250 },
+    {  312,  281 }, {  343,  312 }, {  405,  374 }, {  436,  405 },
+    {  467,  436 }, {  529,  498 }, {  560,  529 }, {  591,  560 },
+    {  653,  622 }, {  684,  653 }, {  715,  684 }, {  777,  746 },
+    {  808,  777 }, {  839,  808 }, {  901,  870 }, {  932,  901 },
+    {  963,  932 }, {  158,  127 }, {  189,  158 }, {  220,  189 },
+    {  251,  220 }, {  282,  251 }, {  313,  282 }, {  344,  313 },
+    {  375,  344 }, {  406,  375 }, {  437,  406 }, {  468,  437 },
+    {  499,  468 }, {  530,  499 }, {  561,  530 }, {  592,  561 },
+    {  623,  592 }, {  654,  623 }, {  685,  654 }, {  716,  685 },
+    {  747,  716 }, {  778,  747 }, {  809,  778 }, {  840,  809 },
+    {  871,  840 }, {  902,  871 }, {  933,  902 }, {  964,  933 },
+    {  995,  964 }, {  190,  159 }, {  221,  190 }, {  252,  221 },
+    {  314,  283 }, {  345,  314 }, {  376,  345 }, {  438,  407 },
+    {  469,  438 }, {  500,  469 }, {  562,  531 }, {  593,  562 },
+    {  624,  593 }, {  686,  655 }, {  717,  686 }, {  748,  717 },
+    {  810,  779 }, {  841,  810 }, {  872,  841 }, {  934,  903 },
+    {  965,  934 }, {  996,  965 }, {  222,  191 }, {  253,  222 },
+    {  346,  315 }, {  377,  346 }, {  470,  439 }, {  501,  470 },
+    {  594,  563 }, {  625,  594 }, {  718,  687 }, {  749,  718 },
+    {  842,  811 }, {  873,  842 }, {  966,  935 }, {  997,  966 },
+    {  254,  223 }, {  378,  347 }, {  502,  471 }, {  626,  595 },
+    {  750,  719 }, {  874,  843 }, {  998,  967 }, {  283,  252 },
+    {  407,  376 }, {  531,  500 }, {  655,  624 }, {  779,  748 },
+    {  903,  872 }, {  284,  253 }, {  315,  284 }, {  408,  377 },
+    {  439,  408 }, {  532,  501 }, {  563,  532 }, {  656,  625 },
+    {  687,  656 }, {  780,  749 }, {  811,  780 }, {  904,  873 },
+    {  935,  904 }, {  285,  254 }, {  316,  285 }, {  347,  316 },
+    {  409,  378 }, {  440,  409 }, {  471,  440 }, {  533,  502 },
+    {  564,  533 }, {  595,  564 }, {  657,  626 }, {  688,  657 },
+    {  719,  688 }, {  781,  750 }, {  812,  781 }, {  843,  812 },
+    {  905,  874 }, {  936,  905 }, {  967,  936 }, {  286,  255 },
+    {  317,  286 }, {  348,  317 }, {  379,  348 }, {  410,  379 },
+    {  441,  410 }, {  472,  441 }, {  503,  472 }, {  534,  503 },
+    {  565,  534 }, {  596,  565 }, {  627,  596 }, {  658,  627 },
+    {  689,  658 }, {  720,  689 }, {  751,  720 }, {  782,  751 },
+    {  813,  782 }, {  844,  813 }, {  875,  844 }, {  906,  875 },
+    {  937,  906 }, {  968,  937 }, {  999,  968 }, {  318,  287 },
+    {  349,  318 }, {  380,  349 }, {  442,  411 }, {  473,  442 },
+    {  504,  473 }, {  566,  535 }, {  597,  566 }, {  628,  597 },
+    {  690,  659 }, {  721,  690 }, {  752,  721 }, {  814,  783 },
+    {  845,  814 }, {  876,  845 }, {  938,  907 }, {  969,  938 },
+    { 1000,  969 }, {  350,  319 }, {  381,  350 }, {  474,  443 },
+    {  505,  474 }, {  598,  567 }, {  629,  598 }, {  722,  691 },
+    {  753,  722 }, {  846,  815 }, {  877,  846 }, {  970,  939 },
+    { 1001,  970 }, {  382,  351 }, {  506,  475 }, {  630,  599 },
+    {  754,  723 }, {  878,  847 }, { 1002,  971 }, {  411,  380 },
+    {  535,  504 }, {  659,  628 }, {  783,  752 }, {  907,  876 },
+    {  412,  381 }, {  443,  412 }, {  536,  505 }, {  567,  536 },
+    {  660,  629 }, {  691,  660 }, {  784,  753 }, {  815,  784 },
+    {  908,  877 }, {  939,  908 }, {  413,  382 }, {  444,  413 },
+    {  475,  444 }, {  537,  506 }, {  568,  537 }, {  599,  568 },
+    {  661,  630 }, {  692,  661 }, {  723,  692 }, {  785,  754 },
+    {  816,  785 }, {  847,  816 }, {  909,  878 }, {  940,  909 },
+    {  971,  940 }, {  414,  383 }, {  445,  414 }, {  476,  445 },
+    {  507,  476 }, {  538,  507 }, {  569,  538 }, {  600,  569 },
+    {  631,  600 }, {  662,  631 }, {  693,  662 }, {  724,  693 },
+    {  755,  724 }, {  786,  755 }, {  817,  786 }, {  848,  817 },
+    {  879,  848 }, {  910,  879 }, {  941,  910 }, {  972,  941 },
+    { 1003,  972 }, {  446,  415 }, {  477,  446 }, {  508,  477 },
+    {  570,  539 }, {  601,  570 }, {  632,  601 }, {  694,  663 },
+    {  725,  694 }, {  756,  725 }, {  818,  787 }, {  849,  818 },
+    {  880,  849 }, {  942,  911 }, {  973,  942 }, { 1004,  973 },
+    {  478,  447 }, {  509,  478 }, {  602,  571 }, {  633,  602 },
+    {  726,  695 }, {  757,  726 }, {  850,  819 }, {  881,  850 },
+    {  974,  943 }, { 1005,  974 }, {  510,  479 }, {  634,  603 },
+    {  758,  727 }, {  882,  851 }, { 1006,  975 }, {  539,  508 },
+    {  663,  632 }, {  787,  756 }, {  911,  880 }, {  540,  509 },
+    {  571,  540 }, {  664,  633 }, {  695,  664 }, {  788,  757 },
+    {  819,  788 }, {  912,  881 }, {  943,  912 }, {  541,  510 },
+    {  572,  541 }, {  603,  572 }, {  665,  634 }, {  696,  665 },
+    {  727,  696 }, {  789,  758 }, {  820,  789 }, {  851,  820 },
+    {  913,  882 }, {  944,  913 }, {  975,  944 }, {  542,  511 },
+    {  573,  542 }, {  604,  573 }, {  635,  604 }, {  666,  635 },
+    {  697,  666 }, {  728,  697 }, {  759,  728 }, {  790,  759 },
+    {  821,  790 }, {  852,  821 }, {  883,  852 }, {  914,  883 },
+    {  945,  914 }, {  976,  945 }, { 1007,  976 }, {  574,  543 },
+    {  605,  574 }, {  636,  605 }, {  698,  667 }, {  729,  698 },
+    {  760,  729 }, {  822,  791 }, {  853,  822 }, {  884,  853 },
+    {  946,  915 }, {  977,  946 }, { 1008,  977 }, {  606,  575 },
+    {  637,  606 }, {  730,  699 }, {  761,  730 }, {  854,  823 },
+    {  885,  854 }, {  978,  947 }, { 1009,  978 }, {  638,  607 },
+    {  762,  731 }, {  886,  855 }, { 1010,  979 }, {  667,  636 },
+    {  791,  760 }, {  915,  884 }, {  668,  637 }, {  699,  668 },
+    {  792,  761 }, {  823,  792 }, {  916,  885 }, {  947,  916 },
+    {  669,  638 }, {  700,  669 }, {  731,  700 }, {  793,  762 },
+    {  824,  793 }, {  855,  824 }, {  917,  886 }, {  948,  917 },
+    {  979,  948 }, {  670,  639 }, {  701,  670 }, {  732,  701 },
+    {  763,  732 }, {  794,  763 }, {  825,  794 }, {  856,  825 },
+    {  887,  856 }, {  918,  887 }, {  949,  918 }, {  980,  949 },
+    { 1011,  980 }, {  702,  671 }, {  733,  702 }, {  764,  733 },
+    {  826,  795 }, {  857,  826 }, {  888,  857 }, {  950,  919 },
+    {  981,  950 }, { 1012,  981 }, {  734,  703 }, {  765,  734 },
+    {  858,  827 }, {  889,  858 }, {  982,  951 }, { 1013,  982 },
+    {  766,  735 }, {  890,  859 }, { 1014,  983 }, {  795,  764 },
+    {  919,  888 }, {  796,  765 }, {  827,  796 }, {  920,  889 },
+    {  951,  920 }, {  797,  766 }, {  828,  797 }, {  859,  828 },
+    {  921,  890 }, {  952,  921 }, {  983,  952 }, {  798,  767 },
+    {  829,  798 }, {  860,  829 }, {  891,  860 }, {  922,  891 },
+    {  953,  922 }, {  984,  953 }, { 1015,  984 }, {  830,  799 },
+    {  861,  830 }, {  892,  861 }, {  954,  923 }, {  985,  954 },
+    { 1016,  985 }, {  862,  831 }, {  893,  862 }, {  986,  955 },
+    { 1017,  986 }, {  894,  863 }, { 1018,  987 }, {  923,  892 },
+    {  924,  893 }, {  955,  924 }, {  925,  894 }, {  956,  925 },
+    {  987,  956 }, {  926,  895 }, {  957,  926 }, {  988,  957 },
+    { 1019,  988 }, {  958,  927 }, {  989,  958 }, { 1020,  989 },
+    {  990,  959 }, { 1021,  990 }, { 1022,  991 }, {    0,    0 },
+};
+
+const int16_t (*ff_vp9_scans_nb[5][4])[2] = { // 5 groups x 4 slots of pointers to int16_t[2] pair tables ("_nb" presumably = neighbours, confirm)
+    { // 4x4 tables; slots are: default, col, row, default
+        ff_vp9_default_scan_4x4_nb, ff_vp9_col_scan_4x4_nb,
+        ff_vp9_row_scan_4x4_nb, ff_vp9_default_scan_4x4_nb
+    }, { // 8x8 tables, same slot layout
+        ff_vp9_default_scan_8x8_nb, ff_vp9_col_scan_8x8_nb,
+        ff_vp9_row_scan_8x8_nb, ff_vp9_default_scan_8x8_nb
+    }, { // 16x16 tables, same slot layout
+        ff_vp9_default_scan_16x16_nb, ff_vp9_col_scan_16x16_nb,
+        ff_vp9_row_scan_16x16_nb, ff_vp9_default_scan_16x16_nb
+    }, { // 32x32: every slot points at the default table
+        ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb,
+        ff_vp9_default_scan_32x32_nb, ff_vp9_default_scan_32x32_nb
+    }, { // lossless: every slot points at the 4x4 default table
+        ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb,
+        ff_vp9_default_scan_4x4_nb, ff_vp9_default_scan_4x4_nb
+    }
+};
+
+const uint8_t ff_vp9_model_pareto8[256][8] = { // 256 rows of 8 byte values; presumably modelled probabilities (name-based guess, confirm at the use site)
+    {   6,  86, 128,  11,  87,  42,  91,  52 }, // row 0 holds the same values as row 2
+    {   3,  86, 128,   6,  86,  23,  88,  29 },
+    {   6,  86, 128,  11,  87,  42,  91,  52 }, // row 2 (same values as row 0)
+    {   9,  86, 129,  17,  88,  61,  94,  76 },
+    {  12,  86, 129,  22,  88,  77,  97,  93 },
+    {  15,  87, 129,  28,  89,  93, 100, 110 },
+    {  17,  87, 129,  33,  90, 105, 103, 123 },
+    {  20,  88, 130,  38,  91, 118, 106, 136 },
+    {  23,  88, 130,  43,  91, 128, 108, 146 },
+    {  26,  89, 131,  48,  92, 139, 111, 156 },
+    {  28,  89, 131,  53,  93, 147, 114, 163 },
+    {  31,  90, 131,  58,  94, 156, 117, 171 },
+    {  34,  90, 131,  62,  94, 163, 119, 177 },
+    {  37,  90, 132,  66,  95, 171, 122, 184 },
+    {  39,  90, 132,  70,  96, 177, 124, 189 },
+    {  42,  91, 132,  75,  97, 183, 127, 194 },
+    {  44,  91, 132,  79,  97, 188, 129, 198 },
+    {  47,  92, 133,  83,  98, 193, 132, 202 },
+    {  49,  92, 133,  86,  99, 197, 134, 205 },
+    {  52,  93, 133,  90, 100, 201, 137, 208 },
+    {  54,  93, 133,  94, 100, 204, 139, 211 },
+    {  57,  94, 134,  98, 101, 208, 142, 214 },
+    {  59,  94, 134, 101, 102, 211, 144, 216 },
+    {  62,  94, 135, 105, 103, 214, 146, 218 },
+    {  64,  94, 135, 108, 103, 216, 148, 220 },
+    {  66,  95, 135, 111, 104, 219, 151, 222 },
+    {  68,  95, 135, 114, 105, 221, 153, 223 },
+    {  71,  96, 136, 117, 106, 224, 155, 225 },
+    {  73,  96, 136, 120, 106, 225, 157, 226 },
+    {  76,  97, 136, 123, 107, 227, 159, 228 },
+    {  78,  97, 136, 126, 108, 229, 160, 229 },
+    {  80,  98, 137, 129, 109, 231, 162, 231 },
+    {  82,  98, 137, 131, 109, 232, 164, 232 },
+    {  84,  98, 138, 134, 110, 234, 166, 233 },
+    {  86,  98, 138, 137, 111, 235, 168, 234 },
+    {  89,  99, 138, 140, 112, 236, 170, 235 },
+    {  91,  99, 138, 142, 112, 237, 171, 235 },
+    {  93, 100, 139, 145, 113, 238, 173, 236 },
+    {  95, 100, 139, 147, 114, 239, 174, 237 },
+    {  97, 101, 140, 149, 115, 240, 176, 238 },
+    {  99, 101, 140, 151, 115, 241, 177, 238 },
+    { 101, 102, 140, 154, 116, 242, 179, 239 },
+    { 103, 102, 140, 156, 117, 242, 180, 239 },
+    { 105, 103, 141, 158, 118, 243, 182, 240 },
+    { 107, 103, 141, 160, 118, 243, 183, 240 },
+    { 109, 104, 141, 162, 119, 244, 185, 241 },
+    { 111, 104, 141, 164, 119, 244, 186, 241 },
+    { 113, 104, 142, 166, 120, 245, 187, 242 },
+    { 114, 104, 142, 168, 121, 245, 188, 242 },
+    { 116, 105, 143, 170, 122, 246, 190, 243 },
+    { 118, 105, 143, 171, 122, 246, 191, 243 },
+    { 120, 106, 143, 173, 123, 247, 192, 244 },
+    { 121, 106, 143, 175, 124, 247, 193, 244 },
+    { 123, 107, 144, 177, 125, 248, 195, 244 },
+    { 125, 107, 144, 178, 125, 248, 196, 244 },
+    { 127, 108, 145, 180, 126, 249, 197, 245 },
+    { 128, 108, 145, 181, 127, 249, 198, 245 },
+    { 130, 109, 145, 183, 128, 249, 199, 245 },
+    { 132, 109, 145, 184, 128, 249, 200, 245 },
+    { 134, 110, 146, 186, 129, 250, 201, 246 },
+    { 135, 110, 146, 187, 130, 250, 202, 246 },
+    { 137, 111, 147, 189, 131, 251, 203, 246 },
+    { 138, 111, 147, 190, 131, 251, 204, 246 },
+    { 140, 112, 147, 192, 132, 251, 205, 247 },
+    { 141, 112, 147, 193, 132, 251, 206, 247 },
+    { 143, 113, 148, 194, 133, 251, 207, 247 },
+    { 144, 113, 148, 195, 134, 251, 207, 247 },
+    { 146, 114, 149, 197, 135, 252, 208, 248 },
+    { 147, 114, 149, 198, 135, 252, 209, 248 },
+    { 149, 115, 149, 199, 136, 252, 210, 248 },
+    { 150, 115, 149, 200, 137, 252, 210, 248 },
+    { 152, 115, 150, 201, 138, 252, 211, 248 },
+    { 153, 115, 150, 202, 138, 252, 212, 248 },
+    { 155, 116, 151, 204, 139, 253, 213, 249 },
+    { 156, 116, 151, 205, 139, 253, 213, 249 },
+    { 158, 117, 151, 206, 140, 253, 214, 249 },
+    { 159, 117, 151, 207, 141, 253, 215, 249 },
+    { 161, 118, 152, 208, 142, 253, 216, 249 },
+    { 162, 118, 152, 209, 142, 253, 216, 249 },
+    { 163, 119, 153, 210, 143, 253, 217, 249 },
+    { 164, 119, 153, 211, 143, 253, 217, 249 },
+    { 166, 120, 153, 212, 144, 254, 218, 250 },
+    { 167, 120, 153, 212, 145, 254, 219, 250 },
+    { 168, 121, 154, 213, 146, 254, 220, 250 },
+    { 169, 121, 154, 214, 146, 254, 220, 250 },
+    { 171, 122, 155, 215, 147, 254, 221, 250 },
+    { 172, 122, 155, 216, 147, 254, 221, 250 },
+    { 173, 123, 155, 217, 148, 254, 222, 250 },
+    { 174, 123, 155, 217, 149, 254, 222, 250 },
+    { 176, 124, 156, 218, 150, 254, 223, 250 },
+    { 177, 124, 156, 219, 150, 254, 223, 250 },
+    { 178, 125, 157, 220, 151, 254, 224, 251 },
+    { 179, 125, 157, 220, 151, 254, 224, 251 },
+    { 180, 126, 157, 221, 152, 254, 225, 251 },
+    { 181, 126, 157, 221, 152, 254, 225, 251 },
+    { 183, 127, 158, 222, 153, 254, 226, 251 },
+    { 184, 127, 158, 223, 154, 254, 226, 251 },
+    { 185, 128, 159, 224, 155, 255, 227, 251 },
+    { 186, 128, 159, 224, 155, 255, 227, 251 },
+    { 187, 129, 160, 225, 156, 255, 228, 251 },
+    { 188, 130, 160, 225, 156, 255, 228, 251 },
+    { 189, 131, 160, 226, 157, 255, 228, 251 },
+    { 190, 131, 160, 226, 158, 255, 228, 251 },
+    { 191, 132, 161, 227, 159, 255, 229, 251 },
+    { 192, 132, 161, 227, 159, 255, 229, 251 },
+    { 193, 133, 162, 228, 160, 255, 230, 252 },
+    { 194, 133, 162, 229, 160, 255, 230, 252 },
+    { 195, 134, 163, 230, 161, 255, 231, 252 },
+    { 196, 134, 163, 230, 161, 255, 231, 252 },
+    { 197, 135, 163, 231, 162, 255, 231, 252 },
+    { 198, 135, 163, 231, 162, 255, 231, 252 },
+    { 199, 136, 164, 232, 163, 255, 232, 252 },
+    { 200, 136, 164, 232, 164, 255, 232, 252 },
+    { 201, 137, 165, 233, 165, 255, 233, 252 },
+    { 201, 137, 165, 233, 165, 255, 233, 252 },
+    { 202, 138, 166, 233, 166, 255, 233, 252 },
+    { 203, 138, 166, 233, 166, 255, 233, 252 },
+    { 204, 139, 166, 234, 167, 255, 234, 252 },
+    { 205, 139, 166, 234, 167, 255, 234, 252 },
+    { 206, 140, 167, 235, 168, 255, 235, 252 },
+    { 206, 140, 167, 235, 168, 255, 235, 252 },
+    { 207, 141, 168, 236, 169, 255, 235, 252 },
+    { 208, 141, 168, 236, 170, 255, 235, 252 },
+    { 209, 142, 169, 237, 171, 255, 236, 252 },
+    { 209, 143, 169, 237, 171, 255, 236, 252 },
+    { 210, 144, 169, 237, 172, 255, 236, 252 },
+    { 211, 144, 169, 237, 172, 255, 236, 252 },
+    { 212, 145, 170, 238, 173, 255, 237, 252 },
+    { 213, 145, 170, 238, 173, 255, 237, 252 },
+    { 214, 146, 171, 239, 174, 255, 237, 253 },
+    { 214, 146, 171, 239, 174, 255, 237, 253 },
+    { 215, 147, 172, 240, 175, 255, 238, 253 },
+    { 215, 147, 172, 240, 175, 255, 238, 253 },
+    { 216, 148, 173, 240, 176, 255, 238, 253 },
+    { 217, 148, 173, 240, 176, 255, 238, 253 },
+    { 218, 149, 173, 241, 177, 255, 239, 253 },
+    { 218, 149, 173, 241, 178, 255, 239, 253 },
+    { 219, 150, 174, 241, 179, 255, 239, 253 },
+    { 219, 151, 174, 241, 179, 255, 239, 253 },
+    { 220, 152, 175, 242, 180, 255, 240, 253 },
+    { 221, 152, 175, 242, 180, 255, 240, 253 },
+    { 222, 153, 176, 242, 181, 255, 240, 253 },
+    { 222, 153, 176, 242, 181, 255, 240, 253 },
+    { 223, 154, 177, 243, 182, 255, 240, 253 },
+    { 223, 154, 177, 243, 182, 255, 240, 253 },
+    { 224, 155, 178, 244, 183, 255, 241, 253 },
+    { 224, 155, 178, 244, 183, 255, 241, 253 },
+    { 225, 156, 178, 244, 184, 255, 241, 253 },
+    { 225, 157, 178, 244, 184, 255, 241, 253 },
+    { 226, 158, 179, 244, 185, 255, 242, 253 },
+    { 227, 158, 179, 244, 185, 255, 242, 253 },
+    { 228, 159, 180, 245, 186, 255, 242, 253 },
+    { 228, 159, 180, 245, 186, 255, 242, 253 },
+    { 229, 160, 181, 245, 187, 255, 242, 253 },
+    { 229, 160, 181, 245, 187, 255, 242, 253 },
+    { 230, 161, 182, 246, 188, 255, 243, 253 },
+    { 230, 162, 182, 246, 188, 255, 243, 253 },
+    { 231, 163, 183, 246, 189, 255, 243, 253 },
+    { 231, 163, 183, 246, 189, 255, 243, 253 },
+    { 232, 164, 184, 247, 190, 255, 243, 253 },
+    { 232, 164, 184, 247, 190, 255, 243, 253 },
+    { 233, 165, 185, 247, 191, 255, 244, 253 },
+    { 233, 165, 185, 247, 191, 255, 244, 253 },
+    { 234, 166, 185, 247, 192, 255, 244, 253 },
+    { 234, 167, 185, 247, 192, 255, 244, 253 },
+    { 235, 168, 186, 248, 193, 255, 244, 253 },
+    { 235, 168, 186, 248, 193, 255, 244, 253 },
+    { 236, 169, 187, 248, 194, 255, 244, 253 },
+    { 236, 169, 187, 248, 194, 255, 244, 253 },
+    { 236, 170, 188, 248, 195, 255, 245, 253 },
+    { 236, 170, 188, 248, 195, 255, 245, 253 },
+    { 237, 171, 189, 249, 196, 255, 245, 254 },
+    { 237, 172, 189, 249, 196, 255, 245, 254 },
+    { 238, 173, 190, 249, 197, 255, 245, 254 },
+    { 238, 173, 190, 249, 197, 255, 245, 254 },
+    { 239, 174, 191, 249, 198, 255, 245, 254 },
+    { 239, 174, 191, 249, 198, 255, 245, 254 },
+    { 240, 175, 192, 249, 199, 255, 246, 254 },
+    { 240, 176, 192, 249, 199, 255, 246, 254 },
+    { 240, 177, 193, 250, 200, 255, 246, 254 },
+    { 240, 177, 193, 250, 200, 255, 246, 254 },
+    { 241, 178, 194, 250, 201, 255, 246, 254 },
+    { 241, 178, 194, 250, 201, 255, 246, 254 },
+    { 242, 179, 195, 250, 202, 255, 246, 254 },
+    { 242, 180, 195, 250, 202, 255, 246, 254 },
+    { 242, 181, 196, 250, 203, 255, 247, 254 },
+    { 242, 181, 196, 250, 203, 255, 247, 254 },
+    { 243, 182, 197, 251, 204, 255, 247, 254 },
+    { 243, 183, 197, 251, 204, 255, 247, 254 },
+    { 244, 184, 198, 251, 205, 255, 247, 254 },
+    { 244, 184, 198, 251, 205, 255, 247, 254 },
+    { 244, 185, 199, 251, 206, 255, 247, 254 },
+    { 244, 185, 199, 251, 206, 255, 247, 254 },
+    { 245, 186, 200, 251, 207, 255, 247, 254 },
+    { 245, 187, 200, 251, 207, 255, 247, 254 },
+    { 246, 188, 201, 252, 207, 255, 248, 254 },
+    { 246, 188, 201, 252, 207, 255, 248, 254 },
+    { 246, 189, 202, 252, 208, 255, 248, 254 },
+    { 246, 190, 202, 252, 208, 255, 248, 254 },
+    { 247, 191, 203, 252, 209, 255, 248, 254 },
+    { 247, 191, 203, 252, 209, 255, 248, 254 },
+    { 247, 192, 204, 252, 210, 255, 248, 254 },
+    { 247, 193, 204, 252, 210, 255, 248, 254 },
+    { 248, 194, 205, 252, 211, 255, 248, 254 },
+    { 248, 194, 205, 252, 211, 255, 248, 254 },
+    { 248, 195, 206, 252, 212, 255, 249, 254 },
+    { 248, 196, 206, 252, 212, 255, 249, 254 },
+    { 249, 197, 207, 253, 213, 255, 249, 254 },
+    { 249, 197, 207, 253, 213, 255, 249, 254 },
+    { 249, 198, 208, 253, 214, 255, 249, 254 },
+    { 249, 199, 209, 253, 214, 255, 249, 254 },
+    { 250, 200, 210, 253, 215, 255, 249, 254 },
+    { 250, 200, 210, 253, 215, 255, 249, 254 },
+    { 250, 201, 211, 253, 215, 255, 249, 254 },
+    { 250, 202, 211, 253, 215, 255, 249, 254 },
+    { 250, 203, 212, 253, 216, 255, 249, 254 },
+    { 250, 203, 212, 253, 216, 255, 249, 254 },
+    { 251, 204, 213, 253, 217, 255, 250, 254 },
+    { 251, 205, 213, 253, 217, 255, 250, 254 },
+    { 251, 206, 214, 254, 218, 255, 250, 254 },
+    { 251, 206, 215, 254, 218, 255, 250, 254 },
+    { 252, 207, 216, 254, 219, 255, 250, 254 },
+    { 252, 208, 216, 254, 219, 255, 250, 254 },
+    { 252, 209, 217, 254, 220, 255, 250, 254 },
+    { 252, 210, 217, 254, 220, 255, 250, 254 },
+    { 252, 211, 218, 254, 221, 255, 250, 254 },
+    { 252, 212, 218, 254, 221, 255, 250, 254 },
+    { 253, 213, 219, 254, 222, 255, 250, 254 },
+    { 253, 213, 220, 254, 222, 255, 250, 254 },
+    { 253, 214, 221, 254, 223, 255, 250, 254 },
+    { 253, 215, 221, 254, 223, 255, 250, 254 },
+    { 253, 216, 222, 254, 224, 255, 251, 254 },
+    { 253, 217, 223, 254, 224, 255, 251, 254 },
+    { 253, 218, 224, 254, 225, 255, 251, 254 },
+    { 253, 219, 224, 254, 225, 255, 251, 254 },
+    { 254, 220, 225, 254, 225, 255, 251, 254 },
+    { 254, 221, 226, 254, 225, 255, 251, 254 },
+    { 254, 222, 227, 255, 226, 255, 251, 254 },
+    { 254, 223, 227, 255, 226, 255, 251, 254 },
+    { 254, 224, 228, 255, 227, 255, 251, 254 },
+    { 254, 225, 229, 255, 227, 255, 251, 254 },
+    { 254, 226, 230, 255, 228, 255, 251, 254 },
+    { 254, 227, 230, 255, 229, 255, 251, 254 },
+    { 255, 228, 231, 255, 230, 255, 251, 254 },
+    { 255, 229, 232, 255, 230, 255, 251, 254 },
+    { 255, 230, 233, 255, 231, 255, 252, 254 },
+    { 255, 231, 234, 255, 231, 255, 252, 254 },
+    { 255, 232, 235, 255, 232, 255, 252, 254 },
+    { 255, 233, 236, 255, 232, 255, 252, 254 },
+    { 255, 235, 237, 255, 233, 255, 252, 254 },
+    { 255, 236, 238, 255, 234, 255, 252, 254 },
+    { 255, 238, 240, 255, 235, 255, 252, 255 }, // NOTE(review): last value is 255 here but 254 in the adjacent rows; presumably intended, confirm against the reference before editing
+    { 255, 239, 241, 255, 235, 255, 252, 254 },
+    { 255, 241, 243, 255, 236, 255, 252, 254 },
+    { 255, 243, 245, 255, 237, 255, 252, 254 },
+    { 255, 246, 247, 255, 239, 255, 253, 255 },
+};
+
+const ProbContext ff_vp9_default_probs = { // default probability set; positional initializer, so the groups below must stay in ProbContext field order
+    { /* y_mode */
+        {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* bsize < 8x8 */,
+        { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* bsize < 16x16 */,
+        { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* bsize < 32x32 */,
+        { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* bsize >= 32x32 */
+    }, { /* uv_mode */
+        {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
+        {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
+        { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
+        {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
+        {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
+        {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
+        {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
+        {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
+        {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
+        { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
+    }, { /* filter */
+        { 235, 162, },
+        {  36, 255, },
+        {  34,   3, },
+        { 149, 144, },
+    }, { /* mv_mode */
+        {  2, 173,  34 },  // 0 = both zero mv
+        {  7, 145,  85 },  // 1 = one zero mv + one predicted mv
+        {  7, 166,  63 },  // 2 = two predicted mvs
+        {  7,  94,  66 },  // 3 = one predicted/zero and one new mv
+        {  8,  64,  46 },  // 4 = two new mvs
+        { 17,  81,  31 },  // 5 = one intra neighbor + x
+        { 25,  29,  30 },  // 6 = two intra neighbors
+    }, { /* intra */
+        9, 102, 187, 225
+    }, { /* comp */
+        239, 183, 119,  96,  41
+    }, { /* single_ref */
+        {  33,  16 },
+        {  77,  74 },
+        { 142, 142 },
+        { 172, 170 },
+        { 238, 247 }
+    }, { /* comp_ref */
+        50, 126, 123, 221, 226
+    }, { /* tx32p */
+        { 3, 136, 37, },
+        { 5,  52, 13, },
+    }, { /* tx16p */
+        { 20, 152, },
+        { 15, 101, },
+    }, { /* tx8p */
+        100, 66
+    }, { /* skip */
+        192, 128, 64
+    }, { /* mv_joint */
+        32, 64, 96
+    }, { /* per-component mv probabilities: [0] vertical, [1] horizontal */
+        { /* mv vertical component */
+            128, /* sign */
+            { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, /* class */
+            216, /* class0 */
+            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
+            { /* class0_fp */
+                { 128, 128, 64 },
+                {  96, 112, 64 }
+            },
+            { 64, 96, 64 }, /* fp */
+            160, /* class0_hp bit */
+            128, /* hp */
+        }, { /* mv horizontal component */
+            128, /* sign */
+            { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, /* class */
+            208, /* class0 */
+            { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, /* bits */
+            { /* class0_fp */
+                { 128, 128, 64 },
+                {  96, 112, 64 }
+            },
+            { 64, 96, 64 }, /* fp */
+            160, /* class0_hp bit */
+            128, /* hp */
+        }
+    }, { /* partition */
+        { /* 64x64 -> 32x32 */
+            { 222,  34,  30 } /* a/l both not split */,
+            {  72,  16,  44 } /* a split, l not split */,
+            {  58,  32,  12 } /* l split, a not split */,
+            {  10,   7,   6 } /* a/l both split */,
+        }, { /* 32x32 -> 16x16 */
+            { 177,  58,  59 } /* a/l both not split */,
+            {  68,  26,  63 } /* a split, l not split */,
+            {  52,  79,  25 } /* l split, a not split */,
+            {  17,  14,  12 } /* a/l both split */,
+        }, { /* 16x16 -> 8x8 */
+            { 174,  73,  87 } /* a/l both not split */,
+            {  92,  41,  83 } /* a split, l not split */,
+            {  82,  99,  50 } /* l split, a not split */,
+            {  53,  39,  39 } /* a/l both split */,
+        }, { /* 8x8 -> 4x4 */
+            { 199, 122, 141 } /* a/l both not split */,
+            { 147,  63, 159 } /* a split, l not split */,
+            { 148, 133, 118 } /* l split, a not split */,
+            { 121, 104, 114 } /* a/l both split */,
+        }
+    },
+};
+
+const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3] = { /* [tx size][block type][intra/inter][band][row][prob] */
+    { /* tx = 4x4 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0: here and in every Band 0 below only 3 of the 6 rows are listed; the rest are implicitly zero */
+                    { 195,  29, 183 },
+                    {  84,  49, 136 },
+                    {   8,  42,  71 }
+                }, { /* Coeff Band 1 */
+                    {  31, 107, 169 },
+                    {  35,  99, 159 },
+                    {  17,  82, 140 },
+                    {   8,  66, 114 },
+                    {   2,  44,  76 },
+                    {   1,  19,  32 }
+                }, { /* Coeff Band 2 */
+                    {  40, 132, 201 },
+                    {  29, 114, 187 },
+                    {  13,  91, 157 },
+                    {   7,  75, 127 },
+                    {   3,  58,  95 },
+                    {   1,  28,  47 }
+                }, { /* Coeff Band 3 */
+                    {  69, 142, 221 },
+                    {  42, 122, 201 },
+                    {  15,  91, 159 },
+                    {   6,  67, 121 },
+                    {   1,  42,  77 },
+                    {   1,  17,  31 }
+                }, { /* Coeff Band 4 */
+                    { 102, 148, 228 },
+                    {  67, 117, 204 },
+                    {  17,  82, 154 },
+                    {   6,  59, 114 },
+                    {   2,  39,  75 },
+                    {   1,  15,  29 }
+                }, { /* Coeff Band 5 */
+                    { 156,  57, 233 },
+                    { 119,  57, 212 },
+                    {  58,  48, 163 },
+                    {  29,  40, 124 },
+                    {  12,  30,  81 },
+                    {   3,  12,  31 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 191, 107, 226 },
+                    { 124, 117, 204 },
+                    {  25,  99, 155 }
+                }, { /* Coeff Band 1 */
+                    {  29, 148, 210 },
+                    {  37, 126, 194 },
+                    {   8,  93, 157 },
+                    {   2,  68, 118 },
+                    {   1,  39,  69 },
+                    {   1,  17,  33 }
+                }, { /* Coeff Band 2 */
+                    {  41, 151, 213 },
+                    {  27, 123, 193 },
+                    {   3,  82, 144 },
+                    {   1,  58, 105 },
+                    {   1,  32,  60 },
+                    {   1,  13,  26 }
+                }, { /* Coeff Band 3 */
+                    {  59, 159, 220 },
+                    {  23, 126, 198 },
+                    {   4,  88, 151 },
+                    {   1,  66, 114 },
+                    {   1,  38,  71 },
+                    {   1,  18,  34 }
+                }, { /* Coeff Band 4 */
+                    { 114, 136, 232 },
+                    {  51, 114, 207 },
+                    {  11,  83, 155 },
+                    {   3,  56, 105 },
+                    {   1,  33,  65 },
+                    {   1,  17,  34 }
+                }, { /* Coeff Band 5 */
+                    { 149,  65, 234 },
+                    { 121,  57, 215 },
+                    {  61,  49, 166 },
+                    {  28,  36, 114 },
+                    {  12,  25,  76 },
+                    {   3,  16,  42 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 214,  49, 220 },
+                    { 132,  63, 188 },
+                    {  42,  65, 137 }
+                }, { /* Coeff Band 1 */
+                    {  85, 137, 221 },
+                    { 104, 131, 216 },
+                    {  49, 111, 192 },
+                    {  21,  87, 155 },
+                    {   2,  49,  87 },
+                    {   1,  16,  28 }
+                }, { /* Coeff Band 2 */
+                    {  89, 163, 230 },
+                    {  90, 137, 220 },
+                    {  29, 100, 183 },
+                    {  10,  70, 135 },
+                    {   2,  42,  81 },
+                    {   1,  17,  33 }
+                }, { /* Coeff Band 3 */
+                    { 108, 167, 237 },
+                    {  55, 133, 222 },
+                    {  15,  97, 179 },
+                    {   4,  72, 135 },
+                    {   1,  45,  85 },
+                    {   1,  19,  38 }
+                }, { /* Coeff Band 4 */
+                    { 124, 146, 240 },
+                    {  66, 124, 224 },
+                    {  17,  88, 175 },
+                    {   4,  58, 122 },
+                    {   1,  36,  75 },
+                    {   1,  18,  37 }
+                }, { /* Coeff Band 5 */
+                    { 141,  79, 241 },
+                    { 126,  70, 227 },
+                    {  66,  58, 182 },
+                    {  30,  44, 136 },
+                    {  12,  34,  96 },
+                    {   2,  20,  47 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 229,  99, 249 },
+                    { 143, 111, 235 },
+                    {  46, 109, 192 }
+                }, { /* Coeff Band 1 */
+                    {  82, 158, 236 },
+                    {  94, 146, 224 },
+                    {  25, 117, 191 },
+                    {   9,  87, 149 },
+                    {   3,  56,  99 },
+                    {   1,  33,  57 }
+                }, { /* Coeff Band 2 */
+                    {  83, 167, 237 },
+                    {  68, 145, 222 },
+                    {  10, 103, 177 },
+                    {   2,  72, 131 },
+                    {   1,  41,  79 },
+                    {   1,  20,  39 }
+                }, { /* Coeff Band 3 */
+                    {  99, 167, 239 },
+                    {  47, 141, 224 },
+                    {  10, 104, 178 },
+                    {   2,  73, 133 },
+                    {   1,  44,  85 },
+                    {   1,  22,  47 }
+                }, { /* Coeff Band 4 */
+                    { 127, 145, 243 },
+                    {  71, 129, 228 },
+                    {  17,  93, 177 },
+                    {   3,  61, 124 },
+                    {   1,  41,  84 },
+                    {   1,  21,  52 }
+                }, { /* Coeff Band 5 */
+                    { 157,  78, 244 },
+                    { 140,  72, 231 },
+                    {  69,  58, 184 },
+                    {  31,  44, 137 },
+                    {  14,  38, 105 },
+                    {   8,  23,  61 }
+                }
+            }
+        }
+    }, { /* tx = 8x8 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 125,  34, 187 },
+                    {  52,  41, 133 },
+                    {   6,  31,  56 }
+                }, { /* Coeff Band 1 */
+                    {  37, 109, 153 },
+                    {  51, 102, 147 },
+                    {  23,  87, 128 },
+                    {   8,  67, 101 },
+                    {   1,  41,  63 },
+                    {   1,  19,  29 }
+                }, { /* Coeff Band 2 */
+                    {  31, 154, 185 },
+                    {  17, 127, 175 },
+                    {   6,  96, 145 },
+                    {   2,  73, 114 },
+                    {   1,  51,  82 },
+                    {   1,  28,  45 }
+                }, { /* Coeff Band 3 */
+                    {  23, 163, 200 },
+                    {  10, 131, 185 },
+                    {   2,  93, 148 },
+                    {   1,  67, 111 },
+                    {   1,  41,  69 },
+                    {   1,  14,  24 }
+                }, { /* Coeff Band 4 */
+                    {  29, 176, 217 },
+                    {  12, 145, 201 },
+                    {   3, 101, 156 },
+                    {   1,  69, 111 },
+                    {   1,  39,  63 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 5 */
+                    {  57, 192, 233 },
+                    {  25, 154, 215 },
+                    {   6, 109, 167 },
+                    {   3,  78, 118 },
+                    {   1,  48,  69 },
+                    {   1,  21,  29 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 202, 105, 245 },
+                    { 108, 106, 216 },
+                    {  18,  90, 144 }
+                }, { /* Coeff Band 1 */
+                    {  33, 172, 219 },
+                    {  64, 149, 206 },
+                    {  14, 117, 177 },
+                    {   5,  90, 141 },
+                    {   2,  61,  95 },
+                    {   1,  37,  57 }
+                }, { /* Coeff Band 2 */
+                    {  33, 179, 220 },
+                    {  11, 140, 198 },
+                    {   1,  89, 148 },
+                    {   1,  60, 104 },
+                    {   1,  33,  57 },
+                    {   1,  12,  21 }
+                }, { /* Coeff Band 3 */
+                    {  30, 181, 221 },
+                    {   8, 141, 198 },
+                    {   1,  87, 145 },
+                    {   1,  58, 100 },
+                    {   1,  31,  55 },
+                    {   1,  12,  20 }
+                }, { /* Coeff Band 4 */
+                    {  32, 186, 224 },
+                    {   7, 142, 198 },
+                    {   1,  86, 143 },
+                    {   1,  58, 100 },
+                    {   1,  31,  55 },
+                    {   1,  12,  22 }
+                }, { /* Coeff Band 5 */
+                    {  57, 192, 227 },
+                    {  20, 143, 204 },
+                    {   3,  96, 154 },
+                    {   1,  68, 112 },
+                    {   1,  42,  69 },
+                    {   1,  19,  32 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 212,  35, 215 },
+                    { 113,  47, 169 },
+                    {  29,  48, 105 }
+                }, { /* Coeff Band 1 */
+                    {  74, 129, 203 },
+                    { 106, 120, 203 },
+                    {  49, 107, 178 },
+                    {  19,  84, 144 },
+                    {   4,  50,  84 },
+                    {   1,  15,  25 }
+                }, { /* Coeff Band 2 */
+                    {  71, 172, 217 },
+                    {  44, 141, 209 },
+                    {  15, 102, 173 },
+                    {   6,  76, 133 },
+                    {   2,  51,  89 },
+                    {   1,  24,  42 }
+                }, { /* Coeff Band 3 */
+                    {  64, 185, 231 },
+                    {  31, 148, 216 },
+                    {   8, 103, 175 },
+                    {   3,  74, 131 },
+                    {   1,  46,  81 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 4 */
+                    {  65, 196, 235 },
+                    {  25, 157, 221 },
+                    {   5, 105, 174 },
+                    {   1,  67, 120 },
+                    {   1,  38,  69 },
+                    {   1,  15,  30 }
+                }, { /* Coeff Band 5 */
+                    {  65, 204, 238 },
+                    {  30, 156, 224 },
+                    {   7, 107, 177 },
+                    {   2,  70, 124 },
+                    {   1,  42,  73 },
+                    {   1,  18,  34 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 225,  86, 251 },
+                    { 144, 104, 235 },
+                    {  42,  99, 181 }
+                }, { /* Coeff Band 1 */
+                    {  85, 175, 239 },
+                    { 112, 165, 229 },
+                    {  29, 136, 200 },
+                    {  12, 103, 162 },
+                    {   6,  77, 123 },
+                    {   2,  53,  84 }
+                }, { /* Coeff Band 2 */
+                    {  75, 183, 239 },
+                    {  30, 155, 221 },
+                    {   3, 106, 171 },
+                    {   1,  74, 128 },
+                    {   1,  44,  76 },
+                    {   1,  17,  28 }
+                }, { /* Coeff Band 3 */
+                    {  73, 185, 240 },
+                    {  27, 159, 222 },
+                    {   2, 107, 172 },
+                    {   1,  75, 127 },
+                    {   1,  42,  73 },
+                    {   1,  17,  29 }
+                }, { /* Coeff Band 4 */
+                    {  62, 190, 238 },
+                    {  21, 159, 222 },
+                    {   2, 107, 172 },
+                    {   1,  72, 122 },
+                    {   1,  40,  71 },
+                    {   1,  18,  32 }
+                }, { /* Coeff Band 5 */
+                    {  61, 199, 240 },
+                    {  27, 161, 226 },
+                    {   4, 113, 180 },
+                    {   1,  76, 129 },
+                    {   1,  46,  80 },
+                    {   1,  23,  41 }
+                }
+            }
+        }
+    }, { /* tx = 16x16 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    {   7,  27, 153 },
+                    {   5,  30,  95 },
+                    {   1,  16,  30 }
+                }, { /* Coeff Band 1 */
+                    {  50,  75, 127 },
+                    {  57,  75, 124 },
+                    {  27,  67, 108 },
+                    {  10,  54,  86 },
+                    {   1,  33,  52 },
+                    {   1,  12,  18 }
+                }, { /* Coeff Band 2 */
+                    {  43, 125, 151 },
+                    {  26, 108, 148 },
+                    {   7,  83, 122 },
+                    {   2,  59,  89 },
+                    {   1,  38,  60 },
+                    {   1,  17,  27 }
+                }, { /* Coeff Band 3 */
+                    {  23, 144, 163 },
+                    {  13, 112, 154 },
+                    {   2,  75, 117 },
+                    {   1,  50,  81 },
+                    {   1,  31,  51 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 4 */
+                    {  18, 162, 185 },
+                    {   6, 123, 171 },
+                    {   1,  78, 125 },
+                    {   1,  51,  86 },
+                    {   1,  31,  54 },
+                    {   1,  14,  23 }
+                }, { /* Coeff Band 5 */
+                    {  15, 199, 227 },
+                    {   3, 150, 204 },
+                    {   1,  91, 146 },
+                    {   1,  55,  95 },
+                    {   1,  30,  53 },
+                    {   1,  11,  20 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    {  19,  55, 240 },
+                    {  19,  59, 196 },
+                    {   3,  52, 105 }
+                }, { /* Coeff Band 1 */
+                    {  41, 166, 207 },
+                    { 104, 153, 199 },
+                    {  31, 123, 181 },
+                    {  14, 101, 152 },
+                    {   5,  72, 106 },
+                    {   1,  36,  52 }
+                }, { /* Coeff Band 2 */
+                    {  35, 176, 211 },
+                    {  12, 131, 190 },
+                    {   2,  88, 144 },
+                    {   1,  60, 101 },
+                    {   1,  36,  60 },
+                    {   1,  16,  28 }
+                }, { /* Coeff Band 3 */
+                    {  28, 183, 213 },
+                    {   8, 134, 191 },
+                    {   1,  86, 142 },
+                    {   1,  56,  96 },
+                    {   1,  30,  53 },
+                    {   1,  12,  20 }
+                }, { /* Coeff Band 4 */
+                    {  20, 190, 215 },
+                    {   4, 135, 192 },
+                    {   1,  84, 139 },
+                    {   1,  53,  91 },
+                    {   1,  28,  49 },
+                    {   1,  11,  20 }
+                }, { /* Coeff Band 5 */
+                    {  13, 196, 216 },
+                    {   2, 137, 192 },
+                    {   1,  86, 143 },
+                    {   1,  57,  99 },
+                    {   1,  32,  56 },
+                    {   1,  13,  24 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 211,  29, 217 },
+                    {  96,  47, 156 },
+                    {  22,  43,  87 }
+                }, { /* Coeff Band 1 */
+                    {  78, 120, 193 },
+                    { 111, 116, 186 },
+                    {  46, 102, 164 },
+                    {  15,  80, 128 },
+                    {   2,  49,  76 },
+                    {   1,  18,  28 }
+                }, { /* Coeff Band 2 */
+                    {  71, 161, 203 },
+                    {  42, 132, 192 },
+                    {  10,  98, 150 },
+                    {   3,  69, 109 },
+                    {   1,  44,  70 },
+                    {   1,  18,  29 }
+                }, { /* Coeff Band 3 */
+                    {  57, 186, 211 },
+                    {  30, 140, 196 },
+                    {   4,  93, 146 },
+                    {   1,  62, 102 },
+                    {   1,  38,  65 },
+                    {   1,  16,  27 }
+                }, { /* Coeff Band 4 */
+                    {  47, 199, 217 },
+                    {  14, 145, 196 },
+                    {   1,  88, 142 },
+                    {   1,  57,  98 },
+                    {   1,  36,  62 },
+                    {   1,  15,  26 }
+                }, { /* Coeff Band 5 */
+                    {  26, 219, 229 },
+                    {   5, 155, 207 },
+                    {   1,  94, 151 },
+                    {   1,  60, 104 },
+                    {   1,  36,  62 },
+                    {   1,  16,  28 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 233,  29, 248 },
+                    { 146,  47, 220 },
+                    {  43,  52, 140 }
+                }, { /* Coeff Band 1 */
+                    { 100, 163, 232 },
+                    { 179, 161, 222 },
+                    {  63, 142, 204 },
+                    {  37, 113, 174 },
+                    {  26,  89, 137 },
+                    {  18,  68,  97 }
+                }, { /* Coeff Band 2 */
+                    {  85, 181, 230 },
+                    {  32, 146, 209 },
+                    {   7, 100, 164 },
+                    {   3,  71, 121 },
+                    {   1,  45,  77 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 3 */
+                    {  65, 187, 230 },
+                    {  20, 148, 207 },
+                    {   2,  97, 159 },
+                    {   1,  68, 116 },
+                    {   1,  40,  70 },
+                    {   1,  14,  29 }
+                }, { /* Coeff Band 4 */
+                    {  40, 194, 227 },
+                    {   8, 147, 204 },
+                    {   1,  94, 155 },
+                    {   1,  65, 112 },
+                    {   1,  39,  66 },
+                    {   1,  14,  26 }
+                }, { /* Coeff Band 5 */
+                    {  16, 208, 228 },
+                    {   3, 151, 207 },
+                    {   1,  98, 160 },
+                    {   1,  67, 117 },
+                    {   1,  41,  74 },
+                    {   1,  17,  31 }
+                }
+            }
+        }
+    }, { /* tx = 32x32 */
+        { /* block Type 0 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    {  17,  38, 140 },
+                    {   7,  34,  80 },
+                    {   1,  17,  29 }
+                }, { /* Coeff Band 1 */
+                    {  37,  75, 128 },
+                    {  41,  76, 128 },
+                    {  26,  66, 116 },
+                    {  12,  52,  94 },
+                    {   2,  32,  55 },
+                    {   1,  10,  16 }
+                }, { /* Coeff Band 2 */
+                    {  50, 127, 154 },
+                    {  37, 109, 152 },
+                    {  16,  82, 121 },
+                    {   5,  59,  85 },
+                    {   1,  35,  54 },
+                    {   1,  13,  20 }
+                }, { /* Coeff Band 3 */
+                    {  40, 142, 167 },
+                    {  17, 110, 157 },
+                    {   2,  71, 112 },
+                    {   1,  44,  72 },
+                    {   1,  27,  45 },
+                    {   1,  11,  17 }
+                }, { /* Coeff Band 4 */
+                    {  30, 175, 188 },
+                    {   9, 124, 169 },
+                    {   1,  74, 116 },
+                    {   1,  48,  78 },
+                    {   1,  30,  49 },
+                    {   1,  11,  18 }
+                }, { /* Coeff Band 5 */
+                    {  10, 222, 223 },
+                    {   2, 150, 194 },
+                    {   1,  83, 128 },
+                    {   1,  48,  79 },
+                    {   1,  27,  45 },
+                    {   1,  11,  17 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    {  36,  41, 235 },
+                    {  29,  36, 193 },
+                    {  10,  27, 111 }
+                }, { /* Coeff Band 1 */
+                    {  85, 165, 222 },
+                    { 177, 162, 215 },
+                    { 110, 135, 195 },
+                    {  57, 113, 168 },
+                    {  23,  83, 120 },
+                    {  10,  49,  61 }
+                }, { /* Coeff Band 2 */
+                    {  85, 190, 223 },
+                    {  36, 139, 200 },
+                    {   5,  90, 146 },
+                    {   1,  60, 103 },
+                    {   1,  38,  65 },
+                    {   1,  18,  30 }
+                }, { /* Coeff Band 3 */
+                    {  72, 202, 223 },
+                    {  23, 141, 199 },
+                    {   2,  86, 140 },
+                    {   1,  56,  97 },
+                    {   1,  36,  61 },
+                    {   1,  16,  27 }
+                }, { /* Coeff Band 4 */
+                    {  55, 218, 225 },
+                    {  13, 145, 200 },
+                    {   1,  86, 141 },
+                    {   1,  57,  99 },
+                    {   1,  35,  61 },
+                    {   1,  13,  22 }
+                }, { /* Coeff Band 5 */
+                    {  15, 235, 212 },
+                    {   1, 132, 184 },
+                    {   1,  84, 139 },
+                    {   1,  57,  97 },
+                    {   1,  34,  56 },
+                    {   1,  14,  23 }
+                }
+            }
+        }, { /* block Type 1 */
+            { /* Intra */
+                { /* Coeff Band 0 */
+                    { 181,  21, 201 },
+                    {  61,  37, 123 },
+                    {  10,  38,  71 }
+                }, { /* Coeff Band 1 */
+                    {  47, 106, 172 },
+                    {  95, 104, 173 },
+                    {  42,  93, 159 },
+                    {  18,  77, 131 },
+                    {   4,  50,  81 },
+                    {   1,  17,  23 }
+                }, { /* Coeff Band 2 */
+                    {  62, 147, 199 },
+                    {  44, 130, 189 },
+                    {  28, 102, 154 },
+                    {  18,  75, 115 },
+                    {   2,  44,  65 },
+                    {   1,  12,  19 }
+                }, { /* Coeff Band 3 */
+                    {  55, 153, 210 },
+                    {  24, 130, 194 },
+                    {   3,  93, 146 },
+                    {   1,  61,  97 },
+                    {   1,  31,  50 },
+                    {   1,  10,  16 }
+                }, { /* Coeff Band 4 */
+                    {  49, 186, 223 },
+                    {  17, 148, 204 },
+                    {   1,  96, 142 },
+                    {   1,  53,  83 },
+                    {   1,  26,  44 },
+                    {   1,  11,  17 }
+                }, { /* Coeff Band 5 */
+                    {  13, 217, 212 },
+                    {   2, 136, 180 },
+                    {   1,  78, 124 },
+                    {   1,  50,  83 },
+                    {   1,  29,  49 },
+                    {   1,  14,  23 }
+                }
+            }, { /* Inter */
+                { /* Coeff Band 0 */
+                    { 197,  13, 247 },
+                    {  82,  17, 222 },
+                    {  25,  17, 162 }
+                }, { /* Coeff Band 1 */
+                    { 126, 186, 247 },
+                    { 234, 191, 243 },
+                    { 176, 177, 234 },
+                    { 104, 158, 220 },
+                    {  66, 128, 186 },
+                    {  55,  90, 137 }
+                }, { /* Coeff Band 2 */
+                    { 111, 197, 242 },
+                    {  46, 158, 219 },
+                    {   9, 104, 171 },
+                    {   2,  65, 125 },
+                    {   1,  44,  80 },
+                    {   1,  17,  91 }
+                }, { /* Coeff Band 3 */
+                    { 104, 208, 245 },
+                    {  39, 168, 224 },
+                    {   3, 109, 162 },
+                    {   1,  79, 124 },
+                    {   1,  50, 102 },
+                    {   1,  43, 102 }
+                }, { /* Coeff Band 4 */
+                    {  84, 220, 246 },
+                    {  31, 177, 231 },
+                    {   2, 115, 180 },
+                    {   1,  79, 134 },
+                    {   1,  55,  77 },
+                    {   1,  60,  79 }
+                }, { /* Coeff Band 5 */
+                    {  43, 243, 240 },
+                    {   8, 180, 217 },
+                    {   1, 115, 166 },
+                    {   1,  84, 121 },
+                    {   1,  51,  67 },
+                    {   1,  16,   6 }
+                }
+            }
+        }
+    }
+};
+
+const int8_t ff_vp9_mv_joint_tree[3][2] = { // [node][bit]; going by the codes below, n > 0 = next node, -x = leaf x
+    { -MV_JOINT_ZERO,            1 }, // '0'
+    {    -MV_JOINT_H,            2 }, // '10'
+    {    -MV_JOINT_V, -MV_JOINT_HV }, // '11x'
+};
+
+const int8_t ff_vp9_mv_class_tree[10][2] = { // [node][bit]; going by the codes below, n > 0 = next node, -x = leaf x
+    { -0,   1 }, // '0'
+    { -1,   2 }, // '10'
+    {  3,   4 }, // reached via '11'; both branches lead to further nodes
+    { -2,  -3 }, // '110x'
+    {  5,   6 }, // reached via '111'; both branches lead to further nodes
+    { -4,  -5 }, // '1110x'
+    { -6,   7 }, // '11110'
+    {  8,   9 }, // reached via '11111'; both branches lead to further nodes
+    { -7,  -8 }, // '111110x'
+    { -9, -10 }, // '111111x'
+};
+
+const int8_t ff_vp9_mv_fp_tree[3][2] = { // [node][bit]; going by the codes below, n > 0 = next node, -x = leaf x
+    { -0,  1 },   // '0'
+    { -1,  2 },   // '10'
+    { -2, -3 },   // '11x'
+};
diff --git a/libavcodec/vp9data.h b/libavcodec/vp9data.h
new file mode 100644
index 0000000000..a52cc0a353
--- /dev/null
+++ b/libavcodec/vp9data.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VP9DATA_H
+#define AVCODEC_VP9DATA_H
+
+#include <stdint.h>
+
+#include "vp9.h"
+
+extern const int8_t ff_vp9_partition_tree[3][2];
+extern const uint8_t ff_vp9_default_kf_partition_probs[4][4][3];
+extern const int8_t ff_vp9_segmentation_tree[7][2];
+extern const int8_t ff_vp9_intramode_tree[9][2];
+extern const uint8_t ff_vp9_default_kf_ymode_probs[10][10][9];
+extern const uint8_t ff_vp9_default_kf_uvmode_probs[10][9];
+extern const int8_t ff_vp9_inter_mode_tree[3][2];
+extern const int8_t ff_vp9_filter_tree[2][2];
+extern const enum FilterMode ff_vp9_filter_lut[3];
+extern const int16_t ff_vp9_dc_qlookup[256];
+extern const int16_t ff_vp9_ac_qlookup[256];
+extern const enum TxfmType ff_vp9_intra_txfm_type[14];
+extern const int16_t ff_vp9_default_scan_4x4[16];
+extern const int16_t ff_vp9_col_scan_4x4[16];
+extern const int16_t ff_vp9_row_scan_4x4[16];
+extern const int16_t ff_vp9_default_scan_8x8[64];
+extern const int16_t ff_vp9_col_scan_8x8[64];
+extern const int16_t ff_vp9_row_scan_8x8[64];
+extern const int16_t ff_vp9_default_scan_16x16[256];
+extern const int16_t ff_vp9_col_scan_16x16[256];
+extern const int16_t ff_vp9_row_scan_16x16[256];
+extern const int16_t ff_vp9_default_scan_32x32[1024];
+extern const int16_t *ff_vp9_scans[5][4];
+extern const int16_t ff_vp9_default_scan_4x4_nb[16][2];
+extern const int16_t ff_vp9_col_scan_4x4_nb[16][2];
+extern const int16_t ff_vp9_row_scan_4x4_nb[16][2];
+extern const int16_t ff_vp9_default_scan_8x8_nb[64][2];
+extern const int16_t ff_vp9_col_scan_8x8_nb[64][2];
+extern const int16_t ff_vp9_row_scan_8x8_nb[64][2];
+extern const int16_t ff_vp9_default_scan_16x16_nb[256][2];
+extern const int16_t ff_vp9_col_scan_16x16_nb[256][2];
+extern const int16_t ff_vp9_row_scan_16x16_nb[256][2];
+extern const int16_t ff_vp9_default_scan_32x32_nb[1024][2];
+extern const int16_t (*ff_vp9_scans_nb[5][4])[2];
+extern const uint8_t ff_vp9_model_pareto8[256][8];
+extern const ProbContext ff_vp9_default_probs;
+extern const uint8_t ff_vp9_default_coef_probs[4][2][2][6][6][3]; // [tx size][block type][intra/inter][band][row][prob], see vp9data.c
+extern const int8_t ff_vp9_mv_joint_tree[3][2]; // [node][bit], leaves are negated MV_JOINT_* values
+extern const int8_t ff_vp9_mv_class_tree[10][2]; // [node][bit], leaves are the negated values 0..10
+extern const int8_t ff_vp9_mv_fp_tree[3][2]; // [node][bit], leaves are the negated values 0..3
+
+#endif /* AVCODEC_VP9DATA_H */
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
new file mode 100644
index 0000000000..c83defeda3
--- /dev/null
+++ b/libavcodec/vp9dsp.c
@@ -0,0 +1,2174 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+
+#include "rnd_avg.h"
+#include "vp9.h"
+
+// FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
+// back with h264pred.[ch]
+
+/*
+ * "vert" fill, 4x4: every row of the block becomes a copy of the four
+ * bytes found at top.  left is not read.
+ *
+ * The bytes are moved as one 32-bit word through AV_RN32A()/AV_WN32A().
+ * NOTE(review): the "A" accessors presumably require 4-byte-aligned
+ * addresses -- confirm against intreadwrite.h.
+ */
+static void vert_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *top)
+{
+    const unsigned top_row = AV_RN32A(top);
+    int row;
+
+    for (row = 0; row < 4; row++)
+        AV_WN32A(dst + stride * row, top_row);
+}
+
+/*
+ * "vert" fill, 8x8: the eight bytes at top are loaded once as a 64-bit
+ * word and stored to each of the 8 rows.  left is not read.
+ */
+static void vert_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t top_row = AV_RN64A(top);
+    int row;
+
+    for (row = 0; row < 8; row++, dst += stride)
+        AV_WN64A(dst, top_row);
+}
+
+/*
+ * "vert" fill, 16x16: the sixteen bytes at top are loaded once as two
+ * 64-bit words and stored to each of the 16 rows.  left is not read.
+ */
+static void vert_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t lo = AV_RN64A(top + 0);
+    const uint64_t hi = AV_RN64A(top + 8);
+    int row;
+
+    for (row = 0; row < 16; row++, dst += stride) {
+        AV_WN64A(dst + 0, lo);
+        AV_WN64A(dst + 8, hi);
+    }
+}
+
+/*
+ * "vert" fill, 32x32: the 32 bytes at top are loaded once as four 64-bit
+ * words (before anything is written) and stored to each of the 32 rows.
+ * left is not read.
+ */
+static void vert_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    uint64_t word[4];
+    int row, i;
+
+    for (i = 0; i < 4; i++)
+        word[i] = AV_RN64A(top + 8 * i);
+
+    for (row = 0; row < 32; row++) {
+        for (i = 0; i < 4; i++)
+            AV_WN64A(dst + 8 * i, word[i]);
+        dst += stride;
+    }
+}
+
+/*
+ * "hor" fill, 4x4: row y is filled with the single byte left[y].
+ * Multiplying by 0x01010101 replicates that byte into all four byte
+ * lanes of the 32-bit word that is stored.  top is not read.
+ */
+static void hor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                      const uint8_t *left, const uint8_t *top)
+{
+    int row;
+
+    for (row = 0; row < 4; row++)
+        AV_WN32A(dst + stride * row, left[row] * 0x01010101U);
+}
+
+/*
+ * "hor" fill, 8x8: row y is filled with the byte left[y], replicated
+ * into a 64-bit word by the 0x0101...01 multiply.  top is not read.
+ */
+static void hor_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                      const uint8_t *left, const uint8_t *top)
+{
+    int row;
+
+    for (row = 0; row < 8; row++) {
+        const uint64_t splat = left[row] * 0x0101010101010101ULL;
+
+        AV_WN64A(dst + stride * row, splat);
+    }
+}
+
+/*
+ * "hor" fill, 16x16: row y is filled with the byte left[y]; the
+ * replicated 64-bit word is stored twice per row.  top is not read.
+ */
+static void hor_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                        const uint8_t *left, const uint8_t *top)
+{
+    int row;
+
+    for (row = 0; row < 16; row++, dst += stride) {
+        const uint64_t splat = left[row] * 0x0101010101010101ULL;
+
+        AV_WN64A(dst + 0, splat);
+        AV_WN64A(dst + 8, splat);
+    }
+}
+
+/*
+ * "hor" fill, 32x32: row y is filled with the byte left[y]; the
+ * replicated 64-bit word is stored four times per row.  top is not read.
+ */
+static void hor_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                        const uint8_t *left, const uint8_t *top)
+{
+    int row, off;
+
+    for (row = 0; row < 32; row++) {
+        const uint64_t splat = left[row] * 0x0101010101010101ULL;
+
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, splat);
+        dst += stride;
+    }
+}
+
+/*
+ * "tm" fill, 4x4: sample (x, y) is top[x] + left[y] - top[-1], passed
+ * through av_clip_uint8() before being stored.  top[-1] (the byte just
+ * before the top row) must therefore be readable.
+ */
+static void tm_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                     const uint8_t *left, const uint8_t *top)
+{
+    const int corner = top[-1];
+    int row, col;
+
+    for (row = 0; row < 4; row++) {
+        const int delta = left[row] - corner;
+
+        for (col = 0; col < 4; col++)
+            dst[col] = av_clip_uint8(top[col] + delta);
+        dst += stride;
+    }
+}
+
+/*
+ * "tm" fill, 8x8: sample (x, y) is top[x] + left[y] - top[-1], passed
+ * through av_clip_uint8() before being stored.  Reads top[-1].
+ */
+static void tm_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                     const uint8_t *left, const uint8_t *top)
+{
+    const int corner = top[-1];
+    int row, col;
+
+    for (row = 0; row < 8; row++) {
+        const int delta = left[row] - corner;
+
+        for (col = 0; col < 8; col++)
+            dst[col] = av_clip_uint8(top[col] + delta);
+        dst += stride;
+    }
+}
+
+/*
+ * "tm" fill, 16x16: sample (x, y) is top[x] + left[y] - top[-1], passed
+ * through av_clip_uint8() before being stored.  Reads top[-1].
+ */
+static void tm_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *top)
+{
+    const int corner = top[-1];
+    int row, col;
+
+    for (row = 0; row < 16; row++) {
+        const int delta = left[row] - corner;
+
+        for (col = 0; col < 16; col++)
+            dst[col] = av_clip_uint8(top[col] + delta);
+        dst += stride;
+    }
+}
+
+/*
+ * "tm" fill, 32x32: sample (x, y) is top[x] + left[y] - top[-1], passed
+ * through av_clip_uint8() before being stored.  Reads top[-1].
+ */
+static void tm_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *top)
+{
+    const int corner = top[-1];
+    int row, col;
+
+    for (row = 0; row < 32; row++) {
+        const int delta = left[row] - corner;
+
+        for (col = 0; col < 32; col++)
+            dst[col] = av_clip_uint8(top[col] + delta);
+        dst += stride;
+    }
+}
+
+/*
+ * "dc" fill, 4x4: the whole block is set to the rounded mean of
+ * left[0..3] and top[0..3], i.e. (sum + 4) >> 3.
+ */
+static void dc_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                     const uint8_t *left, const uint8_t *top)
+{
+    int sum = 4; /* rounding term for the >> 3 below */
+    unsigned fill;
+    int i;
+
+    for (i = 0; i < 4; i++)
+        sum += left[i] + top[i];
+    fill = 0x01010101U * (sum >> 3);
+
+    for (i = 0; i < 4; i++)
+        AV_WN32A(dst + stride * i, fill);
+}
+
+/*
+ * "dc" fill, 8x8: the whole block is set to the rounded mean of
+ * left[0..7] and top[0..7], i.e. (sum + 8) >> 4.
+ */
+static void dc_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                     const uint8_t *left, const uint8_t *top)
+{
+    int sum = 8; /* rounding term for the >> 4 below */
+    uint64_t fill;
+    int i;
+
+    for (i = 0; i < 8; i++)
+        sum += left[i] + top[i];
+    fill = 0x0101010101010101ULL * (sum >> 4);
+
+    for (i = 0; i < 8; i++, dst += stride)
+        AV_WN64A(dst, fill);
+}
+
+/*
+ * "dc" fill, 16x16: the whole block is set to the rounded mean of
+ * left[0..15] and top[0..15], i.e. (sum + 16) >> 5.
+ */
+static void dc_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *top)
+{
+    int sum = 16; /* rounding term for the >> 5 below */
+    uint64_t fill;
+    int i;
+
+    for (i = 0; i < 16; i++)
+        sum += left[i] + top[i];
+    fill = 0x0101010101010101ULL * (sum >> 5);
+
+    for (i = 0; i < 16; i++, dst += stride) {
+        AV_WN64A(dst + 0, fill);
+        AV_WN64A(dst + 8, fill);
+    }
+}
+
+/*
+ * "dc" fill, 32x32: the whole block is set to the rounded mean of
+ * left[0..31] and top[0..31], i.e. (sum + 32) >> 6.
+ */
+static void dc_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                       const uint8_t *left, const uint8_t *top)
+{
+    int sum = 32; /* rounding term for the >> 6 below */
+    uint64_t fill;
+    int i, off;
+
+    for (i = 0; i < 32; i++)
+        sum += left[i] + top[i];
+    fill = 0x0101010101010101ULL * (sum >> 6);
+
+    for (i = 0; i < 32; i++) {
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, fill);
+        dst += stride;
+    }
+}
+
+/*
+ * "dc_left" fill, 4x4: the whole block is set to the rounded mean of
+ * left[0..3], i.e. (sum + 2) >> 2.  top is not read.
+ */
+static void dc_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top)
+{
+    int sum = 2; /* rounding term for the >> 2 below */
+    unsigned fill;
+    int i;
+
+    for (i = 0; i < 4; i++)
+        sum += left[i];
+    fill = 0x01010101U * (sum >> 2);
+
+    for (i = 0; i < 4; i++)
+        AV_WN32A(dst + stride * i, fill);
+}
+
+/*
+ * "dc_left" fill, 8x8: the whole block is set to the rounded mean of
+ * left[0..7], i.e. (sum + 4) >> 3.  top is not read.
+ */
+static void dc_left_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *left, const uint8_t *top)
+{
+    int sum = 4; /* rounding term for the >> 3 below */
+    uint64_t fill;
+    int i;
+
+    for (i = 0; i < 8; i++)
+        sum += left[i];
+    fill = 0x0101010101010101ULL * (sum >> 3);
+
+    for (i = 0; i < 8; i++, dst += stride)
+        AV_WN64A(dst, fill);
+}
+
+/*
+ * "dc_left" fill, 16x16: the whole block is set to the rounded mean of
+ * left[0..15], i.e. (sum + 8) >> 4.  top is not read.
+ */
+static void dc_left_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *top)
+{
+    int sum = 8; /* rounding term for the >> 4 below */
+    uint64_t fill;
+    int i;
+
+    for (i = 0; i < 16; i++)
+        sum += left[i];
+    fill = 0x0101010101010101ULL * (sum >> 4);
+
+    for (i = 0; i < 16; i++, dst += stride) {
+        AV_WN64A(dst + 0, fill);
+        AV_WN64A(dst + 8, fill);
+    }
+}
+
+/*
+ * "dc_left" fill, 32x32: the whole block is set to the rounded mean of
+ * left[0..31], i.e. (sum + 16) >> 5.  top is not read.
+ */
+static void dc_left_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *top)
+{
+    int sum = 16; /* rounding term for the >> 5 below */
+    uint64_t fill;
+    int i, off;
+
+    for (i = 0; i < 32; i++)
+        sum += left[i];
+    fill = 0x0101010101010101ULL * (sum >> 5);
+
+    for (i = 0; i < 32; i++) {
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, fill);
+        dst += stride;
+    }
+}
+
+/*
+ * "dc_top" fill, 4x4: the whole block is set to the rounded mean of
+ * top[0..3], i.e. (sum + 2) >> 2.  left is not read.
+ */
+static void dc_top_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    int sum = 2; /* rounding term for the >> 2 below */
+    unsigned fill;
+    int i;
+
+    for (i = 0; i < 4; i++)
+        sum += top[i];
+    fill = 0x01010101U * (sum >> 2);
+
+    for (i = 0; i < 4; i++)
+        AV_WN32A(dst + stride * i, fill);
+}
+
+/*
+ * "dc_top" fill, 8x8: the whole block is set to the rounded mean of
+ * top[0..7], i.e. (sum + 4) >> 3.  left is not read.
+ */
+static void dc_top_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    int sum = 4; /* rounding term for the >> 3 below */
+    uint64_t fill;
+    int i;
+
+    for (i = 0; i < 8; i++)
+        sum += top[i];
+    fill = 0x0101010101010101ULL * (sum >> 3);
+
+    for (i = 0; i < 8; i++, dst += stride)
+        AV_WN64A(dst, fill);
+}
+
+/*
+ * "dc_top" fill, 16x16: the whole block is set to the rounded mean of
+ * top[0..15], i.e. (sum + 8) >> 4.  left is not read.
+ */
+static void dc_top_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    int sum = 8; /* rounding term for the >> 4 below */
+    uint64_t fill;
+    int i;
+
+    for (i = 0; i < 16; i++)
+        sum += top[i];
+    fill = 0x0101010101010101ULL * (sum >> 4);
+
+    for (i = 0; i < 16; i++, dst += stride) {
+        AV_WN64A(dst + 0, fill);
+        AV_WN64A(dst + 8, fill);
+    }
+}
+
+/*
+ * "dc_top" fill, 32x32: the whole block is set to the rounded mean of
+ * top[0..31], i.e. (sum + 16) >> 5.  left is not read.
+ */
+static void dc_top_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    int sum = 16; /* rounding term for the >> 5 below */
+    uint64_t fill;
+    int i, off;
+
+    for (i = 0; i < 32; i++)
+        sum += top[i];
+    fill = 0x0101010101010101ULL * (sum >> 5);
+
+    for (i = 0; i < 32; i++) {
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, fill);
+        dst += stride;
+    }
+}
+
+/*
+ * Constant fill, 4x4: every byte of the block is set to 0x80 (128).
+ * Neither left nor top is read.
+ */
+static void dc_128_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const unsigned fill = 0x80808080U;
+    int row;
+
+    for (row = 0; row < 4; row++)
+        AV_WN32A(dst + stride * row, fill);
+}
+
+/*
+ * Constant fill, 8x8: every byte of the block is set to 0x80 (128).
+ * Neither left nor top is read.
+ */
+static void dc_128_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x8080808080808080ULL;
+    int row;
+
+    for (row = 0; row < 8; row++, dst += stride)
+        AV_WN64A(dst, fill);
+}
+
+/*
+ * Constant fill, 16x16: every byte of the block is set to 0x80 (128).
+ * Neither left nor top is read.
+ */
+static void dc_128_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x8080808080808080ULL;
+    int row;
+
+    for (row = 0; row < 16; row++, dst += stride) {
+        AV_WN64A(dst + 0, fill);
+        AV_WN64A(dst + 8, fill);
+    }
+}
+
+/*
+ * Constant fill, 32x32: every byte of the block is set to 0x80 (128).
+ * Neither left nor top is read.
+ */
+static void dc_128_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x8080808080808080ULL;
+    int row, off;
+
+    for (row = 0; row < 32; row++) {
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, fill);
+        dst += stride;
+    }
+}
+
+/*
+ * Constant fill, 4x4: every byte of the block is set to 0x7F (127).
+ * Neither left nor top is read.
+ */
+static void dc_127_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const unsigned fill = 0x7F7F7F7FU;
+    int row;
+
+    for (row = 0; row < 4; row++)
+        AV_WN32A(dst + stride * row, fill);
+}
+
+/*
+ * Constant fill, 8x8: every byte of the block is set to 0x7F (127).
+ * Neither left nor top is read.
+ */
+static void dc_127_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x7F7F7F7F7F7F7F7FULL;
+    int row;
+
+    for (row = 0; row < 8; row++, dst += stride)
+        AV_WN64A(dst, fill);
+}
+
+/*
+ * Constant fill, 16x16: every byte of the block is set to 0x7F (127).
+ * Neither left nor top is read.
+ */
+static void dc_127_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x7F7F7F7F7F7F7F7FULL;
+    int row;
+
+    for (row = 0; row < 16; row++, dst += stride) {
+        AV_WN64A(dst + 0, fill);
+        AV_WN64A(dst + 8, fill);
+    }
+}
+
+/*
+ * Constant fill, 32x32: every byte of the block is set to 0x7F (127).
+ * Neither left nor top is read.
+ */
+static void dc_127_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x7F7F7F7F7F7F7F7FULL;
+    int row, off;
+
+    for (row = 0; row < 32; row++) {
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, fill);
+        dst += stride;
+    }
+}
+
+/*
+ * Constant fill, 4x4: every byte of the block is set to 0x81 (129).
+ * Neither left nor top is read.
+ */
+static void dc_129_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const unsigned fill = 0x81818181U;
+    int row;
+
+    for (row = 0; row < 4; row++)
+        AV_WN32A(dst + stride * row, fill);
+}
+
+/*
+ * Constant fill, 8x8: every byte of the block is set to 0x81 (129).
+ * Neither left nor top is read.
+ */
+static void dc_129_8x8_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x8181818181818181ULL;
+    int row;
+
+    for (row = 0; row < 8; row++, dst += stride)
+        AV_WN64A(dst, fill);
+}
+
+/*
+ * Constant fill, 16x16: every byte of the block is set to 0x81 (129).
+ * Neither left nor top is read.
+ */
+static void dc_129_16x16_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x8181818181818181ULL;
+    int row;
+
+    for (row = 0; row < 16; row++, dst += stride) {
+        AV_WN64A(dst + 0, fill);
+        AV_WN64A(dst + 8, fill);
+    }
+}
+
+/*
+ * Constant fill, 32x32: every byte of the block is set to 0x81 (129).
+ * Neither left nor top is read.
+ */
+static void dc_129_32x32_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const uint64_t fill = 0x8181818181818181ULL;
+    int row, off;
+
+    for (row = 0; row < 32; row++) {
+        for (off = 0; off < 32; off += 8)
+            AV_WN64A(dst + off, fill);
+        dst += stride;
+    }
+}
+
+/*
+ * Lvalue for the sample at column x, row y of the block being written.
+ * Expands to an index into `dst` using `stride`, so both names must be in
+ * scope wherever the macro is used.
+ */
+#define DST(x, y) dst[(x) + (y) * stride]
+
+/*
+ * "diag_downleft" fill, 4x4.  All samples with the same x + y share one
+ * value: diagonals 0..5 use the 3-tap filter
+ *     (top[k] + 2 * top[k + 1] + top[k + 2] + 2) >> 2
+ * and the last diagonal (the bottom-right sample only) is the unfiltered
+ * top[7]; the original code notes that this differs from vp8.
+ *
+ * Reads top[0..7]; left is not read.
+ */
+static void diag_downleft_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *left, const uint8_t *top)
+{
+    uint8_t diag[7];
+    int x, y;
+
+    /* Compute every diagonal before the first store to dst. */
+    for (x = 0; x < 6; x++)
+        diag[x] = (top[x] + top[x + 1] * 2 + top[x + 2] + 2) >> 2;
+    diag[6] = top[7];
+
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = diag[x + y];
+}
+
+/*
+ * Generates diag_downleft_<size>x<size>_c().
+ *
+ * filt[k] holds the 3-tap filtered top row,
+ *     (top[k] + 2 * top[k + 1] + top[k + 2] + 2) >> 2,
+ * except for the last entry, which weights top[size - 1] three times
+ * because there is no top[size] to read.  Row j of the block is
+ * filt[j .. size - 2] followed by j + 1 copies of top[size - 1].
+ *
+ * Reads top[0 .. size - 1]; left is not read.
+ */
+#define def_diag_downleft(size)                                             \
+static void diag_downleft_ ## size ## x ## size ## _c(uint8_t *dst,         \
+                                                      ptrdiff_t stride,     \
+                                                      const uint8_t *left,  \
+                                                      const uint8_t *top)   \
+{                                                                           \
+    uint8_t filt[size - 1];                                                 \
+    int k, ncopy;                                                           \
+                                                                            \
+    for (k = 0; k + 2 < size; k++)                                          \
+        filt[k] = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2;          \
+    filt[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;          \
+                                                                            \
+    for (ncopy = size - 1; ncopy >= 0; ncopy--) {                           \
+        uint8_t *row = dst + (size - 1 - ncopy) * stride;                   \
+                                                                            \
+        memcpy(row, filt + (size - 1 - ncopy), ncopy);                      \
+        memset(row + ncopy, top[size - 1], size - ncopy);                   \
+    }                                                                       \
+}
+
+def_diag_downleft(8)
+def_diag_downleft(16)
+def_diag_downleft(32)
+
+/*
+ * "diag_downright" fill, 4x4.  The border samples are laid out as one
+ * line running from left[3] up to left[0], through top[-1], then along
+ * top[0..3].  That line is smoothed with the 3-tap filter
+ * (a + 2 * b + c + 2) >> 2, and all samples with the same x - y share one
+ * filtered value.
+ *
+ * Reads left[0..3] and top[-1..3].
+ */
+static void diag_downright_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *left, const uint8_t *top)
+{
+    uint8_t edge[9], filt[7];
+    int k, x, y;
+
+    for (k = 0; k < 4; k++) {
+        edge[k]     = left[3 - k];
+        edge[5 + k] = top[k];
+    }
+    edge[4] = top[-1];
+
+    for (k = 0; k < 7; k++)
+        filt[k] = (edge[k] + edge[k + 1] * 2 + edge[k + 2] + 2) >> 2;
+
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = filt[3 - y + x];
+}
+
+/*
+ * Generates diag_downright_<size>x<size>_c().
+ *
+ * edge[] is the border laid out as one line: left[size - 1] up to left[0],
+ * then top[-1], then top[0 .. size - 1].  filt[] is that line smoothed
+ * with the 3-tap filter (a + 2 * b + c + 2) >> 2.  Row j of the block is
+ * the size bytes of filt[] starting at index size - 1 - j, so each row is
+ * the previous one shifted right by one sample.
+ *
+ * Reads left[0 .. size - 1] and top[-1 .. size - 1].
+ */
+#define def_diag_downright(size)                                            \
+static void diag_downright_ ## size ## x ## size ## _c(uint8_t *dst,        \
+                                                       ptrdiff_t stride,    \
+                                                       const uint8_t *left, \
+                                                       const uint8_t *top)  \
+{                                                                           \
+    uint8_t edge[size + size + 1], filt[size + size - 1];                   \
+    int k, row;                                                             \
+                                                                            \
+    for (k = 0; k < size; k++) {                                            \
+        edge[k]            = left[size - 1 - k];                            \
+        edge[size + 1 + k] = top[k];                                        \
+    }                                                                       \
+    edge[size] = top[-1];                                                   \
+                                                                            \
+    for (k = 0; k < size + size - 1; k++)                                   \
+        filt[k] = (edge[k] + edge[k + 1] * 2 + edge[k + 2] + 2) >> 2;       \
+                                                                            \
+    for (row = 0; row < size; row++)                                        \
+        memcpy(dst + row * stride, filt + size - 1 - row, size);            \
+}
+
+def_diag_downright(8)
+def_diag_downright(16)
+def_diag_downright(32)
+
+/*
+ * "vert_right" fill, 4x4.  Two lines of values are built first:
+ *   even[] - 2-tap averages (a + b + 1) >> 1 along top[-1..3], preceded by
+ *            one 3-tap value taken from the left column;
+ *   odd[]  - 3-tap values (a + 2 * b + c + 2) >> 2 along the same border.
+ * Rows 0 and 1 take entries 1..4 of even[] / odd[]; rows 2 and 3 take
+ * entries 0..3, i.e. the same pattern shifted right by one sample.
+ *
+ * Reads left[0..2] and top[-1..3].
+ */
+static void vert_right_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *left, const uint8_t *top)
+{
+    const int tl = top[-1];
+    const int l0 = left[0], l1 = left[1], l2 = left[2];
+    uint8_t even[5], odd[5];
+    int x;
+
+    even[0] = (tl + l0 * 2 + l1     + 2) >> 2;
+    odd[0]  = (l0 + l1 * 2 + l2     + 2) >> 2;
+    even[1] = (tl + top[0]          + 1) >> 1;
+    odd[1]  = (l0 + tl * 2 + top[0] + 2) >> 2;
+    for (x = 0; x < 3; x++) {
+        even[x + 2] = (top[x] + top[x + 1] + 1) >> 1;
+        odd[x + 2]  = (top[x - 1] + top[x] * 2 + top[x + 1] + 2) >> 2;
+    }
+
+    for (x = 0; x < 4; x++) {
+        DST(x, 0) = even[x + 1];
+        DST(x, 1) = odd[x + 1];
+        DST(x, 2) = even[x];
+        DST(x, 3) = odd[x];
+    }
+}
+
+/*
+ * Generates vert_right_<size>x<size>_c().
+ *
+ * Two lines of size + size / 2 - 1 values are built before dst is written:
+ *   ve[] feeds the even rows, vo[] feeds the odd rows.
+ * Entries [0, size / 2 - 2] of both are 3-tap values
+ * (a + 2 * b + c + 2) >> 2 taken from the left column (the last ve[] entry
+ * of that range also uses top[-1]), stored so that samples nearer the
+ * bottom of the column come first.
+ * From index size / 2 - 1 on, ve[] holds 2-tap averages (a + b + 1) >> 1
+ * and vo[] holds 3-tap values along top[-1 .. size - 1].
+ *
+ * Row pair j (rows 2j and 2j + 1) copies size bytes starting at index
+ * size / 2 - 1 - j, so every further pair of rows is shifted right by one
+ * sample and pulls in one more left-column value.
+ *
+ * Reads left[0 .. size - 2] and top[-1 .. size - 1].
+ */
+#define def_vert_right(size)                                                \
+static void vert_right_ ## size ## x ## size ## _c(uint8_t *dst,            \
+                                                   ptrdiff_t stride,        \
+                                                   const uint8_t *left,     \
+                                                   const uint8_t *top)      \
+{                                                                           \
+    int i, j;                                                               \
+    uint8_t ve[size + size / 2 - 1], vo[size + size / 2 - 1];               \
+                                                                            \
+    for (i = 0; i < size / 2 - 2; i++) {                                    \
+        vo[i] = (left[size - 4 - i * 2] +                                   \
+                 left[size - 3 - i * 2] * 2 +                               \
+                 left[size - 2 - i * 2] + 2) >> 2;                          \
+        ve[i] = (left[size - 5 - i * 2] +                                   \
+                 left[size - 4 - i * 2] * 2 +                               \
+                 left[size - 3 - i * 2] + 2) >> 2;                          \
+    }                                                                       \
+    vo[size / 2 - 2] = (left[0] + left[1] * 2 + left[2] + 2) >> 2;          \
+    ve[size / 2 - 2] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;          \
+                                                                            \
+    ve[size / 2 - 1] = (top[-1] + top[0] + 1) >> 1;                         \
+    vo[size / 2 - 1] = (left[0] + top[-1] * 2 + top[0] + 2) >> 2;           \
+    for (i = 0; i < size - 1; i++) {                                        \
+        ve[size / 2 + i] = (top[i] + top[i + 1] + 1) >> 1;                  \
+        vo[size / 2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
+    }                                                                       \
+                                                                            \
+    for (j = 0; j < size / 2; j++) {                                        \
+        memcpy(dst +  j * 2      * stride, ve + size / 2 - 1 - j, size);    \
+        memcpy(dst + (j * 2 + 1) * stride, vo + size / 2 - 1 - j, size);    \
+    }                                                                       \
+}
+
+def_vert_right(8)
+def_vert_right(16)
+def_vert_right(32)
+
+/*
+ * "hor_down" fill, 4x4.  One line v[0..9] is built first and row y of the
+ * block is the four entries starting at v[6 - 2 * y]:
+ *   v[0..5] - (2-tap, 3-tap) pairs walking up the left column, where
+ *             2-tap is (a + b + 1) >> 1 and 3-tap is
+ *             (a + 2 * b + c + 2) >> 2;
+ *   v[6..7] - the pair that involves top[-1];
+ *   v[8..9] - 3-tap values along the top row.
+ *
+ * Reads left[0..3] and top[-1..2].
+ */
+static void hor_down_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                           const uint8_t *left, const uint8_t *top)
+{
+    const int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];
+    const int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];
+    uint8_t v[10];
+    int x, y;
+
+    v[0] = (l2 + l3          + 1) >> 1;
+    v[1] = (l1 + l2 * 2 + l3 + 2) >> 2;
+    v[2] = (l1 + l2          + 1) >> 1;
+    v[3] = (l0 + l1 * 2 + l2 + 2) >> 2;
+    v[4] = (l0 + l1          + 1) >> 1;
+    v[5] = (tl + l0 * 2 + l1 + 2) >> 2;
+    v[6] = (tl + l0          + 1) >> 1;
+    v[7] = (a0 + tl * 2 + l0 + 2) >> 2;
+    v[8] = (tl + a0 * 2 + a1 + 2) >> 2;
+    v[9] = (a0 + a1 * 2 + a2 + 2) >> 2;
+
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = v[6 - 2 * y + x];
+}
+
+/*
+ * Generates hor_down_<size>x<size>_c().
+ *
+ * One line v[] of size * 3 - 2 values is built before dst is written:
+ *   v[0 .. size * 2 - 1]  - interleaved pairs, even index = 2-tap average
+ *                           (a + b + 1) >> 1, odd index = 3-tap value
+ *                           (a + 2 * b + c + 2) >> 2, walking up the left
+ *                           column from left[size - 1]; the last two pairs
+ *                           (indices size * 2 - 4 .. size * 2 - 1) are
+ *                           written after the loop and involve top[-1];
+ *   v[size * 2 .. ]       - 3-tap values along top[-1 .. size - 2].
+ *
+ * Row j copies size bytes starting at v[size * 2 - 2 - j * 2], so each row
+ * is the previous one shifted right by two entries.
+ *
+ * Reads left[0 .. size - 1] and top[-1 .. size - 2].
+ */
+#define def_hor_down(size)                                              \
+static void hor_down_ ## size ## x ## size ## _c(uint8_t *dst,          \
+                                                 ptrdiff_t stride,      \
+                                                 const uint8_t *left,   \
+                                                 const uint8_t *top)    \
+{                                                                       \
+    int i, j;                                                           \
+    uint8_t v[size * 3 - 2];                                            \
+                                                                        \
+    for (i = 0; i < size - 2; i++) {                                    \
+        v[i * 2]        = (left[size - 2 - i] +                         \
+                           left[size - 1 - i] + 1) >> 1;                \
+        v[i * 2    + 1] = (left[size - 3 - i] +                         \
+                           left[size - 2 - i] * 2 +                     \
+                           left[size - 1 - i] + 2) >> 2;                \
+        v[size * 2 + i] = (top[i - 1] +                                 \
+                           top[i] * 2 +                                 \
+                           top[i + 1] + 2) >> 2;                        \
+    }                                                                   \
+    v[size * 2 - 2] = (top[-1] + left[0] + 1) >> 1;                     \
+    v[size * 2 - 4] = (left[0] + left[1] + 1) >> 1;                     \
+    v[size * 2 - 1] = (top[0]  + top[-1] * 2 + left[0] + 2) >> 2;       \
+    v[size * 2 - 3] = (top[-1] + left[0] * 2 + left[1] + 2) >> 2;       \
+                                                                        \
+    for (j = 0; j < size; j++)                                          \
+        memcpy(dst + j * stride, v + size * 2 - 2 - j * 2, size);       \
+}
+
+def_hor_down(8)
+def_hor_down(16)
+def_hor_down(32)
+
+/*
+ * "vert_left" fill, 4x4.  Two lines are derived from the top row:
+ *   even[k] = (top[k] + top[k + 1] + 1) >> 1                     (2-tap)
+ *   odd[k]  = (top[k] + 2 * top[k + 1] + top[k + 2] + 2) >> 2    (3-tap)
+ * Even rows read from even[], odd rows from odd[], and each further pair
+ * of rows starts one entry later (row y starts at index y >> 1).
+ *
+ * Reads top[0..6]; left is not read.
+ */
+static void vert_left_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *left, const uint8_t *top)
+{
+    uint8_t even[5], odd[5];
+    int k, x, y;
+
+    for (k = 0; k < 5; k++) {
+        even[k] = (top[k] + top[k + 1]                  + 1) >> 1;
+        odd[k]  = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2;
+    }
+
+    for (y = 0; y < 4; y++) {
+        const uint8_t *src = ((y & 1) ? odd : even) + (y >> 1);
+
+        for (x = 0; x < 4; x++)
+            DST(x, y) = src[x];
+    }
+}
+
+/*
+ * Generates vert_left_<size>x<size>_c().
+ *
+ * Two lines of size - 1 values are derived from the top row:
+ *   even[k] = (top[k] + top[k + 1] + 1) >> 1                     (2-tap)
+ *   odd[k]  = (top[k] + 2 * top[k + 1] + top[k + 2] + 2) >> 2    (3-tap)
+ * The last odd[] entry weights top[size - 1] three times because there is
+ * no top[size] to read.
+ *
+ * Row y reads from even[] (y even) or odd[] (y odd) starting at index
+ * y >> 1; whatever does not fit in the line is padded on the right with
+ * top[size - 1].
+ *
+ * Reads top[0 .. size - 1]; left is not read.
+ */
+#define def_vert_left(size)                                             \
+static void vert_left_ ## size ## x ## size ## _c(uint8_t *dst,         \
+                                                  ptrdiff_t stride,     \
+                                                  const uint8_t *left,  \
+                                                  const uint8_t *top)   \
+{                                                                       \
+    uint8_t even[size - 1], odd[size - 1];                              \
+    int k, y;                                                           \
+                                                                        \
+    for (k = 0; k < size - 1; k++)                                      \
+        even[k] = (top[k] + top[k + 1] + 1) >> 1;                       \
+    for (k = 0; k < size - 2; k++)                                      \
+        odd[k] = (top[k] + top[k + 1] * 2 + top[k + 2] + 2) >> 2;       \
+    odd[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2;       \
+                                                                        \
+    for (y = 0; y < size; y++) {                                        \
+        const uint8_t *src = ((y & 1) ? odd : even) + (y >> 1);         \
+        uint8_t *row       = dst + y * stride;                          \
+        const int ncopy    = size - 1 - (y >> 1);                       \
+                                                                        \
+        memcpy(row, src, ncopy);                                        \
+        memset(row + ncopy, top[size - 1], size - ncopy);               \
+    }                                                                   \
+}
+
+def_vert_left(8)
+def_vert_left(16)
+def_vert_left(32)
+
+static void hor_up_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                         const uint8_t *left, const uint8_t *top)
+{
+    /* Interleaved 2-tap/3-tap averages walking down the left edge (top is
+     * not read), followed by copies of left[3]. DST(x, y) is set to
+     * ramp[x + 2 * y], i.e. each step in y moves two entries along. */
+    const int a = left[0], b = left[1], c = left[2], d = left[3];
+    const int ramp[10] = {
+        (a + b         + 1) >> 1,
+        (a + b * 2 + c + 2) >> 2,
+        (b + c         + 1) >> 1,
+        (b + c * 2 + d + 2) >> 2,
+        (c + d         + 1) >> 1,
+        (c + d * 3     + 2) >> 2,
+        d, d, d, d,
+    };
+    int x, y;
+
+    for (y = 0; y < 4; y++)
+        for (x = 0; x < 4; x++)
+            DST(x, y) = ramp[x + 2 * y];
+}
+
+#define def_hor_up(size) /* emits hor_up_<size>x<size>_c */                 \
+static void hor_up_ ## size ## x ## size ## _c(uint8_t *dst,                \
+                                               ptrdiff_t stride,            \
+                                               const uint8_t *left,         \
+                                               const uint8_t *top)          \
+{   /* reads left[0..size - 1]; top is not read */                          \
+    int i, j;                                                               \
+    uint8_t v[size * 2 - 2];                                                \
+    /* v[]: interleaved 2-tap/3-tap averages down the left edge */          \
+    for (i = 0; i < size - 2; i++) {                                        \
+        v[i * 2]     = (left[i] + left[i + 1] + 1) >> 1;                    \
+        v[i * 2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2;  \
+    }                                                                       \
+    v[size * 2 - 4] = (left[size - 2] + left[size - 1]     + 1) >> 1;       \
+    v[size * 2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2;       \
+    /* row j copies from v[2 * j]; the overrun is left[size - 1] fill */    \
+    for (j = 0; j < size / 2; j++)                                          \
+        memcpy(dst + j * stride, v + j * 2, size);                          \
+    for (j = size / 2; j < size; j++) {                                     \
+        memcpy(dst + j * stride, v + j * 2, size * 2 - 2 - j * 2);          \
+        memset(dst + j * stride + size * 2 - 2 - j * 2, left[size - 1],     \
+               2 + j * 2 - size);                                           \
+    }                                                                       \
+}
+
+def_hor_up(8)
+def_hor_up(16)
+def_hor_up(32)
+
+#undef DST
+
+static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
+{
+#define set_preds(idx, n) \
+    dsp->intra_pred[idx][DC_PRED]              = dc_             ## n ## _c; \
+    dsp->intra_pred[idx][LEFT_DC_PRED]         = dc_left_        ## n ## _c; \
+    dsp->intra_pred[idx][TOP_DC_PRED]          = dc_top_         ## n ## _c; \
+    dsp->intra_pred[idx][DC_127_PRED]          = dc_127_         ## n ## _c; \
+    dsp->intra_pred[idx][DC_128_PRED]          = dc_128_         ## n ## _c; \
+    dsp->intra_pred[idx][DC_129_PRED]          = dc_129_         ## n ## _c; \
+    dsp->intra_pred[idx][TM_VP8_PRED]          = tm_             ## n ## _c; \
+    dsp->intra_pred[idx][VERT_PRED]            = vert_           ## n ## _c; \
+    dsp->intra_pred[idx][HOR_PRED]             = hor_            ## n ## _c; \
+    dsp->intra_pred[idx][VERT_LEFT_PRED]       = vert_left_      ## n ## _c; \
+    dsp->intra_pred[idx][VERT_RIGHT_PRED]      = vert_right_     ## n ## _c; \
+    dsp->intra_pred[idx][HOR_UP_PRED]          = hor_up_         ## n ## _c; \
+    dsp->intra_pred[idx][HOR_DOWN_PRED]        = hor_down_       ## n ## _c; \
+    dsp->intra_pred[idx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_  ## n ## _c; \
+    dsp->intra_pred[idx][DIAG_DOWN_RIGHT_PRED] = diag_downright_ ## n ## _c
+    /* One table row per transform size, all pointing at the C versions. */
+    set_preds(TX_4X4,   4x4);
+    set_preds(TX_8X8,   8x8);
+    set_preds(TX_16X16, 16x16);
+    set_preds(TX_32X32, 32x32);
+
+#undef set_preds
+}
+
+#define itxfm_wrapper(type_a, type_b, sz, bits) /* 2 passes + add to dst */ \
+static void                                                                 \
+type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
+                                                      ptrdiff_t stride,     \
+                                                      int16_t *block,       \
+                                                      int eob)              \
+{                                                                           \
+    int i, j;                                                               \
+    int16_t tmp[sz * sz], out[sz];                                          \
+    for (i = 0; i < sz; i++) /* pass 0: type_a, input stride sz -> tmp */   \
+        type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);                \
+    memset(block, 0, sz * sz * sizeof(*block)); /* block is left zeroed */  \
+    for (i = 0; i < sz; i++) { /* pass 1: type_b -> dst column i */         \
+        type_b ## sz ## _1d(out, tmp + i, sz, 1);                           \
+        for (j = 0; j < sz; j++)                                            \
+            dst[j * stride] =                                               \
+                av_clip_uint8(dst[j * stride] +                             \
+                              (bits ? (out[j] + (1 << (bits - 1))) >> bits  \
+                                    : out[j])); /* bits == 0: no rounding */\
+        dst++;                                                              \
+    }                                                                       \
+}
+/* itxfm_wrap: one wrapper per idct/iadst combination for a given size. */
+#define itxfm_wrap(sz, bits)             \
+    itxfm_wrapper(idct, idct, sz, bits)  \
+    itxfm_wrapper(iadst, idct, sz, bits) \
+    itxfm_wrapper(idct, iadst, sz, bits) \
+    itxfm_wrapper(iadst, iadst, sz, bits)
+/* IN(x): element x of the strided input of a 1-D transform. */
+#define IN(x) in[x * stride]
+
+static av_always_inline void idct4_1d(int16_t *out, const int16_t *in,
+                                      ptrdiff_t stride, int pass)
+{
+    /* Q14 butterflies with round-to-nearest; pass is not used here. */
+    const int x0 = IN(0), x1 = IN(1), x2 = IN(2), x3 = IN(3);
+    const int even_sum  = ((x0 + x2) * 11585 + (1 << 13)) >> 14;
+    const int even_diff = ((x0 - x2) * 11585 + (1 << 13)) >> 14;
+    const int odd_lo    = (x1 *  6270 - x3 * 15137 + (1 << 13)) >> 14;
+    const int odd_hi    = (x1 * 15137 + x3 *  6270 + (1 << 13)) >> 14;
+
+    out[0] = even_sum  + odd_hi;
+    out[1] = even_diff + odd_lo;
+    out[2] = even_diff - odd_lo;
+    out[3] = even_sum  - odd_hi;
+}
+
+static av_always_inline void iadst4_1d(int16_t *out, const int16_t *in,
+                                       ptrdiff_t stride, int pass)
+{
+    /* Rounded >> 14 is applied once, on the final sums; pass is unused. */
+    const int x0 = IN(0), x1 = IN(1), x2 = IN(2), x3 = IN(3);
+    const int s0 =  5283 * x0 + 15212 * x2 +  9929 * x3;
+    const int s1 =  9929 * x0 -  5283 * x2 - 15212 * x3;
+    const int s2 = 13377 * (x0 - x2 + x3);
+    const int s3 = 13377 * x1;
+
+    out[0] = (s0 + s3      + (1 << 13)) >> 14;
+    out[1] = (s1 + s3      + (1 << 13)) >> 14;
+    out[2] = (s2           + (1 << 13)) >> 14;
+    out[3] = (s0 + s1 - s3 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(4, 4) /* 4x4 wrappers; the result is rounded and shifted right by 4 */
+
+static av_always_inline void idct8_1d(int16_t *out, const int16_t *in,
+                                      ptrdiff_t stride, int pass)
+{   /* 8-point 1-D transform of in[k * stride] into out[0..7]; pass is unused */
+    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+    /* Stage 1: Q14 rotations; "+ (1 << 13)) >> 14" rounds to nearest. */
+    t0a = ((IN(0)        + IN(4)) * 11585 + (1 << 13)) >> 14;
+    t1a = ((IN(0)        - IN(4)) * 11585 + (1 << 13)) >> 14;
+    t2a = (IN(2) *  6270 - IN(6)  * 15137 + (1 << 13)) >> 14;
+    t3a = (IN(2) * 15137 + IN(6)  *  6270 + (1 << 13)) >> 14;
+    t4a = (IN(1) *  3196 - IN(7)  * 16069 + (1 << 13)) >> 14;
+    t5a = (IN(5) * 13623 - IN(3)  *  9102 + (1 << 13)) >> 14;
+    t6a = (IN(5) *  9102 + IN(3)  * 13623 + (1 << 13)) >> 14;
+    t7a = (IN(1) * 16069 + IN(7)  *  3196 + (1 << 13)) >> 14;
+    /* Stage 2: butterflies; t5a and t6a are reused to hold the differences. */
+    t0  = t0a + t3a;
+    t1  = t1a + t2a;
+    t2  = t1a - t2a;
+    t3  = t0a - t3a;
+    t4  = t4a + t5a;
+    t5a = t4a - t5a;
+    t7  = t7a + t6a;
+    t6a = t7a - t6a;
+    /* Stage 3: scale the two differences by 11585 / 2^14. */
+    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
+    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
+    /* Output: out[k] / out[7 - k] are the sum / difference of one pair. */
+    out[0] = t0 + t7;
+    out[1] = t1 + t6;
+    out[2] = t2 + t5;
+    out[3] = t3 + t4;
+    out[4] = t3 - t4;
+    out[5] = t2 - t5;
+    out[6] = t1 - t6;
+    out[7] = t0 - t7;
+}
+
+static av_always_inline void iadst8_1d(int16_t *out, const int16_t *in,
+                                       ptrdiff_t stride, int pass)
+{   /* 8-point 1-D transform of in[k * stride] into out[0..7]; pass is unused */
+    int t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
+    /* Stage 1: unrounded products of the input pairs IN(7 - k), IN(k). */
+    t0a = 16305 * IN(7) +  1606 * IN(0);
+    t1a =  1606 * IN(7) - 16305 * IN(0);
+    t2a = 14449 * IN(5) +  7723 * IN(2);
+    t3a =  7723 * IN(5) - 14449 * IN(2);
+    t4a = 10394 * IN(3) + 12665 * IN(4);
+    t5a = 12665 * IN(3) - 10394 * IN(4);
+    t6a =  4756 * IN(1) + 15679 * IN(6);
+    t7a = 15679 * IN(1) -  4756 * IN(6);
+    /* Stage 2: butterflies, rounded and shifted back down by 14 bits. */
+    t0  = (t0a + t4a + (1 << 13)) >> 14;
+    t1  = (t1a + t5a + (1 << 13)) >> 14;
+    t2  = (t2a + t6a + (1 << 13)) >> 14;
+    t3  = (t3a + t7a + (1 << 13)) >> 14;
+    t4  = (t0a - t4a + (1 << 13)) >> 14;
+    t5  = (t1a - t5a + (1 << 13)) >> 14;
+    t6  = (t2a - t6a + (1 << 13)) >> 14;
+    t7  = (t3a - t7a + (1 << 13)) >> 14;
+    /* Stage 3: rotate t4..t7 (left unrounded); t0..t3 pass through. */
+    t4a = 15137 * t4 +  6270 * t5;
+    t5a =  6270 * t4 - 15137 * t5;
+    t6a = 15137 * t7 -  6270 * t6;
+    t7a =  6270 * t7 + 15137 * t6;
+    /* out[0]/out[7] are final; t2/t3 are overwritten with the differences. */
+    out[0] =   t0 + t2;
+    out[7] = -(t1 + t3);
+    t2     =   t0 - t2;
+    t3     =   t1 - t3;
+    /* Same pattern for the rotated half, with the rounding applied here. */
+    out[1] = -((t4a + t6a + (1 << 13)) >> 14);
+    out[6] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6     =   (t4a - t6a + (1 << 13)) >> 14;
+    t7     =   (t5a - t7a + (1 << 13)) >> 14;
+    /* Remaining outputs: 11585 / 2^14 scaling; negation follows the shift. */
+    out[3] = -(((t2 + t3) * 11585 + (1 << 13)) >> 14);
+    out[4] =   ((t2 - t3) * 11585 + (1 << 13)) >> 14;
+    out[2] =   ((t6 + t7) * 11585 + (1 << 13)) >> 14;
+    out[5] = -(((t6 - t7) * 11585 + (1 << 13)) >> 14);
+}
+
+itxfm_wrap(8, 5) /* 8x8 wrappers; the result is rounded and shifted right by 5 */
+
+static av_always_inline void idct16_1d(int16_t *out, const int16_t *in,
+                                       ptrdiff_t stride, int pass)
+{   /* 16-point 1-D transform of in[k * stride] into out[0..15]; pass is unused */
+    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+    /* Stage 1: Q14 rotations of input pairs, each rounded to nearest. */
+    t0a  = ((IN(0)         + IN(8)) * 11585 + (1 << 13)) >> 14;
+    t1a  = ((IN(0)         - IN(8)) * 11585 + (1 << 13)) >> 14;
+    t2a  = (IN(4)  *  6270 - IN(12) * 15137 + (1 << 13)) >> 14;
+    t3a  = (IN(4)  * 15137 + IN(12) *  6270 + (1 << 13)) >> 14;
+    t4a  = (IN(2)  *  3196 - IN(14) * 16069 + (1 << 13)) >> 14;
+    t7a  = (IN(2)  * 16069 + IN(14) *  3196 + (1 << 13)) >> 14;
+    t5a  = (IN(10) * 13623 - IN(6)  *  9102 + (1 << 13)) >> 14;
+    t6a  = (IN(10) *  9102 + IN(6)  * 13623 + (1 << 13)) >> 14;
+    t8a  = (IN(1)  *  1606 - IN(15) * 16305 + (1 << 13)) >> 14;
+    t15a = (IN(1)  * 16305 + IN(15) *  1606 + (1 << 13)) >> 14;
+    t9a  = (IN(9)  * 12665 - IN(7)  * 10394 + (1 << 13)) >> 14;
+    t14a = (IN(9)  * 10394 + IN(7)  * 12665 + (1 << 13)) >> 14;
+    t10a = (IN(5)  *  7723 - IN(11) * 14449 + (1 << 13)) >> 14;
+    t13a = (IN(5)  * 14449 + IN(11) *  7723 + (1 << 13)) >> 14;
+    t11a = (IN(13) * 15679 - IN(3)  *  4756 + (1 << 13)) >> 14;
+    t12a = (IN(13) *  4756 + IN(3)  * 15679 + (1 << 13)) >> 14;
+    /* Stage 2: butterflies on the stage-1 results. */
+    t0   = t0a  + t3a;
+    t1   = t1a  + t2a;
+    t2   = t1a  - t2a;
+    t3   = t0a  - t3a;
+    t4   = t4a  + t5a;
+    t5   = t4a  - t5a;
+    t6   = t7a  - t6a;
+    t7   = t7a  + t6a;
+    t8   = t8a  + t9a;
+    t9   = t8a  - t9a;
+    t10  = t11a - t10a;
+    t11  = t11a + t10a;
+    t12  = t12a + t13a;
+    t13  = t12a - t13a;
+    t14  = t15a - t14a;
+    t15  = t15a + t14a;
+    /* Stage 3: second set of rotations; results go to the "a" variables. */
+    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
+    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
+    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+    /* Stage 4: butterflies; several variables are reused from here on. */
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4   = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7   = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    /* Stage 5: 11585 / 2^14 scaling of the inner sums and differences. */
+    t10a = ((t13  - t10)  * 11585 + (1 << 13)) >> 14;
+    t13a = ((t13  + t10)  * 11585 + (1 << 13)) >> 14;
+    t11  = ((t12a - t11a) * 11585 + (1 << 13)) >> 14;
+    t12  = ((t12a + t11a) * 11585 + (1 << 13)) >> 14;
+    /* Output: out[k] / out[15 - k] are the sum / difference of one pair. */
+    out[0]  = t0a + t15a;
+    out[1]  = t1a + t14;
+    out[2]  = t2a + t13a;
+    out[3]  = t3a + t12;
+    out[4]  = t4  + t11;
+    out[5]  = t5  + t10a;
+    out[6]  = t6  + t9;
+    out[7]  = t7  + t8a;
+    out[8]  = t7  - t8a;
+    out[9]  = t6  - t9;
+    out[10] = t5  - t10a;
+    out[11] = t4  - t11;
+    out[12] = t3a - t12;
+    out[13] = t2a - t13a;
+    out[14] = t1a - t14;
+    out[15] = t0a - t15a;
+}
+
+static av_always_inline void iadst16_1d(int16_t *out, const int16_t *in,
+                                        ptrdiff_t stride, int pass)
+{   /* 16-point 1-D transform of in[k * stride] into out[0..15]; pass is unused */
+    int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    int t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
+    int t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;
+    /* Stage 1: unrounded products of the input pairs IN(15 - k), IN(k). */
+    t0  = IN(15) * 16364 + IN(0)  *   804;
+    t1  = IN(15) *   804 - IN(0)  * 16364;
+    t2  = IN(13) * 15893 + IN(2)  *  3981;
+    t3  = IN(13) *  3981 - IN(2)  * 15893;
+    t4  = IN(11) * 14811 + IN(4)  *  7005;
+    t5  = IN(11) *  7005 - IN(4)  * 14811;
+    t6  = IN(9)  * 13160 + IN(6)  *  9760;
+    t7  = IN(9)  *  9760 - IN(6)  * 13160;
+    t8  = IN(7)  * 11003 + IN(8)  * 12140;
+    t9  = IN(7)  * 12140 - IN(8)  * 11003;
+    t10 = IN(5)  *  8423 + IN(10) * 14053;
+    t11 = IN(5)  * 14053 - IN(10) *  8423;
+    t12 = IN(3)  *  5520 + IN(12) * 15426;
+    t13 = IN(3)  * 15426 - IN(12) *  5520;
+    t14 = IN(1)  *  2404 + IN(14) * 16207;
+    t15 = IN(1)  * 16207 - IN(14) *  2404;
+    /* Stage 2: butterflies, rounded and shifted back down by 14 bits. */
+    t0a  = (t0 + t8  + (1 << 13)) >> 14;
+    t1a  = (t1 + t9  + (1 << 13)) >> 14;
+    t2a  = (t2 + t10 + (1 << 13)) >> 14;
+    t3a  = (t3 + t11 + (1 << 13)) >> 14;
+    t4a  = (t4 + t12 + (1 << 13)) >> 14;
+    t5a  = (t5 + t13 + (1 << 13)) >> 14;
+    t6a  = (t6 + t14 + (1 << 13)) >> 14;
+    t7a  = (t7 + t15 + (1 << 13)) >> 14;
+    t8a  = (t0 - t8  + (1 << 13)) >> 14;
+    t9a  = (t1 - t9  + (1 << 13)) >> 14;
+    t10a = (t2 - t10 + (1 << 13)) >> 14;
+    t11a = (t3 - t11 + (1 << 13)) >> 14;
+    t12a = (t4 - t12 + (1 << 13)) >> 14;
+    t13a = (t5 - t13 + (1 << 13)) >> 14;
+    t14a = (t6 - t14 + (1 << 13)) >> 14;
+    t15a = (t7 - t15 + (1 << 13)) >> 14;
+    /* Stage 3: rotate the difference half (t8a..t15a), left unrounded. */
+    t8   = t8a  * 16069 + t9a  *  3196;
+    t9   = t8a  *  3196 - t9a  * 16069;
+    t10  = t10a *  9102 + t11a * 13623;
+    t11  = t10a * 13623 - t11a *  9102;
+    t12  = t13a * 16069 - t12a *  3196;
+    t13  = t13a *  3196 + t12a * 16069;
+    t14  = t15a *  9102 - t14a * 13623;
+    t15  = t15a * 13623 + t14a *  9102;
+    /* Stage 4: plain butterflies for t0..t7, rounded ones for t8a..t15a. */
+    t0   = t0a  + t4a;
+    t1   = t1a  + t5a;
+    t2   = t2a  + t6a;
+    t3   = t3a  + t7a;
+    t4   = t0a  - t4a;
+    t5   = t1a  - t5a;
+    t6   = t2a  - t6a;
+    t7   = t3a  - t7a;
+    t8a  = (t8  + t12 + (1 << 13)) >> 14;
+    t9a  = (t9  + t13 + (1 << 13)) >> 14;
+    t10a = (t10 + t14 + (1 << 13)) >> 14;
+    t11a = (t11 + t15 + (1 << 13)) >> 14;
+    t12a = (t8  - t12 + (1 << 13)) >> 14;
+    t13a = (t9  - t13 + (1 << 13)) >> 14;
+    t14a = (t10 - t14 + (1 << 13)) >> 14;
+    t15a = (t11 - t15 + (1 << 13)) >> 14;
+    /* Stage 5: rotate t4..t7 and t12a..t15a, left unrounded. */
+    t4a  = t4   * 15137 + t5   *  6270;
+    t5a  = t4   *  6270 - t5   * 15137;
+    t6a  = t7   * 15137 - t6   *  6270;
+    t7a  = t7   *  6270 + t6   * 15137;
+    t12  = t12a * 15137 + t13a *  6270;
+    t13  = t12a *  6270 - t13a * 15137;
+    t14  = t15a * 15137 - t14a *  6270;
+    t15  = t15a *  6270 + t14a * 15137;
+    /* Stage 6: first eight outputs; the differences feed the last stage. */
+    out[0]  =     t0 + t2;
+    out[15] =   -(t1 + t3);
+    t2a     =     t0 - t2;
+    t3a     =     t1 - t3;
+    out[3]  = -((t4a + t6a + (1 << 13)) >> 14);
+    out[12] =   (t5a + t7a + (1 << 13)) >> 14;
+    t6      =   (t4a - t6a + (1 << 13)) >> 14;
+    t7      =   (t5a - t7a + (1 << 13)) >> 14;
+    out[1]  =  -(t8a + t10a);
+    out[14] =    t9a + t11a;
+    t10     =    t8a - t10a;
+    t11     =    t9a - t11a;
+    out[2]  =   (t12 + t14 + (1 << 13)) >> 14;
+    out[13] = -((t13 + t15 + (1 << 13)) >> 14);
+    t14a    =   (t12 - t14 + (1 << 13)) >> 14;
+    t15a    =   (t13 - t15 + (1 << 13)) >> 14;
+    /* Stage 7: +/-11585 / 2^14 scaling; the sign is part of the multiplier. */
+    out[7]  = ((t2a  + t3a)  * -11585 + (1 << 13)) >> 14;
+    out[8]  = ((t2a  - t3a)  *  11585 + (1 << 13)) >> 14;
+    out[4]  = ((t7   + t6)   *  11585 + (1 << 13)) >> 14;
+    out[11] = ((t7   - t6)   *  11585 + (1 << 13)) >> 14;
+    out[6]  = ((t11  + t10)  *  11585 + (1 << 13)) >> 14;
+    out[9]  = ((t11  - t10)  *  11585 + (1 << 13)) >> 14;
+    out[5]  = ((t14a + t15a) * -11585 + (1 << 13)) >> 14;
+    out[10] = ((t14a - t15a) *  11585 + (1 << 13)) >> 14;
+}
+
+itxfm_wrap(16, 6) /* 16x16 wrappers; the result is rounded and shifted right by 6 */
+
+static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
+                                       ptrdiff_t stride, int pass)
+{   /* 32-point 1-D transform; pass is unused. Stage 1: rounded Q14 rotations. */
+    int t0a  = ((IN(0)         + IN(16)) * 11585 + (1 << 13)) >> 14;
+    int t1a  = ((IN(0)         - IN(16)) * 11585 + (1 << 13)) >> 14;
+    int t2a  = (IN(8)  *  6270 - IN(24)  * 15137 + (1 << 13)) >> 14;
+    int t3a  = (IN(8)  * 15137 + IN(24)  *  6270 + (1 << 13)) >> 14;
+    int t4a  = (IN(4)  *  3196 - IN(28)  * 16069 + (1 << 13)) >> 14;
+    int t7a  = (IN(4)  * 16069 + IN(28)  *  3196 + (1 << 13)) >> 14;
+    int t5a  = (IN(20) * 13623 - IN(12)  *  9102 + (1 << 13)) >> 14;
+    int t6a  = (IN(20) *  9102 + IN(12)  * 13623 + (1 << 13)) >> 14;
+    int t8a  = (IN(2)  *  1606 - IN(30)  * 16305 + (1 << 13)) >> 14;
+    int t15a = (IN(2)  * 16305 + IN(30)  *  1606 + (1 << 13)) >> 14;
+    int t9a  = (IN(18) * 12665 - IN(14)  * 10394 + (1 << 13)) >> 14;
+    int t14a = (IN(18) * 10394 + IN(14)  * 12665 + (1 << 13)) >> 14;
+    int t10a = (IN(10) *  7723 - IN(22)  * 14449 + (1 << 13)) >> 14;
+    int t13a = (IN(10) * 14449 + IN(22)  *  7723 + (1 << 13)) >> 14;
+    int t11a = (IN(26) * 15679 - IN(6)   *  4756 + (1 << 13)) >> 14;
+    int t12a = (IN(26) *  4756 + IN(6)   * 15679 + (1 << 13)) >> 14;
+    int t16a = (IN(1)  *   804 - IN(31)  * 16364 + (1 << 13)) >> 14;
+    int t31a = (IN(1)  * 16364 + IN(31)  *   804 + (1 << 13)) >> 14;
+    int t17a = (IN(17) * 12140 - IN(15)  * 11003 + (1 << 13)) >> 14;
+    int t30a = (IN(17) * 11003 + IN(15)  * 12140 + (1 << 13)) >> 14;
+    int t18a = (IN(9)  *  7005 - IN(23)  * 14811 + (1 << 13)) >> 14;
+    int t29a = (IN(9)  * 14811 + IN(23)  *  7005 + (1 << 13)) >> 14;
+    int t19a = (IN(25) * 15426 - IN(7)   *  5520 + (1 << 13)) >> 14;
+    int t28a = (IN(25) *  5520 + IN(7)   * 15426 + (1 << 13)) >> 14;
+    int t20a = (IN(5)  *  3981 - IN(27)  * 15893 + (1 << 13)) >> 14;
+    int t27a = (IN(5)  * 15893 + IN(27)  *  3981 + (1 << 13)) >> 14;
+    int t21a = (IN(21) * 14053 - IN(11)  *  8423 + (1 << 13)) >> 14;
+    int t26a = (IN(21) *  8423 + IN(11)  * 14053 + (1 << 13)) >> 14;
+    int t22a = (IN(13) *  9760 - IN(19)  * 13160 + (1 << 13)) >> 14;
+    int t25a = (IN(13) * 13160 + IN(19)  *  9760 + (1 << 13)) >> 14;
+    int t23a = (IN(29) * 16207 - IN(3)   *  2404 + (1 << 13)) >> 14;
+    int t24a = (IN(29) *  2404 + IN(3)   * 16207 + (1 << 13)) >> 14;
+    /* Stage 2: butterflies on the stage-1 results. */
+    int t0  = t0a  + t3a;
+    int t1  = t1a  + t2a;
+    int t2  = t1a  - t2a;
+    int t3  = t0a  - t3a;
+    int t4  = t4a  + t5a;
+    int t5  = t4a  - t5a;
+    int t6  = t7a  - t6a;
+    int t7  = t7a  + t6a;
+    int t8  = t8a  + t9a;
+    int t9  = t8a  - t9a;
+    int t10 = t11a - t10a;
+    int t11 = t11a + t10a;
+    int t12 = t12a + t13a;
+    int t13 = t12a - t13a;
+    int t14 = t15a - t14a;
+    int t15 = t15a + t14a;
+    int t16 = t16a + t17a;
+    int t17 = t16a - t17a;
+    int t18 = t19a - t18a;
+    int t19 = t19a + t18a;
+    int t20 = t20a + t21a;
+    int t21 = t20a - t21a;
+    int t22 = t23a - t22a;
+    int t23 = t23a + t22a;
+    int t24 = t24a + t25a;
+    int t25 = t24a - t25a;
+    int t26 = t27a - t26a;
+    int t27 = t27a + t26a;
+    int t28 = t28a + t29a;
+    int t29 = t28a - t29a;
+    int t30 = t31a - t30a;
+    int t31 = t31a + t30a;
+    /* Stage 3: rotations; the "a" variables are overwritten with the results. */
+    t5a  =   ((t6         - t5) * 11585  + (1 << 13)) >> 14;
+    t6a  =   ((t6         + t5) * 11585  + (1 << 13)) >> 14;
+    t9a  =   (t14 *  6270 - t9  * 15137  + (1 << 13)) >> 14;
+    t14a =   (t14 * 15137 + t9  *  6270  + (1 << 13)) >> 14;
+    t10a = (-(t13 * 15137 + t10 *  6270) + (1 << 13)) >> 14;
+    t13a =   (t13 *  6270 - t10 * 15137  + (1 << 13)) >> 14;
+    t17a =   (t30 *  3196 - t17 * 16069  + (1 << 13)) >> 14;
+    t30a =   (t30 * 16069 + t17 *  3196  + (1 << 13)) >> 14;
+    t18a = (-(t29 * 16069 + t18 *  3196) + (1 << 13)) >> 14;
+    t29a =   (t29 *  3196 - t18 * 16069  + (1 << 13)) >> 14;
+    t21a =   (t26 * 13623 - t21 *  9102  + (1 << 13)) >> 14;
+    t26a =   (t26 *  9102 + t21 * 13623  + (1 << 13)) >> 14;
+    t22a = (-(t25 *  9102 + t22 * 13623) + (1 << 13)) >> 14;
+    t25a =   (t25 * 13623 - t22 *  9102  + (1 << 13)) >> 14;
+    /* Stage 4: butterflies; variables are reused, so statement order matters. */
+    t0a  = t0   + t7;
+    t1a  = t1   + t6a;
+    t2a  = t2   + t5a;
+    t3a  = t3   + t4;
+    t4a  = t3   - t4;
+    t5   = t2   - t5a;
+    t6   = t1   - t6a;
+    t7a  = t0   - t7;
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
+    t16a = t16  + t19;
+    t17  = t17a + t18a;
+    t18  = t17a - t18a;
+    t19a = t16  - t19;
+    t20a = t23  - t20;
+    t21  = t22a - t21a;
+    t22  = t22a + t21a;
+    t23a = t23  + t20;
+    t24a = t24  + t27;
+    t25  = t25a + t26a;
+    t26  = t25a - t26a;
+    t27a = t24  - t27;
+    t28a = t31  - t28;
+    t29  = t30a - t29a;
+    t30  = t30a + t29a;
+    t31a = t31  + t28;
+    /* Stage 5: rotations and 11585 / 2^14 scalings. */
+    t10a = ((t13           - t10)  * 11585  + (1 << 13)) >> 14;
+    t13a = ((t13           + t10)  * 11585  + (1 << 13)) >> 14;
+    t11  = ((t12a          - t11a) * 11585  + (1 << 13)) >> 14;
+    t12  = ((t12a          + t11a) * 11585  + (1 << 13)) >> 14;
+    t18a =   (t29  *  6270 - t18   * 15137  + (1 << 13)) >> 14;
+    t29a =   (t29  * 15137 + t18   *  6270  + (1 << 13)) >> 14;
+    t19  =   (t28a *  6270 - t19a  * 15137  + (1 << 13)) >> 14;
+    t28  =   (t28a * 15137 + t19a  *  6270  + (1 << 13)) >> 14;
+    t20  = (-(t27a * 15137 + t20a  *  6270) + (1 << 13)) >> 14;
+    t27  =   (t27a *  6270 - t20a  * 15137  + (1 << 13)) >> 14;
+    t21a = (-(t26  * 15137 + t21   *  6270) + (1 << 13)) >> 14;
+    t26a =   (t26  *  6270 - t21   * 15137  + (1 << 13)) >> 14;
+    /* Stage 6: butterflies. */
+    t0   = t0a  + t15a;
+    t1   = t1a  + t14;
+    t2   = t2a  + t13a;
+    t3   = t3a  + t12;
+    t4   = t4a  + t11;
+    t5a  = t5   + t10a;
+    t6a  = t6   + t9;
+    t7   = t7a  + t8a;
+    t8   = t7a  - t8a;
+    t9a  = t6   - t9;
+    t10  = t5   - t10a;
+    t11a = t4a  - t11;
+    t12a = t3a  - t12;
+    t13  = t2a  - t13a;
+    t14a = t1a  - t14;
+    t15  = t0a  - t15a;
+    t16  = t16a + t23a;
+    t17a = t17  + t22;
+    t18  = t18a + t21a;
+    t19a = t19  + t20;
+    t20a = t19  - t20;
+    t21  = t18a - t21a;
+    t22a = t17  - t22;
+    t23  = t16a - t23a;
+    t24  = t31a - t24a;
+    t25a = t30  - t25;
+    t26  = t29a - t26a;
+    t27a = t28  - t27;
+    t28a = t28  + t27;
+    t29  = t29a + t26a;
+    t30a = t30  + t25;
+    t31  = t31a + t24a;
+    /* Stage 7: 11585 / 2^14 scaling of the t20..t27 sums and differences. */
+    t20  = ((t27a - t20a) * 11585 + (1 << 13)) >> 14;
+    t27  = ((t27a + t20a) * 11585 + (1 << 13)) >> 14;
+    t21a = ((t26  - t21)  * 11585 + (1 << 13)) >> 14;
+    t26a = ((t26  + t21)  * 11585 + (1 << 13)) >> 14;
+    t22  = ((t25a - t22a) * 11585 + (1 << 13)) >> 14;
+    t25  = ((t25a + t22a) * 11585 + (1 << 13)) >> 14;
+    t23a = ((t24  - t23)  * 11585 + (1 << 13)) >> 14;
+    t24a = ((t24  + t23)  * 11585 + (1 << 13)) >> 14;
+    /* Output: out[k] / out[31 - k] are the sum / difference of one pair. */
+    out[0]  = t0   + t31;
+    out[1]  = t1   + t30a;
+    out[2]  = t2   + t29;
+    out[3]  = t3   + t28a;
+    out[4]  = t4   + t27;
+    out[5]  = t5a  + t26a;
+    out[6]  = t6a  + t25;
+    out[7]  = t7   + t24a;
+    out[8]  = t8   + t23a;
+    out[9]  = t9a  + t22;
+    out[10] = t10  + t21a;
+    out[11] = t11a + t20;
+    out[12] = t12a + t19a;
+    out[13] = t13  + t18;
+    out[14] = t14a + t17a;
+    out[15] = t15  + t16;
+    out[16] = t15  - t16;
+    out[17] = t14a - t17a;
+    out[18] = t13  - t18;
+    out[19] = t12a - t19a;
+    out[20] = t11a - t20;
+    out[21] = t10  - t21a;
+    out[22] = t9a  - t22;
+    out[23] = t8   - t23a;
+    out[24] = t7   - t24a;
+    out[25] = t6a  - t25;
+    out[26] = t5a  - t26a;
+    out[27] = t4   - t27;
+    out[28] = t3   - t28a;
+    out[29] = t2   - t29;
+    out[30] = t1   - t30a;
+    out[31] = t0   - t31;
+}
+
+itxfm_wrapper(idct, idct, 32, 6) /* only idct_idct_32x32_add_c is generated for this size */
+
+static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
+                                      ptrdiff_t stride, int pass)
+{
+    /* Only the first pass (pass == 0) pre-scales its inputs by >> 2;
+     * any other pass value consumes them unchanged (a shift by 0).
+     * Inputs are read through IN(), i.e. with the given stride. */
+    const int prescale = pass == 0 ? 2 : 0;
+    int a = IN(0) >> prescale;
+    int b = IN(1) >> prescale;
+    int c = IN(2) >> prescale;
+    int d = IN(3) >> prescale;
+    int half;
+
+    /* Lifting steps: each line uses the result of the ones before it,
+     * so the order of these statements must not change. */
+    a   += b;
+    c   -= d;
+    half = (a - c) >> 1;
+    d    = half - d;
+    b    = half - b;
+    a   -= d;
+    c   += b;
+
+    /* The values that started out as IN(3) and IN(1) end up in
+     * out[1] and out[2] respectively. */
+    out[0] = a;
+    out[1] = d;
+    out[2] = b;
+    out[3] = c;
+}
+
+itxfm_wrapper(iwht, iwht, 4, 0) /* iwht_iwht_4x4_add_c; bits == 0, so no final rounding shift */
+
+#undef IN
+#undef itxfm_wrapper
+#undef itxfm_wrap
+
+static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
+{
+    /* Per-size table rows: four distinct wrappers, or one shared by all. */
+#define set_four(idx, dim) \
+    dsp->itxfm_add[idx][ADST_ADST] = iadst_iadst_ ## dim ## _add_c; \
+    dsp->itxfm_add[idx][ADST_DCT]  = idct_iadst_  ## dim ## _add_c; \
+    dsp->itxfm_add[idx][DCT_ADST]  = iadst_idct_  ## dim ## _add_c; \
+    dsp->itxfm_add[idx][DCT_DCT]   = idct_idct_   ## dim ## _add_c
+#define set_same(idx, fn) \
+    dsp->itxfm_add[idx][DCT_DCT]   = fn ## _add_c; \
+    dsp->itxfm_add[idx][DCT_ADST]  = fn ## _add_c; \
+    dsp->itxfm_add[idx][ADST_DCT]  = fn ## _add_c; \
+    dsp->itxfm_add[idx][ADST_ADST] = fn ## _add_c
+
+    set_four(TX_4X4,   4x4);
+    set_four(TX_8X8,   8x8);
+    set_four(TX_16X16, 16x16);
+    set_same(TX_32X32, idct_idct_32x32);
+    set_same(4 /* lossless */, iwht_iwht_4x4);
+
+#undef set_four
+#undef set_same
+}
+
+static av_always_inline void loop_filter(uint8_t *dst, ptrdiff_t stride,
+                                         int E, int I, int H,
+                                         ptrdiff_t stridea, ptrdiff_t strideb,
+                                         int wd)
+{   /* E, I: mask thresholds; H: threshold of the hev test; wd gates the wide filters */
+    int i;
+    /* 8 positions along the edge: stridea steps along it, strideb across it. */
+    for (i = 0; i < 8; i++, dst += stridea) {
+        int p7, p6, p5, p4;
+        int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
+        int q4, q5, q6, q7;
+        int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
+                 FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
+                 FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
+                 FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
+        int flat8out, flat8in;
+        /* Filter mask: positions failing the I / E difference tests are left alone. */
+        if (!fm)
+            continue;
+        /* The outer samples and flat8out are only set (and only read) for wd >= 16. */
+        if (wd >= 16) {
+            p7 = dst[strideb * -8];
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+            q7 = dst[strideb * +7];
+
+            flat8out = FFABS(p7 - p0) <= 1 && FFABS(p6 - p0) <= 1 &&
+                       FFABS(p5 - p0) <= 1 && FFABS(p4 - p0) <= 1 &&
+                       FFABS(q4 - q0) <= 1 && FFABS(q5 - q0) <= 1 &&
+                       FFABS(q6 - q0) <= 1 && FFABS(q7 - q0) <= 1;
+        }
+        /* Likewise flat8in is only set (and only read) for wd >= 8. */
+        if (wd >= 8)
+            flat8in = FFABS(p3 - p0) <= 1 && FFABS(p2 - p0) <= 1 &&
+                      FFABS(p1 - p0) <= 1 && FFABS(q1 - q0) <= 1 &&
+                      FFABS(q2 - q0) <= 1 && FFABS(q3 - q0) <= 1;
+        /* Widest applicable filter first; all taps use the pre-filter p/q values. */
+        if (wd >= 16 && flat8out && flat8in) {
+            dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
+                                 p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
+                                 p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
+                                 p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
+                                 p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
+                                 p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                 p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                 q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                 q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
+            dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                 q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
+            dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                 q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
+                                 q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
+                                 q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
+                                 q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+            dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
+                                 q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) {
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else {
+            int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;
+            /* hev set: only p0/q0 are adjusted, and f includes the p1 - q1 term. */
+            if (hev) {
+                int f = av_clip_int8(3 * (q0 - p0) + av_clip_int8(p1 - q1));
+                int f1 = FFMIN(f + 4, 127) >> 3;
+                int f2 = FFMIN(f + 3, 127) >> 3;
+
+                dst[strideb * -1] = av_clip_uint8(p0 + f2);
+                dst[strideb * +0] = av_clip_uint8(q0 - f1);
+            } else {
+                int f = av_clip_int8(3 * (q0 - p0));
+                int f1 = FFMIN(f + 4, 127) >> 3;
+                int f2 = FFMIN(f + 3, 127) >> 3;
+
+                dst[strideb * -1] = av_clip_uint8(p0 + f2);
+                dst[strideb * +0] = av_clip_uint8(q0 - f1);
+                /* p1/q1 additionally move by (f1 + 1) >> 1. */
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = av_clip_uint8(p1 + f);
+                dst[strideb * +1] = av_clip_uint8(q1 - f);
+            }
+        }
+    }
+}
+
+/* Emit loop_filter_<d>_<width>_8_c(): a thin wrapper that forwards to
+ * loop_filter() with the two stride arguments and the width filled in. */
+#define lf_8_fn(d, width, sa, sb)                                           \
+static void loop_filter_ ## d ## _ ## width ## _8_c(uint8_t *dst,           \
+                                      ptrdiff_t stride, int E, int I, int H) \
+{                                                                           \
+    loop_filter(dst, stride, E, I, H, sa, sb, width);                       \
+}
+
+#define lf_8_fns(width)          \
+    lf_8_fn(h, width, stride, 1) \
+    lf_8_fn(v, width, 1, stride)
+
+lf_8_fns(4)
+lf_8_fns(8)
+lf_8_fns(16)
+#undef lf_8_fn
+#undef lf_8_fns
+
+/* Emit loop_filter_<d>_16_16_c(): runs the 16-wide, 8-sample filter twice,
+ * the second time with dst advanced by 8 * step. */
+#define lf_16_fn(d, step)                                                 \
+static void loop_filter_ ## d ## _16_16_c(uint8_t *dst, ptrdiff_t stride, \
+                                          int E, int I, int H)            \
+{                                                                         \
+    loop_filter_ ## d ## _16_8_c(dst,            stride, E, I, H);        \
+    loop_filter_ ## d ## _16_8_c(dst + 8 * step, stride, E, I, H);        \
+}
+
+lf_16_fn(h, stride)
+lf_16_fn(v, 1)
+#undef lf_16_fn
+
+/* Emit loop_filter_<d>_<a><b>_16_c(): filters two adjacent 8-sample halves
+ * with widths a and b.  E, I and H each carry two packed values: the low
+ * byte is used for the first half, the bits above it for the second. */
+#define lf_mix_fn(d, a, b, step)                                              \
+static void loop_filter_ ## d ## _ ## a ## b ## _16_c(uint8_t *dst,           \
+                                                      ptrdiff_t stride,       \
+                                                      int E, int I, int H)    \
+{                                                                             \
+    loop_filter_ ## d ## _ ## a ## _8_c(dst, stride,                          \
+                                        E & 0xff, I & 0xff, H & 0xff);        \
+    loop_filter_ ## d ## _ ## b ## _8_c(dst + 8 * step, stride,               \
+                                        E >> 8, I >> 8, H >> 8);              \
+}
+
+#define lf_mix_fns(a, b)       \
+    lf_mix_fn(h, a, b, stride) \
+    lf_mix_fn(v, a, b, 1)
+
+lf_mix_fns(4, 4)
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
+#undef lf_mix_fn
+#undef lf_mix_fns
+
+/* Install the C loop filters into dsp.  The last index of every table is
+ * the direction (0 = h, 1 = v); the leading indices select the width(s). */
+static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
+{
+    dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
+    dsp->loop_filter_16[1] = loop_filter_v_16_16_c;
+    dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
+    dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
+    dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
+    dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
+    dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
+    dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;
+    dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
+    dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
+    dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
+    dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
+    dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
+    dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
+    dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
+    dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
+}
+
+/* Copy h rows of w bytes each; row r is read at src + r * src_stride and
+ * written at dst + r * dst_stride.  The do/while form requires h >= 1. */
+static av_always_inline void copy_c(uint8_t *dst, const uint8_t *src,
+                                    ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                                    int w, int h)
+{
+    int row = 0;
+
+    do {
+        memcpy(dst + row * dst_stride, src + row * src_stride, w);
+    } while (++row != h);
+}
+
+/* Blend an h-row block of src into dst with rnd_avg32(), four bytes per
+ * step: dst goes through AV_RN32A/AV_WN32A, src through AV_RN32. */
+static av_always_inline void avg_c(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                                   int w, int h)
+{
+    do {
+        int off;
+        for (off = 0; off < w; off += 4)
+            AV_WN32A(dst + off,
+                     rnd_avg32(AV_RN32A(dst + off), AV_RN32(src + off)));
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+/* Emit <op><width>_c(dst, src, strides, h, mx, my): a full-pel function that
+ * calls <op>_c() with a fixed width; mx and my are accepted but unused. */
+#define fpel_fn(op, width)                                                \
+static void op ## width ## _c(uint8_t *dst, const uint8_t *src,           \
+                              ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                              int h, int mx, int my)                      \
+{                                                                         \
+    op ## _c(dst, src, dst_stride, src_stride, width, h);                 \
+}
+
+#define copy_avg_fn(width) \
+    fpel_fn(copy, width)   \
+    fpel_fn(avg, width)
+
+copy_avg_fn(64)
+copy_avg_fn(32)
+copy_avg_fn(16)
+copy_avg_fn(8)
+copy_avg_fn(4)
+#undef fpel_fn
+#undef copy_avg_fn
+
+static const int8_t vp9_subpel_filters[3][15][8] = { /* [type][frac - 1][tap] */
+    [FILTER_8TAP_REGULAR] = { /* every row of taps sums to 128 (see >> 7 use) */
+        {  0,  1,  -5, 126,   8,  -3,  1,  0 },
+        { -1,  3, -10, 122,  18,  -6,  2,  0 },
+        { -1,  4, -13, 118,  27,  -9,  3, -1 },
+        { -1,  4, -16, 112,  37, -11,  4, -1 },
+        { -1,  5, -18, 105,  48, -14,  4, -1 },
+        { -1,  5, -19,  97,  58, -16,  5, -1 },
+        { -1,  6, -19,  88,  68, -18,  5, -1 },
+        { -1,  6, -19,  78,  78, -19,  6, -1 },
+        { -1,  5, -18,  68,  88, -19,  6, -1 },
+        { -1,  5, -16,  58,  97, -19,  5, -1 },
+        { -1,  4, -14,  48, 105, -18,  5, -1 },
+        { -1,  4, -11,  37, 112, -16,  4, -1 },
+        { -1,  3,  -9,  27, 118, -13,  4, -1 },
+        {  0,  2,  -6,  18, 122, -10,  3, -1 },
+        {  0,  1,  -3,   8, 126,  -5,  1,  0 },
+    }, [FILTER_8TAP_SHARP] = { /* rows also sum to 128 */
+        { -1,  3,  -7, 127,   8,  -3,  1,  0 },
+        { -2,  5, -13, 125,  17,  -6,  3, -1 },
+        { -3,  7, -17, 121,  27, -10,  5, -2 },
+        { -4,  9, -20, 115,  37, -13,  6, -2 },
+        { -4, 10, -23, 108,  48, -16,  8, -3 },
+        { -4, 10, -24, 100,  59, -19,  9, -3 },
+        { -4, 11, -24,  90,  70, -21, 10, -4 },
+        { -4, 11, -23,  80,  80, -23, 11, -4 },
+        { -4, 10, -21,  70,  90, -24, 11, -4 },
+        { -3,  9, -19,  59, 100, -24, 10, -4 },
+        { -3,  8, -16,  48, 108, -23, 10, -4 },
+        { -2,  6, -13,  37, 115, -20,  9, -4 },
+        { -2,  5, -10,  27, 121, -17,  7, -3 },
+        { -1,  3,  -6,  17, 125, -13,  5, -2 },
+        {  0,  1,  -3,   8, 127,  -7,  3, -1 },
+    }, [FILTER_8TAP_SMOOTH] = { /* rows also sum to 128 */
+        { -3, -1,  32,  64,  38,   1, -3,  0 },
+        { -2, -2,  29,  63,  41,   2, -3,  0 },
+        { -2, -2,  26,  63,  43,   4, -4,  0 },
+        { -2, -3,  24,  62,  46,   5, -4,  0 },
+        { -2, -3,  21,  60,  49,   7, -4,  0 },
+        { -1, -4,  18,  59,  51,   9, -4,  0 },
+        { -1, -4,  16,  57,  53,  12, -4, -1 },
+        { -1, -4,  14,  55,  55,  14, -4, -1 },
+        { -1, -4,  12,  53,  57,  16, -4, -1 },
+        {  0, -4,   9,  51,  59,  18, -4, -1 },
+        {  0, -4,   7,  49,  60,  21, -3, -2 },
+        {  0, -4,   5,  46,  62,  24, -3, -2 },
+        {  0, -4,   4,  43,  63,  26, -2, -2 },
+        {  0, -3,   2,  41,  63,  29, -2, -2 },
+        {  0, -3,   1,  38,  64,  32, -1, -3 },
+    }
+};
+
+#define FILTER_8TAP(src, x, F, stride) /* 8-tap FIR, taps at -3..+4 strides */ \
+    av_clip_uint8(((F)[0] * (src)[(x) + -3 * (stride)] +                       \
+                   (F)[1] * (src)[(x) + -2 * (stride)] +                       \
+                   (F)[2] * (src)[(x) + -1 * (stride)] +                       \
+                   (F)[3] * (src)[(x) + +0 * (stride)] +                       \
+                   (F)[4] * (src)[(x) + +1 * (stride)] +                       \
+                   (F)[5] * (src)[(x) + +2 * (stride)] +                       \
+                   (F)[6] * (src)[(x) + +3 * (stride)] +                       \
+                   (F)[7] * (src)[(x) + +4 * (stride)] + 64) >> 7) /* round, clip to 8 bits */
+
+/* 1-D 8-tap filter of an h-row, w-wide block.  ds is the distance between
+ * taps (the callers pass 1 or the source stride); when avg is set the result
+ * is averaged, rounding up, with what dst already holds. */
+static av_always_inline void do_8tap_1d_c(uint8_t *dst, const uint8_t *src,
+                                          ptrdiff_t dst_stride,
+                                          ptrdiff_t src_stride,
+                                          int w, int h, ptrdiff_t ds,
+                                          const int8_t *filter, int avg)
+{
+    do {
+        int col;
+        for (col = 0; col < w; col++) {
+            int px = FILTER_8TAP(src, col, filter, ds);
+            dst[col] = avg ? (dst[col] + px + 1) >> 1 : px;
+        }
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+/* Emit <name>_8tap_1d_<d>_c(): a non-inlined instance of do_8tap_1d_c()
+ * with the tap distance and the averaging flag fixed. */
+#define filter_8tap_1d_fn(name, do_avg, d, tap_step)                        \
+static av_noinline void name ## _8tap_1d_ ## d ## _c(uint8_t *dst,          \
+                            const uint8_t *src,                             \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride,     \
+                            int w, int h, const int8_t *filter)             \
+{                                                                           \
+    do_8tap_1d_c(dst, src, dst_stride, src_stride,                          \
+                 w, h, tap_step, filter, do_avg);                           \
+}
+
+filter_8tap_1d_fn(put, 0, h, 1)
+filter_8tap_1d_fn(put, 0, v, src_stride)
+filter_8tap_1d_fn(avg, 1, h, 1)
+filter_8tap_1d_fn(avg, 1, v, src_stride)
+#undef filter_8tap_1d_fn
+
+/* Separable 2-D 8-tap filter.  Pass 1 filters h + 7 source rows (3 above and
+ * 4 below the block) horizontally into a scratch buffer with a 64-byte pitch;
+ * pass 2 filters that buffer vertically into dst, optionally averaging with
+ * the existing dst pixels.  The scratch size limits blocks to 64x64. */
+static av_always_inline void do_8tap_2d_c(uint8_t *dst, const uint8_t *src,
+                                          ptrdiff_t dst_stride,
+                                          ptrdiff_t src_stride,
+                                          int w, int h, const int8_t *filterx,
+                                          const int8_t *filtery, int avg)
+{
+    uint8_t scratch[64 * 71];
+    uint8_t *row = scratch;
+    int rows_left = h + 7, col;
+
+    /* horizontal pass, starting three rows above the block */
+    src -= src_stride * 3;
+    do {
+        for (col = 0; col < w; col++)
+            row[col] = FILTER_8TAP(src, col, filterx, 1);
+        row += 64;
+        src += src_stride;
+    } while (--rows_left);
+
+    /* vertical pass; scratch row 3 lines up with the first output row */
+    row = scratch + 64 * 3;
+    do {
+        for (col = 0; col < w; col++) {
+            int px = FILTER_8TAP(row, col, filtery, 64);
+            dst[col] = avg ? (dst[col] + px + 1) >> 1 : px;
+        }
+        row += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+/* Emit <name>_8tap_2d_hv_c(): a non-inlined instance of do_8tap_2d_c()
+ * with the averaging flag fixed to do_avg. */
+#define filter_8tap_2d_fn(name, do_avg)                                   \
+static av_noinline void name ## _8tap_2d_hv_c(uint8_t *dst,               \
+                            const uint8_t *src,                           \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride,   \
+                            int w, int h,                                 \
+                            const int8_t *filterx, const int8_t *filtery) \
+{                                                                         \
+    do_8tap_2d_c(dst, src, dst_stride, src_stride, w, h,                  \
+                 filterx, filtery, do_avg);                               \
+}
+
+filter_8tap_2d_fn(put, 0)
+filter_8tap_2d_fn(avg, 1)
+
+#undef filter_8tap_2d_fn
+
+#undef FILTER_8TAP
+
+/* Emit <op>_8tap_<tname>_<width><d>_c(): fixed-width 1-D sub-pel function.
+ * frac is the mx or my argument; frac - 1 selects the filter row. */
+#define filter_fn_1d(width, d, frac, tname, tidx, op)                       \
+static void op ## _8tap_ ## tname ## _ ## width ## d ## _c(uint8_t *dst,    \
+                            const uint8_t *src,                             \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride,     \
+                            int h, int mx, int my)                          \
+{                                                                           \
+    op ## _8tap_1d_ ## d ## _c(dst, src, dst_stride, src_stride, width, h,  \
+                               vp9_subpel_filters[tidx][frac - 1]);         \
+}
+
+/* Emit <op>_8tap_<tname>_<width>hv_c(): the 2-D counterpart, using filter
+ * row mx - 1 horizontally and row my - 1 vertically. */
+#define filter_fn_2d(width, tname, tidx, op)                                \
+static void op ## _8tap_ ## tname ## _ ## width ## hv_c(uint8_t *dst,       \
+                            const uint8_t *src, ptrdiff_t dst_stride,       \
+                            ptrdiff_t src_stride, int h, int mx, int my)    \
+{                                                                           \
+    op ## _8tap_2d_hv_c(dst, src, dst_stride, src_stride, width, h,         \
+                        vp9_subpel_filters[tidx][mx - 1],                   \
+                        vp9_subpel_filters[tidx][my - 1]);                  \
+}
+
+#define FILTER_BILIN(src, x, mxy, stride) /* lerp, weight mxy / 16, rounded */ \
+    ((src)[x] + (((mxy) * ((src)[(x) + (stride)] - (src)[x]) + 8) >> 4))
+
+/* 1-D bilinear interpolation of an h-row, w-wide block between src[x] and
+ * src[x + ds] with weight mxy / 16; when avg is set the result is averaged,
+ * rounding up, with what dst already holds. */
+static av_always_inline void do_bilin_1d_c(uint8_t *dst,
+                                           const uint8_t *src,
+                                           ptrdiff_t dst_stride,
+                                           ptrdiff_t src_stride,
+                                           int w, int h, ptrdiff_t ds,
+                                           int mxy, int avg)
+{
+    do {
+        int col;
+        for (col = 0; col < w; col++) {
+            int px = FILTER_BILIN(src, col, mxy, ds);
+            dst[col] = avg ? (dst[col] + px + 1) >> 1 : px;
+        }
+        dst += dst_stride;
+        src += src_stride;
+    } while (--h);
+}
+
+/* Emit <name>_bilin_1d_<d>_c(): do_bilin_1d_c() with step and do_avg fixed. */
+#define bilin_1d_fn(name, do_avg, d, step)                                \
+static av_noinline void name ## _bilin_1d_ ## d ## _c(uint8_t *dst,       \
+                            const uint8_t *src,                           \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride,   \
+                            int w, int h, int mxy)                        \
+{                                                                         \
+    do_bilin_1d_c(dst, src, dst_stride, src_stride,                       \
+                  w, h, step, mxy, do_avg);                               \
+}
+
+bilin_1d_fn(put, 0, h, 1)
+bilin_1d_fn(put, 0, v, src_stride)
+bilin_1d_fn(avg, 1, h, 1)
+bilin_1d_fn(avg, 1, v, src_stride)
+#undef bilin_1d_fn
+
+/* Separable 2-D bilinear filter.  Pass 1 interpolates h + 1 source rows
+ * horizontally (weight mx / 16) into a scratch buffer with a 64-byte pitch;
+ * pass 2 interpolates between consecutive scratch rows (weight my / 16) into
+ * dst, optionally averaging with the existing dst pixels. */
+static av_always_inline void do_bilin_2d_c(uint8_t *dst,
+                                           const uint8_t *src,
+                                           ptrdiff_t dst_stride,
+                                           ptrdiff_t src_stride,
+                                           int w, int h, int mx, int my,
+                                           int avg)
+{
+    uint8_t scratch[64 * 65];
+    uint8_t *row = scratch;
+    int rows_left = h + 1, col;
+
+    /* horizontal pass: the extra row feeds the vertical interpolation */
+    do {
+        for (col = 0; col < w; col++)
+            row[col] = FILTER_BILIN(src, col, mx, 1);
+        row += 64;
+        src += src_stride;
+    } while (--rows_left);
+
+    /* vertical pass over the scratch rows */
+    row = scratch;
+    do {
+        for (col = 0; col < w; col++) {
+            int px = FILTER_BILIN(row, col, my, 64);
+            dst[col] = avg ? (dst[col] + px + 1) >> 1 : px;
+        }
+        row += 64;
+        dst += dst_stride;
+    } while (--h);
+}
+
+/* Emit <name>_bilin_2d_hv_c(): a non-inlined instance of do_bilin_2d_c()
+ * with the averaging flag fixed to do_avg. */
+#define bilin_2d_fn(name, do_avg)                                       \
+static av_noinline void name ## _bilin_2d_hv_c(uint8_t *dst,            \
+                            const uint8_t *src,                         \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                            int w, int h, int mx, int my)               \
+{                                                                       \
+    do_bilin_2d_c(dst, src, dst_stride, src_stride,                     \
+                  w, h, mx, my, do_avg);                                \
+}
+
+bilin_2d_fn(put, 0)
+bilin_2d_fn(avg, 1)
+#undef bilin_2d_fn
+
+#undef FILTER_BILIN
+
+/* Emit <op>_bilin_<width><d>_c(): fixed-width 1-D bilinear function; frac
+ * is the mx or my argument handed on as the interpolation weight. */
+#define bilinf_fn_1d(width, d, frac, op)                                \
+static void op ## _bilin_ ## width ## d ## _c(uint8_t *dst,             \
+                            const uint8_t *src,                         \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                            int h, int mx, int my)                      \
+{                                                                       \
+    op ## _bilin_1d_ ## d ## _c(dst, src, dst_stride, src_stride,       \
+                                width, h, frac);                        \
+}
+
+/* Emit <op>_bilin_<width>hv_c(): the fixed-width 2-D counterpart. */
+#define bilinf_fn_2d(width, op)                                         \
+static void op ## _bilin_ ## width ## hv_c(uint8_t *dst,                \
+                            const uint8_t *src,                         \
+                            ptrdiff_t dst_stride, ptrdiff_t src_stride, \
+                            int h, int mx, int my)                      \
+{                                                                       \
+    op ## _bilin_2d_hv_c(dst, src, dst_stride, src_stride, width, h, mx, my); \
+}
+
+/* Emit the h, v and hv functions of every filter type for one width. */
+#define filter_fn(width, op)                                     \
+    filter_fn_1d(width, h, mx, regular, FILTER_8TAP_REGULAR, op) \
+    filter_fn_1d(width, v, my, regular, FILTER_8TAP_REGULAR, op) \
+    filter_fn_2d(width, regular, FILTER_8TAP_REGULAR, op)        \
+    filter_fn_1d(width, h, mx, sharp, FILTER_8TAP_SHARP, op)     \
+    filter_fn_1d(width, v, my, sharp, FILTER_8TAP_SHARP, op)     \
+    filter_fn_2d(width, sharp, FILTER_8TAP_SHARP, op)            \
+    filter_fn_1d(width, h, mx, smooth, FILTER_8TAP_SMOOTH, op)   \
+    filter_fn_1d(width, v, my, smooth, FILTER_8TAP_SMOOTH, op)   \
+    filter_fn_2d(width, smooth, FILTER_8TAP_SMOOTH, op)          \
+    bilinf_fn_1d(width, h, mx, op)                               \
+    bilinf_fn_1d(width, v, my, op)                               \
+    bilinf_fn_2d(width, op)
+
+#define filter_fn_set(op) \
+    filter_fn(64, op)     \
+    filter_fn(32, op)     \
+    filter_fn(16, op)     \
+    filter_fn(8, op)      \
+    filter_fn(4, op)
+
+filter_fn_set(put)
+filter_fn_set(avg)
+#undef filter_fn_set
+#undef filter_fn
+#undef filter_fn_1d
+#undef filter_fn_2d
+#undef bilinf_fn_1d
+#undef bilinf_fn_2d
+
+/* Fill dsp->mc[size][filter][op][h][v] with the C motion-compensation
+ * functions: size 0..4 is 64..4 wide, op is 0 for put/copy and 1 for avg,
+ * and h / v are 1 when that direction is sub-pel filtered. */
+static av_cold void vp9dsp_mc_init(VP9DSPContext *dsp)
+{
+#define init_fpel(size_idx, op_idx, width, op)                                \
+    dsp->mc[size_idx][FILTER_8TAP_REGULAR][op_idx][0][0] = op ## width ## _c; \
+    dsp->mc[size_idx][FILTER_8TAP_SHARP][op_idx][0][0]   = op ## width ## _c; \
+    dsp->mc[size_idx][FILTER_8TAP_SMOOTH][op_idx][0][0]  = op ## width ## _c; \
+    dsp->mc[size_idx][FILTER_BILINEAR][op_idx][0][0]     = op ## width ## _c
+#define init_copy_avg(size_idx, width)   \
+    init_fpel(size_idx, 0, width, copy); \
+    init_fpel(size_idx, 1, width, avg)
+
+    /* full-pel: every filter type shares the plain copy / avg function */
+    init_copy_avg(0, 64);
+    init_copy_avg(1, 32);
+    init_copy_avg(2, 16);
+    init_copy_avg(3,  8);
+    init_copy_avg(4,  4);
+#undef init_copy_avg
+#undef init_fpel
+
+#define init_subpel1(size_idx, op_idx, hfilt, vfilt, width, d, op)                                           \
+    dsp->mc[size_idx][FILTER_8TAP_REGULAR][op_idx][hfilt][vfilt] = op ## _8tap_regular_ ## width ## d ## _c; \
+    dsp->mc[size_idx][FILTER_8TAP_SHARP][op_idx][hfilt][vfilt]   = op ## _8tap_sharp_   ## width ## d ## _c; \
+    dsp->mc[size_idx][FILTER_8TAP_SMOOTH][op_idx][hfilt][vfilt]  = op ## _8tap_smooth_  ## width ## d ## _c; \
+    dsp->mc[size_idx][FILTER_BILINEAR][op_idx][hfilt][vfilt]     = op ## _bilin_        ## width ## d ## _c
+#define init_subpel2(op_idx, hfilt, vfilt, d, op)     \
+    init_subpel1(0, op_idx, hfilt, vfilt, 64, d, op); \
+    init_subpel1(1, op_idx, hfilt, vfilt, 32, d, op); \
+    init_subpel1(2, op_idx, hfilt, vfilt, 16, d, op); \
+    init_subpel1(3, op_idx, hfilt, vfilt,  8, d, op); \
+    init_subpel1(4, op_idx, hfilt, vfilt,  4, d, op)
+#define init_subpel3(op_idx, op)        \
+    init_subpel2(op_idx, 1, 0, h,  op); \
+    init_subpel2(op_idx, 0, 1, v,  op); \
+    init_subpel2(op_idx, 1, 1, hv, op)
+
+    /* sub-pel: one function per filter type, direction and width */
+    init_subpel3(0, put);
+    init_subpel3(1, avg);
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+}
+
+av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
+{
+    vp9dsp_intrapred_init(dsp);  /* per-area init helpers fill dsp first */
+    vp9dsp_itxfm_init(dsp);
+    vp9dsp_loopfilter_init(dsp); /* installs the loop_filter_*_c functions */
+    vp9dsp_mc_init(dsp);         /* installs the copy/avg/8tap/bilin _c functions */
+    /* x86 init runs last; presumably it overrides pointers set above - confirm */
+    if (ARCH_X86)
+        ff_vp9dsp_init_x86(dsp);
+}
diff --git a/libavcodec/vp9mvs.c b/libavcodec/vp9mvs.c
new file mode 100644
index 0000000000..2f37755046
--- /dev/null
+++ b/libavcodec/vp9mvs.c
@@ -0,0 +1,344 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "internal.h"
+#include "vp56.h"
+#include "vp9.h"
+#include "vp9data.h"
+
+/* Clip each component of *src to [s->min_mv, s->max_mv]; result in *dst. */
+static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src, VP9Context *s)
+{
+    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
+    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
+}
+
+/* Predict a motion vector for reference frame ref and store it in *pmv.
+ * With idx == 0 the first candidate wins, otherwise the first candidate that
+ * differs from the first one found; if none is selected *pmv is zeroed. */
+static void find_ref_mvs(VP9Context *s,
+                         VP56mv *pmv, int ref, int z, int idx, int sb)
+{
+    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = { /* col, row */
+        [BS_64x64] = { {  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
+                       { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 } },
+        [BS_64x32] = { {  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
+                       { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 } },
+        [BS_32x64] = { { -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
+                       { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 } },
+        [BS_32x32] = { {  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
+                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
+        [BS_32x16] = { {  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
+                       { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
+        [BS_16x32] = { { -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
+                       {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 } },
+        [BS_16x16] = { {  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
+                       { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 } },
+        [BS_16x8]  = { {  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
+                       {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 } },
+        [BS_8x16]  = { { -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
+                       { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 } },
+        [BS_8x8]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
+        [BS_8x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
+        [BS_4x8]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
+        [BS_4x4]   = { {  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
+                       { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 } },
+    };
+    VP9Block *const b = &s->b;
+    int row = b->row, col = b->col, row7 = b->row7;
+    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
+#define INVALID_MV 0x80008000U
+    uint32_t mem = INVALID_MV; /* first candidate seen when idx != 0 */
+    int i;
+    /* candidate taken as is (never clamped), selected by the rule above */
+#define RETURN_DIRECT_MV(mv)                    \
+    do {                                        \
+        uint32_t m = AV_RN32A(&mv);             \
+        if (!idx) {                             \
+            AV_WN32A(pmv, m);                   \
+            return;                             \
+        } else if (mem == INVALID_MV) {         \
+            mem = m;                            \
+        } else if (m != mem) {                  \
+            AV_WN32A(pmv, m);                   \
+            return;                             \
+        }                                       \
+    } while (0)
+
+    if (sb >= 0) { /* MVs already stored in b->mv[..][z] are tried first */
+        if (sb == 2 || sb == 1) {
+            RETURN_DIRECT_MV(b->mv[0][z]);
+        } else if (sb == 3) {
+            RETURN_DIRECT_MV(b->mv[2][z]);
+            RETURN_DIRECT_MV(b->mv[1][z]);
+            RETURN_DIRECT_MV(b->mv[0][z]);
+        }
+        /* for sb > 0 clamp before comparing; else compare raw, clamp on return */
+#define RETURN_MV(mv)                           \
+    do {                                        \
+        if (sb > 0) {                           \
+            VP56mv tmp;                         \
+            uint32_t m;                         \
+            clamp_mv(&tmp, &mv, s);             \
+            m = AV_RN32A(&tmp);                 \
+            if (!idx) {                         \
+                AV_WN32A(pmv, m);               \
+                return;                         \
+            } else if (mem == INVALID_MV) {     \
+                mem = m;                        \
+            } else if (m != mem) {              \
+                AV_WN32A(pmv, m);               \
+                return;                         \
+            }                                   \
+        } else {                                \
+            uint32_t m = AV_RN32A(&mv);         \
+            if (!idx) {                         \
+                clamp_mv(pmv, &mv, s);          \
+                return;                         \
+            } else if (mem == INVALID_MV) {     \
+                mem = m;                        \
+            } else if (m != mem) {              \
+                clamp_mv(pmv, &mv, s);          \
+                return;                         \
+            }                                   \
+        }                                       \
+    } while (0)
+
+        if (row > 0) {
+            VP9MVRefPair *mv = &s->mv[0][(row - 1) * s->sb_cols * 8 + col];
+
+            if (mv->ref[0] == ref)
+                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
+            else if (mv->ref[1] == ref)
+                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
+        }
+        if (col > s->tiling.tile_col_start) {
+            VP9MVRefPair *mv = &s->mv[0][row * s->sb_cols * 8 + col - 1];
+
+            if (mv->ref[0] == ref)
+                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
+            else if (mv->ref[1] == ref)
+                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
+        }
+        i = 2; /* the scan below then skips the first two table offsets */
+    } else {
+        i = 0;
+    }
+    // previously coded MVs in the neighborhood, using same reference frame
+    for (; i < 8; i++) {
+        int c = p[i][0] + col, r = p[i][1] + row;
+
+        if (c >= s->tiling.tile_col_start && c < s->cols &&
+            r >= 0 && r < s->rows) {
+            VP9MVRefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
+
+            if (mv->ref[0] == ref)
+                RETURN_MV(mv->mv[0]);
+            else if (mv->ref[1] == ref)
+                RETURN_MV(mv->mv[1]);
+        }
+    }
+    // MV at this position in previous frame, using same reference frame
+    if (s->use_last_frame_mvs) {
+        VP9MVRefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
+
+        if (mv->ref[0] == ref)
+            RETURN_MV(mv->mv[0]);
+        else if (mv->ref[1] == ref)
+            RETURN_MV(mv->mv[1]);
+    }
+
+#define RETURN_SCALE_MV(mv, scale)              \
+    do {                                        \
+        if (scale) {                            \
+            VP56mv mv_temp = { -mv.x, -mv.y };  \
+            RETURN_MV(mv_temp);                 \
+        } else {                                \
+            RETURN_MV(mv);                      \
+        }                                       \
+    } while (0)
+    // previously coded MVs in the neighborhood, using different reference frame
+    for (i = 0; i < 8; i++) {
+        int c = p[i][0] + col, r = p[i][1] + row;
+
+        if (c >= s->tiling.tile_col_start && c < s->cols &&
+            r >= 0 && r < s->rows) {
+            VP9MVRefPair *mv = &s->mv[0][r * s->sb_cols * 8 + c];
+
+            if (mv->ref[0] != ref && mv->ref[0] >= 0)
+                RETURN_SCALE_MV(mv->mv[0],
+                                s->signbias[mv->ref[0]] != s->signbias[ref]);
+            if (mv->ref[1] != ref && mv->ref[1] >= 0)
+                RETURN_SCALE_MV(mv->mv[1],
+                                s->signbias[mv->ref[1]] != s->signbias[ref]);
+        }
+    }
+    // MV at this position in previous frame, using different reference frame
+    if (s->use_last_frame_mvs) {
+        VP9MVRefPair *mv = &s->mv[1][row * s->sb_cols * 8 + col];
+
+        if (mv->ref[0] != ref && mv->ref[0] >= 0)
+            RETURN_SCALE_MV(mv->mv[0],
+                            s->signbias[mv->ref[0]] != s->signbias[ref]);
+        if (mv->ref[1] != ref && mv->ref[1] >= 0)
+            RETURN_SCALE_MV(mv->mv[1],
+                            s->signbias[mv->ref[1]] != s->signbias[ref]);
+    }
+    /* no candidate was selected above: predict the zero vector */
+    AV_ZERO32(pmv);
+#undef INVALID_MV
+#undef RETURN_DIRECT_MV
+#undef RETURN_MV
+#undef RETURN_SCALE_MV
+}
+
+static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
+{   /* decode one signed MV component with the mv_comp[idx] probabilities */
+    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
+    int n, c = vp8_rac_get_tree(&s->c, ff_vp9_mv_class_tree,
+                                s->prob.p.mv_comp[idx].classes);
+    /* every decoded symbol is also tallied in s->counts.mv_comp[idx] */
+    s->counts.mv_comp[idx].sign[sign]++;
+    s->counts.mv_comp[idx].classes[c]++;
+    if (c) { /* class > 0: c explicit bits, then the fp symbol, then hp */
+        int m;
+
+        for (n = 0, m = 0; m < c; m++) {
+            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
+            n  |= bit << m;
+            s->counts.mv_comp[idx].bits[m][bit]++;
+        }
+        n <<= 3; /* make room for the fp symbol (bits 2..1) and hp (bit 0) */
+        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
+                               s->prob.p.mv_comp[idx].fp);
+        n  |= bit << 1;
+        s->counts.mv_comp[idx].fp[bit]++;
+        if (hp) {
+            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
+            s->counts.mv_comp[idx].hp[bit]++;
+            n |= bit;
+        } else {
+            n |= 1;
+            // bug in libvpx - we count for bw entropy purposes even if the
+            // bit wasn't coded
+            s->counts.mv_comp[idx].hp[1]++;
+        }
+        n += 8 << c; /* add the base value of class c */
+    } else { /* class 0: a single class0 bit replaces the bit loop */
+        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
+        s->counts.mv_comp[idx].class0[n]++;
+        bit = vp8_rac_get_tree(&s->c, ff_vp9_mv_fp_tree,
+                               s->prob.p.mv_comp[idx].class0_fp[n]);
+        s->counts.mv_comp[idx].class0_fp[n][bit]++;
+        n = (n << 3) | (bit << 1);
+        if (hp) {
+            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
+            s->counts.mv_comp[idx].class0_hp[bit]++;
+            n |= bit;
+        } else {
+            n |= 1;
+            // bug in libvpx - we count for bw entropy purposes even if the
+            // bit wasn't coded
+            s->counts.mv_comp[idx].class0_hp[1]++;
+        }
+    }
+    /* the magnitude is n + 1, so the returned component is never zero */
+    return sign ? -(n + 1) : (n + 1);
+}
+
+void ff_vp9_fill_mv(VP9Context *s, VP56mv *mv, int mode, int sb)
+{
+    VP9Block *const b = &s->b;
+    // Fill mv[0] (and mv[1] when b->comp is set) for the given inter mode; NEWMV adds a coded delta.
+    if (mode == ZEROMV) {
+        memset(mv, 0, sizeof(*mv) * 2); // clears both entries, whatever b->comp is
+    } else {
+        int hp;
+        // hp is assigned in the condition below whenever mode == NEWMV, the only case that reads it
+        // FIXME cache this value and reuse for other subblocks
+        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
+                     mode == NEWMV ? -1 : sb);
+        // FIXME maybe move this code into find_ref_mvs()
+        if ((mode == NEWMV || sb == -1) &&
+            !(hp = s->highprecisionmvs &&
+              abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
+            if (mv[0].y & 1) { // hp == 0: move odd components one step towards zero
+                if (mv[0].y < 0)
+                    mv[0].y++;
+                else
+                    mv[0].y--;
+            }
+            if (mv[0].x & 1) {
+                if (mv[0].x < 0)
+                    mv[0].x++;
+                else
+                    mv[0].x--;
+            }
+        }
+        if (mode == NEWMV) {
+            enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
+                                              s->prob.p.mv_joint);
+            // the joint says which of the two components carry a coded delta
+            s->counts.mv_joint[j]++;
+            if (j >= MV_JOINT_V)
+                mv[0].y += read_mv_component(s, 0, hp);
+            if (j & 1)
+                mv[0].x += read_mv_component(s, 1, hp);
+        }
+        // b->comp set: repeat the same steps for the second MV, using ref[1]
+        if (b->comp) {
+            // FIXME cache this value and reuse for other subblocks
+            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
+                         mode == NEWMV ? -1 : sb);
+            if ((mode == NEWMV || sb == -1) &&
+                !(hp = s->highprecisionmvs &&
+                  abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
+                if (mv[1].y & 1) { // hp == 0: move odd components one step towards zero
+                    if (mv[1].y < 0)
+                        mv[1].y++;
+                    else
+                        mv[1].y--;
+                }
+                if (mv[1].x & 1) {
+                    if (mv[1].x < 0)
+                        mv[1].x++;
+                    else
+                        mv[1].x--;
+                }
+            }
+            if (mode == NEWMV) {
+                enum MVJoint j = vp8_rac_get_tree(&s->c, ff_vp9_mv_joint_tree,
+                                                  s->prob.p.mv_joint);
+                // the joint says which of the two components carry a coded delta
+                s->counts.mv_joint[j]++;
+                if (j >= MV_JOINT_V)
+                    mv[1].y += read_mv_component(s, 0, hp);
+                if (j & 1)
+                    mv[1].x += read_mv_component(s, 1, hp);
+            }
+        }
+    }
+}
diff --git a/libavcodec/vp9prob.c b/libavcodec/vp9prob.c
new file mode 100644
index 0000000000..b8a7c22af4
--- /dev/null
+++ b/libavcodec/vp9prob.c
@@ -0,0 +1,274 @@
+/*
+ * VP9 compatible video decoder
+ *
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vp56.h"
+#include "vp9.h"
+#include "vp9data.h"
+
+static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
+                                        int max_count, int update_factor)
+{
+    unsigned ct = ct0 + ct1, p2, p1;
+    // Blend *p towards the observed frequency ct0 / (ct0 + ct1), expressed in 1/256 units.
+    if (!ct)
+        return; // symbol never seen: keep the old probability
+    // 64-bit numerator: ct0 << 8 would wrap in 32 bits once ct0 reaches 1 << 24
+    p1 = *p;
+    p2 = (((uint64_t) ct0 << 8) + (ct >> 1)) / ct;
+    p2 = av_clip(p2, 1, 255);
+    ct = FFMIN(ct, max_count); // fewer than max_count samples: proportionally smaller update
+    update_factor = FASTDIV(update_factor * ct, max_count);
+    // p2 - p1 may wrap as unsigned; the uint8_t store discards the wrapped high bits
+    // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
+    *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
+}
+
+void ff_vp9_adapt_probs(VP9Context *s)
+{
+    int i, j, k, l, m;
+    ProbContext *p = &s->prob_ctx[s->framectxid].p;
+    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
+    // Fold the symbol counts gathered in s->counts into the probabilities of prob_ctx[framectxid].
+    // coefficients (the only probabilities using uf and a count cap of 24)
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 2; j++)
+            for (k = 0; k < 2; k++)
+                for (l = 0; l < 6; l++)
+                    for (m = 0; m < 6; m++) {
+                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
+                        unsigned *e = s->counts.eob[i][j][k][l][m];
+                        unsigned *c = s->counts.coef[i][j][k][l][m];
+
+                        if (l == 0 && m >= 3) // dc only has 3 pt
+                            break;
+
+                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
+                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
+                        adapt_prob(&pp[2], c[1], c[2], 24, uf);
+                    }
+    // keyframes / intra-only frames: copy skip and tx probabilities from s->prob.p unadapted, then stop
+    if (s->keyframe || s->intraonly) {
+        memcpy(p->skip,  s->prob.p.skip,  sizeof(p->skip));
+        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
+        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
+        memcpy(p->tx8p,  s->prob.p.tx8p,  sizeof(p->tx8p));
+        return;
+    }
+
+    // skip flag (from here on: count cap 20, update factor 128)
+    for (i = 0; i < 3; i++)
+        adapt_prob(&p->skip[i], s->counts.skip[i][0],
+                   s->counts.skip[i][1], 20, 128);
+
+    // intra/inter flag
+    for (i = 0; i < 4; i++)
+        adapt_prob(&p->intra[i], s->counts.intra[i][0],
+                   s->counts.intra[i][1], 20, 128);
+
+    // comppred flag, only adapted in PRED_SWITCHABLE mode
+    if (s->comppredmode == PRED_SWITCHABLE) {
+        for (i = 0; i < 5; i++)
+            adapt_prob(&p->comp[i], s->counts.comp[i][0],
+                       s->counts.comp[i][1], 20, 128);
+    }
+
+    // reference frames: comp_ref unless PRED_SINGLEREF, single_ref unless PRED_COMPREF
+    if (s->comppredmode != PRED_SINGLEREF) {
+        for (i = 0; i < 5; i++)
+            adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
+                       s->counts.comp_ref[i][1], 20, 128);
+    }
+
+    if (s->comppredmode != PRED_COMPREF) {
+        for (i = 0; i < 5; i++) {
+            uint8_t *pp = p->single_ref[i];
+            unsigned (*c)[2] = s->counts.single_ref[i];
+
+            adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
+            adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
+        }
+    }
+
+    // block partitioning: 4 symbols, each probability splits one symbol off from the remaining ones
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 4; j++) {
+            uint8_t *pp = p->partition[i][j];
+            unsigned *c = s->counts.partition[i][j];
+
+            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+            adapt_prob(&pp[2], c[2], c[3], 20, 128);
+        }
+
+    // tx size, only adapted in TX_SWITCHABLE mode
+    if (s->txfmmode == TX_SWITCHABLE) {
+        for (i = 0; i < 2; i++) {
+            unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
+
+            adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0],
+                       s->counts.tx8p[i][1], 20, 128);
+            adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
+            adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
+            adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
+            adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
+            adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
+        }
+    }
+
+    // interpolation filter, only adapted in FILTER_SWITCHABLE mode
+    if (s->filtermode == FILTER_SWITCHABLE) {
+        for (i = 0; i < 4; i++) {
+            uint8_t *pp = p->filter[i];
+            unsigned *c = s->counts.filter[i];
+
+            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2], 20, 128);
+        }
+    }
+
+    // inter modes (symbols are split off in counts order 2, 0, 1, 3)
+    for (i = 0; i < 7; i++) {
+        uint8_t *pp = p->mv_mode[i];
+        unsigned *c = s->counts.mv_mode[i];
+
+        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[1], c[3], 20, 128);
+    }
+
+    // mv joints
+    {
+        uint8_t *pp = p->mv_joint;
+        unsigned *c = s->counts.mv_joint;
+
+        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[2], c[3], 20, 128);
+    }
+
+    // mv components (i indexes mv_comp[], the same index read_mv_component() gets as idx)
+    for (i = 0; i < 2; i++) {
+        uint8_t *pp;
+        unsigned *c, (*c2)[2], sum;
+
+        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
+                   s->counts.mv_comp[i].sign[1], 20, 128);
+        // classes: 11 symbols; sum tracks the total count of the symbols not yet split off
+        pp  = p->mv_comp[i].classes;
+        c   = s->counts.mv_comp[i].classes;
+        sum = c[1] + c[2] + c[3] + c[4] + c[5] +
+              c[6] + c[7] + c[8] + c[9] + c[10];
+        adapt_prob(&pp[0], c[0], sum, 20, 128);
+        sum -= c[1];
+        adapt_prob(&pp[1], c[1], sum, 20, 128);
+        sum -= c[2] + c[3];
+        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
+        adapt_prob(&pp[3], c[2], c[3], 20, 128);
+        sum -= c[4] + c[5];
+        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
+        adapt_prob(&pp[5], c[4], c[5], 20, 128);
+        sum -= c[6];
+        adapt_prob(&pp[6], c[6], sum, 20, 128);
+        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
+        adapt_prob(&pp[8], c[7], c[8], 20, 128);
+        adapt_prob(&pp[9], c[9], c[10], 20, 128);
+
+        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
+                   s->counts.mv_comp[i].class0[1], 20, 128);
+        pp = p->mv_comp[i].bits;
+        c2 = s->counts.mv_comp[i].bits;
+        for (j = 0; j < 10; j++)
+            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
+        // fp values: 4 symbols each, once per class0 bit value and once for the other classes
+        for (j = 0; j < 2; j++) {
+            pp = p->mv_comp[i].class0_fp[j];
+            c  = s->counts.mv_comp[i].class0_fp[j];
+            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+            adapt_prob(&pp[2], c[2], c[3], 20, 128);
+        }
+        pp = p->mv_comp[i].fp;
+        c  = s->counts.mv_comp[i].fp;
+        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
+        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
+        adapt_prob(&pp[2], c[2], c[3], 20, 128);
+        // hp probabilities are left untouched unless highprecisionmvs is set
+        if (s->highprecisionmvs) {
+            adapt_prob(&p->mv_comp[i].class0_hp,
+                       s->counts.mv_comp[i].class0_hp[0],
+                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
+            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
+                       s->counts.mv_comp[i].hp[1], 20, 128);
+        }
+    }
+
+    // y intra modes
+    for (i = 0; i < 4; i++) {
+        uint8_t *pp = p->y_mode[i];
+        unsigned *c = s->counts.y_mode[i], sum, s2;
+        // sum skips c[2], presumably DC_PRED (the first symbol split off) - confirm against the enum
+        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
+        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
+        sum -= c[TM_VP8_PRED];
+        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
+        sum -= c[VERT_PRED];
+        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
+        s2   = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
+        sum -= s2;
+        adapt_prob(&pp[3], s2, sum, 20, 128);
+        s2 -= c[HOR_PRED];
+        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
+        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED],
+                   20, 128);
+        sum -= c[DIAG_DOWN_LEFT_PRED];
+        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
+        sum -= c[VERT_LEFT_PRED];
+        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
+        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
+    }
+
+    // uv intra modes (same split order as the y modes, with 10 probability sets)
+    for (i = 0; i < 10; i++) {
+        uint8_t *pp = p->uv_mode[i];
+        unsigned *c = s->counts.uv_mode[i], sum, s2;
+        // sum skips c[2], presumably DC_PRED (the first symbol split off) - confirm against the enum
+        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
+        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
+        sum -= c[TM_VP8_PRED];
+        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
+        sum -= c[VERT_PRED];
+        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
+        s2   = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
+        sum -= s2;
+        adapt_prob(&pp[3], s2, sum, 20, 128);
+        s2 -= c[HOR_PRED];
+        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
+        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED],
+                   20, 128);
+        sum -= c[DIAG_DOWN_LEFT_PRED];
+        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
+        sum -= c[VERT_LEFT_PRED];
+        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
+        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
+    }
+}
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 0fe1c1af5e..6f4935bc3e 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -35,6 +35,7 @@ OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
 OBJS-$(CONFIG_VP8_DECODER)             += x86/vp8dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
@@ -90,3 +91,4 @@ YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o                  \
                                           x86/vp8dsp_loopfilter.o
+YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9dsp.o
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
new file mode 100644
index 0000000000..6488f3092d
--- /dev/null
+++ b/libavcodec/x86/vp9dsp.asm
@@ -0,0 +1,277 @@
+;******************************************************************************
+;* VP9 SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; FIXME share with vp8dsp.asm
+pw_256:   times 8 dw 256
+
+%macro F8_TAPS 8
+times 8 db %1, %2
+times 8 db %3, %4
+times 8 db %5, %6
+times 8 db %7, %8
+%endmacro
+; int8_t ff_filters_ssse3[3][15][4][16]: [smooth/regular/sharp][15 tap sets][tap pair][pair repeated 8x]
+const filters_ssse3 ; smooth
+                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
+                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
+                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
+                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
+                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
+                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
+                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
+                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
+                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
+                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
+                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
+                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
+                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
+                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
+                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
+                    ; regular
+                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
+                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
+                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
+                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
+                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
+                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
+                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
+                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
+                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
+                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
+                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
+                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
+                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
+                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
+                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
+                    ; sharp
+                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
+                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
+                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
+                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
+                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
+                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
+                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
+                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
+                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
+                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
+                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
+                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
+                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
+                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
+                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
+
+SECTION .text
+
+%macro filter_h_fn 1
+%assign %%px mmsize/2
+cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
+    mova        m6, [pw_256]            ; pmulhrsw by 256 == (x + 64) >> 7
+    mova        m7, [filteryq+ 0]       ; tap pair 0/1 stays in a register
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+16]       ; x86-64 xmm build: keep the other three pairs resident too
+    mova        m9, [filteryq+32]
+    mova       m10, [filteryq+48]
+%endif
+.loop:                                  ; one output row (mmsize/2 pixels) per iteration
+    movh        m0, [srcq-3]            ; eight byte-shifted copies of the row: src[-3] .. src[+4]
+    movh        m1, [srcq-2]
+    movh        m2, [srcq-1]
+    movh        m3, [srcq+0]
+    movh        m4, [srcq+1]
+    movh        m5, [srcq+2]
+    punpcklbw   m0, m1                  ; interleave neighbours to match the (tap, tap) byte pairs
+    punpcklbw   m2, m3
+    movh        m1, [srcq+3]
+    movh        m3, [srcq+4]
+    add       srcq, sstrideq
+    punpcklbw   m4, m5
+    punpcklbw   m1, m3
+    pmaddubsw   m0, m7                  ; unsigned pixels * signed taps, adjacent products summed
+%if ARCH_X86_64 && mmsize > 8
+    pmaddubsw   m2, m8
+    pmaddubsw   m4, m9
+    pmaddubsw   m1, m10
+%else
+    pmaddubsw   m2, [filteryq+16]
+    pmaddubsw   m4, [filteryq+32]
+    pmaddubsw   m1, [filteryq+48]
+%endif
+    paddw       m0, m2                  ; combine the four partial sums; only the last add saturates
+    paddw       m4, m1
+    paddsw      m0, m4
+    pmulhrsw    m0, m6                  ; round and drop the 7 fractional bits
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    packuswb    m0, m0                  ; clamp to 0..255
+%ifidn %1, avg
+    pavgb       m0, m1                  ; avg variant: rounded average with the existing dst pixels
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX ssse3
+filter_h_fn put
+filter_h_fn avg
+
+INIT_XMM ssse3
+filter_h_fn put
+filter_h_fn avg
+
+%macro filter_v_fn 1
+%assign %%px mmsize/2
+%if ARCH_X86_64
+cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3
+%else
+cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3
+    mov   filteryq, r5mp                ; x86-32: cglobal loads four args only, filtery is fetched by hand
+%define hd r4mp
+%endif
+    sub       srcq, sstrideq            ; three subs in total: src -= 3 * sstride
+    lea  sstride3q, [sstrideq*3]
+    sub       srcq, sstrideq
+    mova        m6, [pw_256]            ; pmulhrsw by 256 == (x + 64) >> 7
+    sub       srcq, sstrideq
+    mova        m7, [filteryq+ 0]
+    lea      src4q, [srcq+sstrideq*4]   ; src4 is four rows below src; together they address all 8 rows
+%if ARCH_X86_64 && mmsize > 8
+    mova        m8, [filteryq+16]
+    mova        m9, [filteryq+32]
+    mova       m10, [filteryq+48]
+%endif
+.loop:                                  ; one output row (mmsize/2 pixels) per iteration
+    ; FIXME maybe reuse loads from previous rows, or just more generally
+    ; unroll this to prevent multiple loads of the same data?
+    movh        m0, [srcq]              ; rows 0-3 through src
+    movh        m1, [srcq+sstrideq]
+    movh        m2, [srcq+sstrideq*2]
+    movh        m3, [srcq+sstride3q]
+    movh        m4, [src4q]             ; rows 4-7 through src4
+    movh        m5, [src4q+sstrideq]
+    punpcklbw   m0, m1                  ; interleave vertically adjacent rows to match the tap pairs
+    punpcklbw   m2, m3
+    movh        m1, [src4q+sstrideq*2]
+    movh        m3, [src4q+sstride3q]
+    add       srcq, sstrideq
+    add      src4q, sstrideq
+    punpcklbw   m4, m5
+    punpcklbw   m1, m3
+    pmaddubsw   m0, m7                  ; unsigned pixels * signed taps, adjacent products summed
+%if ARCH_X86_64 && mmsize > 8
+    pmaddubsw   m2, m8
+    pmaddubsw   m4, m9
+    pmaddubsw   m1, m10
+%else
+    pmaddubsw   m2, [filteryq+16]
+    pmaddubsw   m4, [filteryq+32]
+    pmaddubsw   m1, [filteryq+48]
+%endif
+    paddw       m0, m2                  ; combine the four partial sums; only the last add saturates
+    paddw       m4, m1
+    paddsw      m0, m4
+    pmulhrsw    m0, m6                  ; round and drop the 7 fractional bits
+%ifidn %1, avg
+    movh        m1, [dstq]
+%endif
+    packuswb    m0, m0                  ; clamp to 0..255
+%ifidn %1, avg
+    pavgb       m0, m1                  ; avg variant: rounded average with the existing dst pixels
+%endif
+    movh    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_MMX ssse3
+filter_v_fn put
+filter_v_fn avg
+
+INIT_XMM ssse3
+filter_v_fn put
+filter_v_fn avg
+
+%macro fpel_fn 6
+%if %2 == 4
+%define %%srcfn movh
+%define %%dstfn movh
+%else
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+; args: put/avg, block width, offsets of the 2nd/3rd/4th access, rows consumed per iteration
+%if %2 <= 16
+cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3
+    lea  sstride3q, [sstrideq*3]
+    lea  dstride3q, [dstrideq*3]
+%else
+cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
+%endif
+.loop:
+    %%srcfn     m0, [srcq]
+    %%srcfn     m1, [srcq+s%3]          ; an s (or d below) is pasted in front of each offset argument
+    %%srcfn     m2, [srcq+s%4]
+    %%srcfn     m3, [srcq+s%5]
+    lea       srcq, [srcq+sstrideq*%6]  ; advance past the rows just copied
+%ifidn %1, avg
+    pavgb       m0, [dstq]              ; NOTE(review): reads a full register width from dst, 8 bytes even for width 4
+    pavgb       m1, [dstq+d%3]
+    pavgb       m2, [dstq+d%4]
+    pavgb       m3, [dstq+d%5]
+%endif
+    %%dstfn [dstq], m0                  ; mova for widths above 4, so dst presumably must be aligned
+    %%dstfn [dstq+d%3], m1
+    %%dstfn [dstq+d%4], m2
+    %%dstfn [dstq+d%5], m3
+    lea       dstq, [dstq+dstrideq*%6]
+    sub         hd, %6                  ; h has to be a multiple of the rows per iteration for jnz to exit
+    jnz .loop
+    RET
+%endmacro
+
+%define d16 16
+%define s16 16
+INIT_MMX mmx
+fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
+fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
+INIT_MMX sse
+fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
+fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
+fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
+%undef s16
+%undef d16
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
new file mode 100644
index 0000000000..540dc3882f
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -0,0 +1,245 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9.h"
+
+#if HAVE_YASM
+/* Prototypes for the full-pel copy/average functions generated by fpel_fn in vp9dsp.asm. */
+#define fpel_func(avg, sz, opt)                                         \
+void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src,     \
+                                  ptrdiff_t dst_stride,                 \
+                                  ptrdiff_t src_stride,                 \
+                                  int h, int mx, int my)
+
+fpel_func(put,  4, mmx);
+fpel_func(put,  8, mmx);
+fpel_func(put, 16, sse);
+fpel_func(put, 32, sse);
+fpel_func(put, 64, sse);
+fpel_func(avg,  4, sse);
+fpel_func(avg,  8, sse);
+fpel_func(avg, 16, sse2);
+fpel_func(avg, 32, sse2);
+fpel_func(avg, 64, sse2);
+#undef fpel_func
+/* Prototypes for the 1-D 8-tap filters from vp9dsp.asm: widths 4 and 8, h and v, put and avg. */
+#define mc_func(avg, sz, dir, opt)                                          \
+void                                                                        \
+ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
+                                                      const uint8_t *src,   \
+                                                      ptrdiff_t dst_stride, \
+                                                      ptrdiff_t src_stride, \
+                                                      int h,                \
+                                                      const int8_t (*filter)[16])
+
+#define mc_funcs(sz)            \
+    mc_func(put, sz, h, ssse3); \
+    mc_func(avg, sz, h, ssse3); \
+    mc_func(put, sz, v, ssse3); \
+    mc_func(avg, sz, v, ssse3)
+
+mc_funcs(4);
+mc_funcs(8);
+
+#undef mc_funcs
+#undef mc_func
+/* Wider 1-D filters: call the half-width function on the left half, then on the right half (+ hsz). */
+#define mc_rep_func(avg, sz, hsz, dir, opt)                                 \
+static av_always_inline void                                                \
+ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
+                                                      const uint8_t *src,   \
+                                                      ptrdiff_t dst_stride, \
+                                                      ptrdiff_t src_stride, \
+                                                      int h,                \
+                                                      const int8_t (*filter)[16]) \
+{                                                                           \
+    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src,        \
+                                                           dst_stride,      \
+                                                           src_stride,      \
+                                                           h,               \
+                                                           filter);         \
+    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz,       \
+                                                           src + hsz,       \
+                                                           dst_stride,      \
+                                                           src_stride,      \
+                                                           h, filter);      \
+}
+
+#define mc_rep_funcs(sz, hsz)            \
+    mc_rep_func(put, sz, hsz, h, ssse3); \
+    mc_rep_func(avg, sz, hsz, h, ssse3); \
+    mc_rep_func(put, sz, hsz, v, ssse3); \
+    mc_rep_func(avg, sz, hsz, v, ssse3)
+
+mc_rep_funcs(16, 8);
+mc_rep_funcs(32, 16);
+mc_rep_funcs(64, 32);
+
+#undef mc_rep_funcs
+#undef mc_rep_func
+
+extern const int8_t ff_filters_ssse3[3][15][4][16];
+/* 2-D case: h-filter h + 7 rows (from 3 rows above src) into a 64-stride temp, then v-filter temp into dst. */
+#define filter_8tap_2d_fn(op, sz, f, fname)                             \
+static void                                                             \
+op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst,              \
+                                             const uint8_t *src,        \
+                                             ptrdiff_t dst_stride,      \
+                                             ptrdiff_t src_stride,      \
+                                             int h, int mx, int my)     \
+{                                                                       \
+    LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]);                         \
+    ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride,       \
+                                      64, src_stride,                   \
+                                      h + 7,                            \
+                                      ff_filters_ssse3[f][mx - 1]);     \
+    ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64,        \
+                                             dst_stride, 64,            \
+                                             h,                         \
+                                             ff_filters_ssse3[f][my - 1]); \
+}
+
+#define filters_8tap_2d_fn(op, sz)                          \
+    filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
+    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp)     \
+    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
+
+#define filters_8tap_2d_fn2(op) \
+    filters_8tap_2d_fn(op, 64)  \
+    filters_8tap_2d_fn(op, 32)  \
+    filters_8tap_2d_fn(op, 16)  \
+    filters_8tap_2d_fn(op, 8)   \
+    filters_8tap_2d_fn(op, 4)
+
+filters_8tap_2d_fn2(put)
+filters_8tap_2d_fn2(avg)
+
+#undef filters_8tap_2d_fn2
+#undef filters_8tap_2d_fn
+#undef filter_8tap_2d_fn
+/* 1-D wrappers taking (h, mx, my): select the tap set with mx for h filters or my for v filters. */
+#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar)                  \
+static void                                                             \
+op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst,         \
+                                                  const uint8_t *src,   \
+                                                  ptrdiff_t dst_stride, \
+                                                  ptrdiff_t src_stride, \
+                                                  int h, int mx,        \
+                                                  int my)               \
+{                                                                       \
+    ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src,        \
+                                                       dst_stride,      \
+                                                       src_stride, h,   \
+                                                       ff_filters_ssse3[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar)                          \
+    filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
+    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar)     \
+    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
+
+#define filters_8tap_1d_fn2(op, sz)             \
+    filters_8tap_1d_fn(op, sz, h, mx)           \
+    filters_8tap_1d_fn(op, sz, v, my)
+
+#define filters_8tap_1d_fn3(op) \
+    filters_8tap_1d_fn2(op, 64) \
+    filters_8tap_1d_fn2(op, 32) \
+    filters_8tap_1d_fn2(op, 16) \
+    filters_8tap_1d_fn2(op,  8) \
+    filters_8tap_1d_fn2(op,  4)
+
+filters_8tap_1d_fn3(put)
+filters_8tap_1d_fn3(avg)
+
+#undef filters_8tap_1d_fn
+#undef filters_8tap_1d_fn2
+#undef filters_8tap_1d_fn3
+#undef filter_8tap_1d_fn
+
+#endif /* HAVE_YASM */
+/* Fill dsp->mc with the x86 functions allowed by the runtime CPU flags. */
+av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
+{
+#if HAVE_YASM /* the body is compiled out entirely when HAVE_YASM is 0 */
+    int cpu_flags = av_get_cpu_flags();
+    /* Put one ff_<type><sz>_<opt> in the [0][0] slot of all four filters. */
+#define init_fpel(idx1, idx2, sz, type, opt)                            \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] =                    \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] =                    \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] =                    \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt
+    /* Store the smooth/regular/sharp 8-tap wrappers for one size and one
+     * direction; FILTER_BILINEAR is not touched by the subpel macros. */
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _ ## opt; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _ ## opt
+    /* First index 0..4 pairs with sizes 64, 32, 16, 8 and 4. */
+#define init_subpel2(idx, idxh, idxv, dir, type, opt)     \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, opt); \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, opt)
+    /* (idxh, idxv) slots: (1, 1) = hv, (0, 1) = v, (1, 0) = h. */
+#define init_subpel3(idx, type, opt)        \
+    init_subpel2(idx, 1, 1, hv, type, opt); \
+    init_subpel2(idx, 0, 1,  v, type, opt); \
+    init_subpel2(idx, 1, 0,  h, type, opt)
+    /* idx2 is 0 for put and 1 for avg; each group is gated on cpu_flags. */
+    if (EXTERNAL_MMX(cpu_flags)) {
+        init_fpel(4, 0,  4, put, mmx);
+        init_fpel(3, 0,  8, put, mmx);
+    }
+    /* Remaining put sizes plus avg 4/8 use the _sse functions. */
+    if (EXTERNAL_SSE(cpu_flags)) {
+        init_fpel(2, 0, 16, put, sse);
+        init_fpel(1, 0, 32, put, sse);
+        init_fpel(0, 0, 64, put, sse);
+        init_fpel(4, 1,  4, avg, sse);
+        init_fpel(3, 1,  8, avg, sse);
+    }
+    /* avg 16/32/64 use the _sse2 functions. */
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel(2, 1, 16, avg, sse2);
+        init_fpel(1, 1, 32, avg, sse2);
+        init_fpel(0, 1, 64, avg, sse2);
+    }
+    /* 8-tap put/avg wrappers (h, v and hv, all sizes) are SSSE3-only. */
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        init_subpel3(0, put, ssse3);
+        init_subpel3(1, avg, ssse3);
+    }
+    /* Keep the helper macros local to this function. */
+#undef init_fpel
+#undef init_subpel1
+#undef init_subpel2
+#undef init_subpel3
+
+#endif /* HAVE_YASM */
+}
-- 
cgit v1.2.3