Diffstat (limited to 'libavfilter/x86')
-rw-r--r--  libavfilter/x86/Makefile                  23
-rw-r--r--  libavfilter/x86/af_volume.asm             10
-rw-r--r--  libavfilter/x86/af_volume_init.c           8
-rw-r--r--  libavfilter/x86/vf_eq.c                   96
-rw-r--r--  libavfilter/x86/vf_fspp.asm              727
-rw-r--r--  libavfilter/x86/vf_fspp_init.c            49
-rw-r--r--  libavfilter/x86/vf_gradfun.asm             8
-rw-r--r--  libavfilter/x86/vf_gradfun_init.c         61
-rw-r--r--  libavfilter/x86/vf_hqdn3d.asm             12
-rw-r--r--  libavfilter/x86/vf_hqdn3d_init.c          10
-rw-r--r--  libavfilter/x86/vf_idet.asm              170
-rw-r--r--  libavfilter/x86/vf_idet_init.c            87
-rw-r--r--  libavfilter/x86/vf_interlace.asm          10
-rw-r--r--  libavfilter/x86/vf_interlace_init.c        8
-rw-r--r--  libavfilter/x86/vf_noise.c               144
-rw-r--r--  libavfilter/x86/vf_pp7.asm                57
-rw-r--r--  libavfilter/x86/vf_pp7_init.c             34
-rw-r--r--  libavfilter/x86/vf_psnr.asm              140
-rw-r--r--  libavfilter/x86/vf_psnr_init.c            39
-rw-r--r--  libavfilter/x86/vf_pullup.asm            178
-rw-r--r--  libavfilter/x86/vf_pullup_init.c          41
-rw-r--r--  libavfilter/x86/vf_removegrain.asm      1218
-rw-r--r--  libavfilter/x86/vf_removegrain_init.c     88
-rw-r--r--  libavfilter/x86/vf_spp.c                 235
-rw-r--r--  libavfilter/x86/vf_ssim.asm              247
-rw-r--r--  libavfilter/x86/vf_ssim_init.c            43
-rw-r--r--  libavfilter/x86/vf_tinterlace_init.c      47
-rw-r--r--  libavfilter/x86/vf_yadif.asm              53
-rw-r--r--  libavfilter/x86/vf_yadif_init.c           68
-rw-r--r--  libavfilter/x86/yadif-10.asm             255
-rw-r--r--  libavfilter/x86/yadif-16.asm             317
31 files changed, 4379 insertions(+), 104 deletions(-)
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 13b5d318ec..5382027f70 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,11 +1,32 @@
+OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
+OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
+OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
+OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
+OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
+OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
+OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
+OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
+OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
+OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
+YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
+YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
+YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
+YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
+YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o
+endif
+YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
+YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
-YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/af_volume.asm b/libavfilter/x86/af_volume.asm
index 4e5ad2258c..f4cbcbc5de 100644
--- a/libavfilter/x86/af_volume.asm
+++ b/libavfilter/x86/af_volume.asm
@@ -2,20 +2,20 @@
;* x86-optimized functions for volume filter
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -99,9 +99,11 @@ cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
INIT_XMM sse2
%define CVTDQ2PD cvtdq2pd
SCALE_SAMPLES_S32
+%if HAVE_AVX_EXTERNAL
%define CVTDQ2PD vcvtdq2pd
INIT_YMM avx
SCALE_SAMPLES_S32
+%endif
%undef CVTDQ2PD
; NOTE: This is not bit-identical with the C version because it clips to
diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c
index 26605fb2ce..88f5a9679a 100644
--- a/libavfilter/x86/af_volume_init.c
+++ b/libavfilter/x86/af_volume_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavfilter/x86/vf_eq.c b/libavfilter/x86/vf_eq.c
new file mode 100644
index 0000000000..16f399505f
--- /dev/null
+++ b/libavfilter/x86/vf_eq.c
@@ -0,0 +1,96 @@
+/*
+ *
+ * Original MPlayer filters by Richard Felker.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_eq.h"
+
+#if HAVE_MMX_INLINE && HAVE_6REGS
+static void process_MMX(EQParameters *param, uint8_t *dst, int dst_stride,
+ const uint8_t *src, int src_stride, int w, int h)
+{
+ int i;
+ int pel;
+ int dstep = dst_stride - w;
+ int sstep = src_stride - w;
+ short brvec[4];
+ short contvec[4];
+ int contrast = (int) (param->contrast * 256 * 16);
+ int brightness = ((int) (100.0 * param->brightness + 100.0) * 511) / 200 - 128 - contrast / 32;
+
+ brvec[0] = brvec[1] = brvec[2] = brvec[3] = brightness;
+ contvec[0] = contvec[1] = contvec[2] = contvec[3] = contrast;
+
+ while (h--) {
+ __asm__ volatile (
+ "movq (%5), %%mm3 \n\t"
+ "movq (%6), %%mm4 \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+ "movl %4, %%eax \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0), %%mm1 \n\t"
+ "movq (%0), %%mm2 \n\t"
+ "punpcklbw %%mm0, %%mm1\n\t"
+ "punpckhbw %%mm0, %%mm2\n\t"
+ "psllw $4, %%mm1 \n\t"
+ "psllw $4, %%mm2 \n\t"
+ "pmulhw %%mm4, %%mm1 \n\t"
+ "pmulhw %%mm4, %%mm2 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "packuswb %%mm2, %%mm1 \n\t"
+ "add $8, %0 \n\t"
+ "movq %%mm1, (%1) \n\t"
+ "add $8, %1 \n\t"
+ "decl %%eax \n\t"
+ "jnz 1b \n\t"
+ : "=r" (src), "=r" (dst)
+ : "0" (src), "1" (dst), "r" (w>>3), "r" (brvec), "r" (contvec)
+ : "%eax"
+ );
+
+ for (i = w&7; i; i--) {
+ pel = ((*src++ * contrast) >> 12) + brightness;
+ if (pel & ~255)
+ pel = (-pel) >> 31;
+ *dst++ = pel;
+ }
+
+ src += sstep;
+ dst += dstep;
+ }
+ __asm__ volatile ( "emms \n\t" ::: "memory" );
+}
+#endif
+
+av_cold void ff_eq_init_x86(EQContext *eq)
+{
+#if HAVE_MMX_INLINE && HAVE_6REGS
+ int cpu_flags = av_get_cpu_flags();
+
+ if (cpu_flags & AV_CPU_FLAG_MMX) {
+ eq->process = process_MMX;
+ }
+#endif
+}
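
The scalar tail above spells out the fixed-point mapping that the MMX loop
applies eight pixels at a time: contrast is stored as Q4.12 ("* 256 * 16"),
so the product is shifted down by 12 before the brightness offset is added
(the MMX path does the same via psllw $4 followed by pmulhw). A minimal C
sketch of the per-pixel math, with the branchless clamp annotated
(eq_pixel is an illustrative name, not part of the filter):

    #include <stdint.h>

    static uint8_t eq_pixel(uint8_t src, int contrast, int brightness)
    {
        int pel = ((src * contrast) >> 12) + brightness;
        /* (pel & ~255) is nonzero iff pel lies outside [0,255]. For pel < 0,
         * -pel is positive, so (-pel) >> 31 gives 0; for pel > 255, -pel is
         * negative, so the arithmetic shift gives -1, which truncates to 255
         * when stored into a byte (right shift of a negative int is
         * arithmetic on the targets this code builds for). */
        if (pel & ~255)
            pel = (-pel) >> 31;
        return (uint8_t)pel;
    }
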
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
new file mode 100644
index 0000000000..c7f8f64f1b
--- /dev/null
+++ b/libavfilter/x86/vf_fspp.asm
@@ -0,0 +1,727 @@
+;*****************************************************************************
+;* x86-optimized functions for fspp filter
+;*
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
+ 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
+ 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
+ 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
+pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
+pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
+pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
+pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
+pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
+pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
+pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
+pw_4: times 4 dw 4
+pw_2: times 4 dw 2
+
+SECTION .text
+
+%define DCTSIZE 8
+
+INIT_MMX mmx
+
+;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+%if ARCH_X86_64
+cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+%define dst_strideq r2m
+%define src_strideq r3m
+ mov widthq, r4m
+ mov dither_heightq, r5m
+ mov ditherq, r6m ; log2_scale
+%endif
+ add widthq, 7
+ mov tmpq, src_strideq
+ and widthq, ~7
+ sub dst_strideq, widthq
+ movd m5, ditherd ; log2_scale
+ xor ditherq, -1 ; log2_scale
+ mov tmp2q, tmpq
+ add ditherq, 7 ; log2_scale
+ neg tmpq
+ sub tmp2q, widthq
+ movd m2, ditherd ; log2_scale
+ add tmp2q, tmp2q
+ lea ditherq, [pb_dither]
+ mov src_strideq, tmp2q
+ shl tmpq, 4
+ lea dither_heightq, [ditherq+dither_heightq*8]
+ pxor m7, m7
+
+.loop_height:
+ movq m3, [ditherq]
+ movq m4, m3
+ punpcklbw m3, m7
+ punpckhbw m4, m7
+ mov tmp2q, widthq
+ psraw m3, m5
+ psraw m4, m5
+
+.loop_width:
+ movq [srcq+tmpq], m7
+ movq m0, [srcq]
+ movq m1, [srcq+8]
+ movq [srcq+tmpq+8], m7
+ paddw m0, m3
+ paddw m1, m4
+ movq [srcq], m7
+ psraw m0, m2
+ psraw m1, m2
+ movq [srcq+8], m7
+ packuswb m0, m1
+ add srcq, 16
+ movq [dstq], m0
+ add dstq, 8
+ sub tmp2q, 8
+ jg .loop_width
+
+ add srcq, src_strideq
+ add ditherq, 8
+ add dstq, dst_strideq
+ cmp ditherq, dither_heightq
+ jl .loop_height
+ RET
+
+;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+%if ARCH_X86_64
+cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+%define dst_strideq r2m
+%define src_strideq r3m
+ mov dstq, dstm
+ mov srcq, srcm
+ mov widthq, r4m
+ mov dither_heightq, r5m
+ mov ditherq, r6m ; log2_scale
+%endif
+ add widthq, 7
+ mov tmpq, src_strideq
+ and widthq, ~7
+ sub dst_strideq, widthq
+ movd m5, ditherd ; log2_scale
+ xor ditherq, -1 ; log2_scale
+ mov tmp2q, tmpq
+ add ditherq, 7 ; log2_scale
+ sub tmp2q, widthq
+ movd m2, ditherd ; log2_scale
+ add tmp2q, tmp2q
+ lea ditherq, [pb_dither]
+ mov src_strideq, tmp2q
+ shl tmpq, 5
+ lea dither_heightq, [ditherq+dither_heightq*8]
+ pxor m7, m7
+
+.loop_height:
+ movq m3, [ditherq]
+ movq m4, m3
+ punpcklbw m3, m7
+ punpckhbw m4, m7
+ mov tmp2q, widthq
+ psraw m3, m5
+ psraw m4, m5
+
+.loop_width:
+ movq m0, [srcq]
+ movq m1, [srcq+8]
+ paddw m0, m3
+ paddw m0, [srcq+tmpq]
+ paddw m1, m4
+ movq m6, [srcq+tmpq+8]
+ movq [srcq+tmpq], m7
+ psraw m0, m2
+ paddw m1, m6
+ movq [srcq+tmpq+8], m7
+ psraw m1, m2
+ packuswb m0, m1
+ movq [dstq], m0
+ add srcq, 16
+ add dstq, 8
+ sub tmp2q, 8
+ jg .loop_width
+
+ add srcq, src_strideq
+ add ditherq, 8
+ add dstq, dst_strideq
+ cmp ditherq, dither_heightq
+ jl .loop_height
+ RET
+
+;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
+ movd m7, qd
+ movq m0, [thrnq]
+ punpcklwd m7, m7
+ movq m1, [thrnq+8]
+ punpckldq m7, m7
+ pmullw m0, m7
+ movq m2, [thrnq+8*2]
+ pmullw m1, m7
+ movq m3, [thrnq+8*3]
+ pmullw m2, m7
+ movq [thrq], m0
+ movq m4, [thrnq+8*4]
+ pmullw m3, m7
+ movq [thrq+8], m1
+ movq m5, [thrnq+8*5]
+ pmullw m4, m7
+ movq [thrq+8*2], m2
+ movq m6, [thrnq+8*6]
+ pmullw m5, m7
+ movq [thrq+8*3], m3
+ movq m0, [thrnq+8*7]
+ pmullw m6, m7
+ movq [thrq+8*4], m4
+ movq m1, [thrnq+8*7+8]
+ pmullw m0, m7
+ movq [thrq+8*5], m5
+ movq m2, [thrnq+8*7+8*2]
+ pmullw m1, m7
+ movq [thrq+8*6], m6
+ movq m3, [thrnq+8*7+8*3]
+ pmullw m2, m7
+ movq [thrq+8*7], m0
+ movq m4, [thrnq+8*7+8*4]
+ pmullw m3, m7
+ movq [thrq+8*7+8], m1
+ movq m5, [thrnq+8*7+8*5]
+ pmullw m4, m7
+ movq [thrq+8*7+8*2], m2
+ movq m6, [thrnq+8*7+8*6]
+ pmullw m5, m7
+ movq [thrq+8*7+8*3], m3
+ movq m0, [thrnq+14*8]
+ pmullw m6, m7
+ movq [thrq+8*7+8*4], m4
+ movq m1, [thrnq+14*8+8]
+ pmullw m0, m7
+ movq [thrq+8*7+8*5], m5
+ pmullw m1, m7
+ movq [thrq+8*7+8*6], m6
+ movq [thrq+14*8], m0
+ movq [thrq+14*8+8], m1
+ RET
+
+%macro COLUMN_FDCT 1-3 0, 0
+ movq m1, [srcq+DCTSIZE*0*2]
+ movq m7, [srcq+DCTSIZE*3*2]
+ movq m0, m1
+ paddw m1, [srcq+DCTSIZE*7*2]
+ movq m3, m7
+ paddw m7, [srcq+DCTSIZE*4*2]
+ movq m5, m1
+ movq m6, [srcq+DCTSIZE*1*2]
+ psubw m1, m7
+ movq m2, [srcq+DCTSIZE*2*2]
+ movq m4, m6
+ paddw m6, [srcq+DCTSIZE*6*2]
+ paddw m5, m7
+ paddw m2, [srcq+DCTSIZE*5*2]
+ movq m7, m6
+ paddw m6, m2
+ psubw m7, m2
+ movq m2, m5
+ paddw m5, m6
+ psubw m2, m6
+ paddw m7, m1
+ movq m6, [thrq+4*16+%2]
+ psllw m7, 2
+ psubw m5, [thrq+%2]
+ psubw m2, m6
+ paddusw m5, [thrq+%2]
+ paddusw m2, m6
+ pmulhw m7, [pw_2D41]
+ paddw m5, [thrq+%2]
+ paddw m2, m6
+ psubusw m5, [thrq+%2]
+ psubusw m2, m6
+ paddw m5, [pw_2]
+ movq m6, m2
+ paddw m2, m5
+ psubw m5, m6
+ movq m6, m1
+ paddw m1, m7
+ psubw m1, [thrq+2*16+%2]
+ psubw m6, m7
+ movq m7, [thrq+6*16+%2]
+ psraw m5, 2
+ paddusw m1, [thrq+2*16+%2]
+ psubw m6, m7
+ paddw m1, [thrq+2*16+%2]
+ paddusw m6, m7
+ psubusw m1, [thrq+2*16+%2]
+ paddw m6, m7
+ psubw m3, [srcq+DCTSIZE*4*2]
+ psubusw m6, m7
+ movq m7, m1
+ psraw m2, 2
+ psubw m4, [srcq+DCTSIZE*6*2]
+ psubw m1, m6
+ psubw m0, [srcq+DCTSIZE*7*2]
+ paddw m6, m7
+ psraw m6, 2
+ movq m7, m2
+ pmulhw m1, [pw_5A82]
+ paddw m2, m6
+ movq [rsp], m2
+ psubw m7, m6
+ movq m2, [srcq+DCTSIZE*2*2]
+ psubw m1, m6
+ psubw m2, [srcq+DCTSIZE*5*2]
+ movq m6, m5
+ movq [rsp+8*3], m7
+ paddw m3, m2
+ paddw m2, m4
+ paddw m4, m0
+ movq m7, m3
+ psubw m3, m4
+ psllw m3, 2
+ psllw m7, 2
+ pmulhw m3, [pw_187E]
+ psllw m4, 2
+ pmulhw m7, [pw_22A3]
+ psllw m2, 2
+ pmulhw m4, [pw_539F]
+ paddw m5, m1
+ pmulhw m2, [pw_2D41]
+ psubw m6, m1
+ paddw m7, m3
+ movq [rsp+8], m5
+ paddw m4, m3
+ movq m3, [thrq+3*16+%2]
+ movq m1, m0
+ movq [rsp+8*2], m6
+ psubw m1, m2
+ paddw m0, m2
+ movq m5, m1
+ movq m2, [thrq+5*16+%2]
+ psubw m1, m7
+ paddw m5, m7
+ psubw m1, m3
+ movq m7, [thrq+16+%2]
+ psubw m5, m2
+ movq m6, m0
+ paddw m0, m4
+ paddusw m1, m3
+ psubw m6, m4
+ movq m4, [thrq+7*16+%2]
+ psubw m0, m7
+ psubw m6, m4
+ paddusw m5, m2
+ paddusw m6, m4
+ paddw m1, m3
+ paddw m5, m2
+ paddw m6, m4
+ psubusw m1, m3
+ psubusw m5, m2
+ psubusw m6, m4
+ movq m4, m1
+ por m4, m5
+ paddusw m0, m7
+ por m4, m6
+ paddw m0, m7
+ packssdw m4, m4
+ psubusw m0, m7
+ movd tmpd, m4
+ or tmpd, tmpd
+ jnz %1
+ movq m4, [rsp]
+ movq m1, m0
+ pmulhw m0, [pw_3642]
+ movq m2, m1
+ movq m5, [outq+DCTSIZE*0*2]
+ movq m3, m2
+ pmulhw m1, [pw_2441]
+ paddw m5, m4
+ movq m6, [rsp+8]
+ psraw m3, 2
+ pmulhw m2, [pw_0CBB]
+ psubw m4, m3
+ movq m7, [outq+DCTSIZE*1*2]
+ paddw m5, m3
+ movq [outq+DCTSIZE*7*2], m4
+ paddw m7, m6
+ movq m3, [rsp+8*2]
+ psubw m6, m0
+ movq m4, [outq+DCTSIZE*2*2]
+ paddw m7, m0
+ movq [outq], m5
+ paddw m4, m3
+ movq [outq+DCTSIZE*6*2], m6
+ psubw m3, m1
+ movq m5, [outq+DCTSIZE*5*2]
+ paddw m4, m1
+ movq m6, [outq+DCTSIZE*3*2]
+ paddw m5, m3
+ movq m0, [rsp+8*3]
+ add srcq, 8+%3
+ movq [outq+DCTSIZE*1*2], m7
+ paddw m6, m0
+ movq [outq+DCTSIZE*2*2], m4
+ psubw m0, m2
+ movq m7, [outq+DCTSIZE*4*2]
+ paddw m6, m2
+ movq [outq+DCTSIZE*5*2], m5
+ paddw m7, m0
+ movq [outq+DCTSIZE*3*2], m6
+ movq [outq+DCTSIZE*4*2], m7
+ add outq, 8+%3
+%endmacro
+
+%macro COLUMN_IDCT 0-1 0
+ movq m3, m5
+ psubw m5, m1
+ psllw m5, 1
+ paddw m3, m1
+ movq m2, m0
+ psubw m0, m6
+ movq m1, m5
+ psllw m0, 1
+ pmulhw m1, [pw_AC62]
+ paddw m5, m0
+ pmulhw m5, [pw_3B21]
+ paddw m2, m6
+ pmulhw m0, [pw_22A3]
+ movq m7, m2
+ movq m4, [rsp]
+ psubw m2, m3
+ psllw m2, 1
+ paddw m7, m3
+ pmulhw m2, [pw_2D41]
+ movq m6, m4
+ psraw m7, 2
+ paddw m4, [outq]
+ psubw m6, m7
+ movq m3, [rsp+8]
+ paddw m4, m7
+ movq [outq+DCTSIZE*7*2], m6
+ paddw m1, m5
+ movq [outq], m4
+ psubw m1, m7
+ movq m7, [rsp+8*2]
+ psubw m0, m5
+ movq m6, [rsp+8*3]
+ movq m5, m3
+ paddw m3, [outq+DCTSIZE*1*2]
+ psubw m5, m1
+ psubw m2, m1
+ paddw m3, m1
+ movq [outq+DCTSIZE*6*2], m5
+ movq m4, m7
+ paddw m7, [outq+DCTSIZE*2*2]
+ psubw m4, m2
+ paddw m4, [outq+DCTSIZE*5*2]
+ paddw m7, m2
+ movq [outq+DCTSIZE*1*2], m3
+ paddw m0, m2
+ movq [outq+DCTSIZE*2*2], m7
+ movq m1, m6
+ paddw m6, [outq+DCTSIZE*4*2]
+ psubw m1, m0
+ paddw m1, [outq+DCTSIZE*3*2]
+ paddw m6, m0
+ movq [outq+DCTSIZE*5*2], m4
+ add srcq, 8+%1
+ movq [outq+DCTSIZE*4*2], m6
+ movq [outq+DCTSIZE*3*2], m1
+ add outq, 8+%1
+%endmacro
+
+;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
+.fdct1:
+ COLUMN_FDCT .idct1
+ jmp .fdct2
+
+.idct1:
+ COLUMN_IDCT
+
+.fdct2:
+ COLUMN_FDCT .idct2, 8, 16
+ sub cntd, 2
+ jg .fdct1
+ RET
+
+.idct2:
+ COLUMN_IDCT 16
+ sub cntd, 2
+ jg .fdct1
+ RET
+
+;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
+ add strideq, strideq
+ lea stride3q, [strideq+strideq*2]
+.loop:
+ movq m0, [srcq+DCTSIZE*0*2]
+ movq m1, [srcq+DCTSIZE*1*2]
+ movq m4, m0
+ movq m2, [srcq+DCTSIZE*2*2]
+ punpcklwd m0, m1
+ movq m3, [srcq+DCTSIZE*3*2]
+ punpckhwd m4, m1
+ movq m7, m2
+ punpcklwd m2, m3
+ movq m6, m0
+ punpckldq m0, m2
+ punpckhdq m6, m2
+ movq m5, m0
+ punpckhwd m7, m3
+ psubw m0, m6
+ pmulhw m0, [pw_5A82]
+ movq m2, m4
+ punpckldq m4, m7
+ paddw m5, m6
+ punpckhdq m2, m7
+ movq m1, m4
+ psllw m0, 2
+ paddw m4, m2
+ movq m3, [srcq+DCTSIZE*0*2+8]
+ psubw m1, m2
+ movq m2, [srcq+DCTSIZE*1*2+8]
+ psubw m0, m5
+ movq m6, m4
+ paddw m4, m5
+ psubw m6, m5
+ movq m7, m1
+ movq m5, [srcq+DCTSIZE*2*2+8]
+ paddw m1, m0
+ movq [rsp], m4
+ movq m4, m3
+ movq [rsp+8], m6
+ punpcklwd m3, m2
+ movq m6, [srcq+DCTSIZE*3*2+8]
+ punpckhwd m4, m2
+ movq m2, m5
+ punpcklwd m5, m6
+ psubw m7, m0
+ punpckhwd m2, m6
+ movq m0, m3
+ punpckldq m3, m5
+ punpckhdq m0, m5
+ movq m5, m4
+ movq m6, m3
+ punpckldq m4, m2
+ psubw m3, m0
+ punpckhdq m5, m2
+ paddw m6, m0
+ movq m2, m4
+ movq m0, m3
+ psubw m4, m5
+ pmulhw m0, [pw_AC62]
+ paddw m3, m4
+ pmulhw m3, [pw_3B21]
+ paddw m2, m5
+ pmulhw m4, [pw_22A3]
+ movq m5, m2
+ psubw m2, m6
+ paddw m5, m6
+ pmulhw m2, [pw_2D41]
+ paddw m0, m3
+ psllw m0, 3
+ psubw m4, m3
+ movq m6, [rsp]
+ movq m3, m1
+ psllw m4, 3
+ psubw m0, m5
+ psllw m2, 3
+ paddw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ paddw m4, m2
+ movq m0, m7
+ paddw m7, m2
+ psubw m0, m2
+ movq m2, [pw_4]
+ psubw m6, m5
+ paddw m5, [rsp]
+ paddw m1, m2
+ paddw m5, m2
+ psraw m1, 3
+ paddw m7, m2
+ psraw m5, 3
+ paddw m5, [dstq]
+ psraw m7, 3
+ paddw m1, [dstq+strideq*1]
+ paddw m0, m2
+ paddw m7, [dstq+strideq*2]
+ paddw m3, m2
+ movq [dstq], m5
+ paddw m6, m2
+ movq [dstq+strideq*1], m1
+ psraw m0, 3
+ movq [dstq+strideq*2], m7
+ add dstq, stride3q
+ movq m5, [rsp+8]
+ psraw m3, 3
+ paddw m0, [dstq+strideq*2]
+ psubw m5, m4
+ paddw m3, [dstq+stride3q*1]
+ psraw m6, 3
+ paddw m4, [rsp+8]
+ paddw m5, m2
+ paddw m6, [dstq+strideq*4]
+ paddw m4, m2
+ movq [dstq+strideq*2], m0
+ psraw m5, 3
+ paddw m5, [dstq]
+ psraw m4, 3
+ paddw m4, [dstq+strideq*1]
+ add srcq, DCTSIZE*2*4
+ movq [dstq+stride3q*1], m3
+ movq [dstq+strideq*4], m6
+ movq [dstq], m5
+ movq [dstq+strideq*1], m4
+ sub dstq, stride3q
+ add dstq, 8
+ dec cntd
+ jnz .loop
+ RET
+
+;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
+ lea stride3q, [strideq+strideq*2]
+.loop:
+ movd m0, [pixq]
+ pxor m7, m7
+ movd m1, [pixq+strideq*1]
+ punpcklbw m0, m7
+ movd m2, [pixq+strideq*2]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ add pixq, stride3q
+ movq m5, m0
+ movd m3, [pixq+strideq*4]
+ movq m6, m1
+ movd m4, [pixq+stride3q*1]
+ punpcklbw m3, m7
+ psubw m5, m3
+ punpcklbw m4, m7
+ paddw m0, m3
+ psubw m6, m4
+ movd m3, [pixq+strideq*2]
+ paddw m1, m4
+ movq [rsp], m5
+ punpcklbw m3, m7
+ movq [rsp+8], m6
+ movq m4, m2
+ movd m5, [pixq]
+ paddw m2, m3
+ movd m6, [pixq+strideq*1]
+ punpcklbw m5, m7
+ psubw m4, m3
+ punpcklbw m6, m7
+ movq m3, m5
+ paddw m5, m6
+ psubw m3, m6
+ movq m6, m0
+ movq m7, m1
+ psubw m0, m5
+ psubw m1, m2
+ paddw m7, m2
+ paddw m1, m0
+ movq m2, m7
+ psllw m1, 2
+ paddw m6, m5
+ pmulhw m1, [pw_2D41]
+ paddw m7, m6
+ psubw m6, m2
+ movq m5, m0
+ movq m2, m7
+ punpcklwd m7, m6
+ paddw m0, m1
+ punpckhwd m2, m6
+ psubw m5, m1
+ movq m6, m0
+ movq m1, [rsp+8]
+ punpcklwd m0, m5
+ punpckhwd m6, m5
+ movq m5, m0
+ punpckldq m0, m7
+ paddw m3, m4
+ punpckhdq m5, m7
+ movq m7, m6
+ movq [srcq+DCTSIZE*0*2], m0
+ punpckldq m6, m2
+ movq [srcq+DCTSIZE*1*2], m5
+ punpckhdq m7, m2
+ movq [srcq+DCTSIZE*2*2], m6
+ paddw m4, m1
+ movq [srcq+DCTSIZE*3*2], m7
+ psllw m3, 2
+ movq m2, [rsp]
+ psllw m4, 2
+ pmulhw m4, [pw_2D41]
+ paddw m1, m2
+ psllw m1, 2
+ movq m0, m3
+ pmulhw m0, [pw_22A3]
+ psubw m3, m1
+ pmulhw m3, [pw_187E]
+ movq m5, m2
+ pmulhw m1, [pw_539F]
+ psubw m2, m4
+ paddw m5, m4
+ movq m6, m2
+ paddw m0, m3
+ movq m7, m5
+ paddw m2, m0
+ psubw m6, m0
+ movq m4, m2
+ paddw m1, m3
+ punpcklwd m2, m6
+ paddw m5, m1
+ punpckhwd m4, m6
+ psubw m7, m1
+ movq m6, m5
+ punpcklwd m5, m7
+ punpckhwd m6, m7
+ movq m7, m2
+ punpckldq m2, m5
+ sub pixq, stride3q
+ punpckhdq m7, m5
+ movq m5, m4
+ movq [srcq+DCTSIZE*0*2+8], m2
+ punpckldq m4, m6
+ movq [srcq+DCTSIZE*1*2+8], m7
+ punpckhdq m5, m6
+ movq [srcq+DCTSIZE*2*2+8], m4
+ add pixq, 4
+ movq [srcq+DCTSIZE*3*2+8], m5
+ add srcq, DCTSIZE*4*2
+ dec cntd
+ jnz .loop
+ RET
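
The pw_* words above are butterfly coefficients stored in fixed point; the
comment beside each gives the real value and the number of fractional bits.
A quick hedged check of the convention (fix64 is an illustrative helper
mirroring the FIX64(x, s) notation used in the comments):

    #include <stdio.h>

    static unsigned fix64(double x, int s)
    {
        return (unsigned)(x * (1 << s) + 0.5); /* x scaled to s fractional bits */
    }

    int main(void)
    {
        printf("0x%04X\n", fix64(1.082392200, 13)); /* 0x22A3 -> pw_22A3 */
        printf("0x%04X\n", fix64(1.414213562, 13)); /* 0x2D41 -> pw_2D41 */
        return 0;
    }
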
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
new file mode 100644
index 0000000000..8e00317cb7
--- /dev/null
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_fspp.h"
+
+void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+
+av_cold void ff_fspp_init_x86(FSPPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ s->store_slice = ff_store_slice_mmx;
+ s->store_slice2 = ff_store_slice2_mmx;
+ s->mul_thrmat = ff_mul_thrmat_mmx;
+ s->column_fidct = ff_column_fidct_mmx;
+ s->row_idct = ff_row_idct_mmx;
+ s->row_fdct = ff_row_fdct_mmx;
+ }
+}
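
Like the other _init files in this patch, this probes the CPU once and
installs function pointers that the filter then calls from its hot loop.
A minimal hedged sketch of the pattern (names are illustrative, not FFmpeg
API); when several branches match, the later, faster assignment simply
overwrites the earlier one:

    typedef struct {
        void (*row_fn)(short *dst, const unsigned char *src, int n);
    } DSP;

    static void row_fn_c(short *dst, const unsigned char *src, int n)    { /* scalar fallback */ }
    static void row_fn_simd(short *dst, const unsigned char *src, int n) { /* vector version */ }

    #define FLAG_SIMD 1 /* stand-in for an AV_CPU_FLAG_* bit */

    static void dsp_init(DSP *dsp, int cpu_flags)
    {
        dsp->row_fn = row_fn_c;        /* safe default installed by the filter */
        if (cpu_flags & FLAG_SIMD)
            dsp->row_fn = row_fn_simd; /* later checks override earlier ones */
    }
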
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
index 00fcb166fb..3581f89fe8 100644
--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* x86-optimized functions for gradfun filter
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavfilter/x86/vf_gradfun_init.c b/libavfilter/x86/vf_gradfun_init.c
index 3f23bf6799..c638a05e87 100644
--- a/libavfilter/x86/vf_gradfun_init.c
+++ b/libavfilter/x86/vf_gradfun_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,29 +26,29 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/gradfun.h"
-void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src,
- uint16_t *dc, int thresh,
+void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src,
+ const uint16_t *dc, int thresh,
const uint16_t *dithers);
-
-void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src,
- uint16_t *dc, int thresh,
+void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
+ const uint16_t *dc, int thresh,
const uint16_t *dithers);
void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf,
- uint16_t *buf1, uint16_t *dc,
- uint8_t *src1, uint8_t *src2);
+ const uint16_t *buf1, uint16_t *dc,
+ const uint8_t *src1, const uint8_t *src2);
void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf,
- uint16_t *buf1, uint16_t *dc,
- uint8_t *src1, uint8_t *src2);
+ const uint16_t *buf1, uint16_t *dc,
+ const uint8_t *src1, const uint8_t *src2);
#if HAVE_YASM
-static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc,
- int width, int thresh, const uint16_t *dithers,
- int alignment)
+static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src,
+ const uint16_t *dc,
+ int width, int thresh,
+ const uint16_t *dithers)
{
intptr_t x;
- if (width & alignment) {
- x = width & ~alignment;
+ if (width & 3) {
+ x = width & ~3;
ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2,
width - x, thresh, dithers);
width = x;
@@ -58,22 +58,25 @@ static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc,
thresh, dithers);
}
-static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc,
- int width, int thresh,
- const uint16_t *dithers)
-{
- gradfun_filter_line(dst, src, dc, width, thresh, dithers, 3);
-}
-
-static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
+static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
int width, int thresh,
const uint16_t *dithers)
{
- gradfun_filter_line(dst, src, dc, width, thresh, dithers, 7);
+ intptr_t x;
+ if (width & 7) {
+ // could be 10% faster if I somehow eliminated this
+ x = width & ~7;
+ ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2,
+ width - x, thresh, dithers);
+ width = x;
+ }
+ x = -width;
+ ff_gradfun_filter_line_ssse3(x, dst + width, src + width, dc + width / 2,
+ thresh, dithers);
}
-static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
- uint8_t *src, int src_linesize, int width)
+static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1,
+ const uint8_t *src, int src_linesize, int width)
{
intptr_t x = -2 * width;
if (((intptr_t) src | src_linesize) & 15)
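
The wrappers above use a common idiom: peel the misaligned tail off with
the scalar C version, then hand the asm a negative index alongside pointers
pre-advanced by width, so the inner loop needs only one add that doubles as
the termination test (the index climbs toward zero). A hedged C rendering
of that idiom (filter_line_neg_idx is an illustrative name):

    #include <stddef.h>
    #include <stdint.h>

    static void filter_line_neg_idx(uint8_t *dst, const uint8_t *src, int width)
    {
        /* the caller has already peeled the tail, so width % 4 == 0 here */
        intptr_t x = -(intptr_t)width;
        dst += width;
        src += width;
        for (; x < 0; x += 4) {      /* the asm steps by the SIMD width */
            dst[x + 0] = src[x + 0]; /* stand-ins for the real filtering */
            dst[x + 1] = src[x + 1];
            dst[x + 2] = src[x + 2];
            dst[x + 3] = src[x + 3];
        }
    }
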
diff --git a/libavfilter/x86/vf_hqdn3d.asm b/libavfilter/x86/vf_hqdn3d.asm
index 02632a1f09..e3b1bdca53 100644
--- a/libavfilter/x86/vf_hqdn3d.asm
+++ b/libavfilter/x86/vf_hqdn3d.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* Copyright (c) 2012 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -27,8 +27,8 @@ SECTION .text
%if lut_bits != 8
sar %1q, 8-lut_bits
%endif
- movsx %1d, word [%3q+%1q*2]
- add %1d, %2d
+ movsx %1q, word [%3q+%1q*2]
+ add %1q, %2q
%endmacro
%macro LOAD 3 ; dstreg, x, bitdepth
diff --git a/libavfilter/x86/vf_hqdn3d_init.c b/libavfilter/x86/vf_hqdn3d_init.c
index 06f9e00ec9..b63916b674 100644
--- a/libavfilter/x86/vf_hqdn3d_init.c
+++ b/libavfilter/x86/vf_hqdn3d_init.c
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2012 Loren Merritt
*
- * Libav is free software; you can redistribute it and/or modify
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
new file mode 100644
index 0000000000..007e63deb9
--- /dev/null
+++ b/libavfilter/x86/vf_idet.asm
@@ -0,0 +1,170 @@
+;*****************************************************************************
+;* x86-optimized functions for idet filter
+;*
+;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
+;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+; Implementation that processes 8 bytes at a time using 16-bit word operations.
+%macro IDET_FILTER_LINE 1
+INIT_MMX %1
+cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
+ xor indexq, indexq
+%define m_zero m2
+%define m_sum m5
+ pxor m_sum, m_sum
+ pxor m_zero, m_zero
+
+.loop:
+ movu m0, [aq + indexq*1]
+ punpckhbw m1, m0, m_zero
+ punpcklbw m0, m_zero
+
+ movu m3, [cq + indexq*1]
+ punpckhbw m4, m3, m_zero
+ punpcklbw m3, m_zero
+
+ paddsw m1, m4
+ paddsw m0, m3
+
+ movu m3, [bq + indexq*1]
+ punpckhbw m4, m3, m_zero
+ punpcklbw m3, m_zero
+
+ paddw m4, m4
+ paddw m3, m3
+ psubsw m1, m4
+ psubsw m0, m3
+
+ ABS2 m1, m0, m4, m3
+
+ paddw m0, m1
+ punpckhwd m1, m0, m_zero
+ punpcklwd m0, m_zero
+
+ paddd m0, m1
+ paddd m_sum, m0
+
+ add indexq, 0x8
+ CMP widthd, indexd
+ jg .loop
+
+ HADDD m_sum, m0
+ movd eax, m_sum
+ RET
+%endmacro
+
+%if ARCH_X86_32
+IDET_FILTER_LINE mmxext
+IDET_FILTER_LINE mmx
+%endif
+
+;******************************************************************************
+; 16bit implementation that does 4/8-pixels at a time
+
+%macro PABS_DIFF_WD 3 ; a, b, junk (output=a)
+ psubusw %3, %2, %1
+ psubusw %1, %2
+ por %1, %3
+
+ mova %2, %1
+ punpcklwd %1, m_zero
+ punpckhwd %2, m_zero
+ paddd %1, %2
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
+ xor indexq, indexq
+%define m_zero m1
+%define m_sum m0
+ pxor m_sum, m_sum
+ pxor m_zero, m_zero
+
+.loop_16bit:
+ movu m2, [bq + indexq * 2] ; B
+ movu m3, [aq + indexq * 2] ; A
+ mova m6, m2
+ psubusw m5, m2, m3 ; ba
+
+ movu m4, [cq + indexq * 2] ; C
+ add indexq, %1
+ psubusw m3, m2 ; ab
+ CMP indexd, widthd
+
+ psubusw m6, m4 ; bc
+ psubusw m4, m2 ; cb
+
+ PABS_DIFF_WD m3, m6, m7 ; |ab - bc|
+ PABS_DIFF_WD m5, m4, m7 ; |ba - cb|
+ paddd m_sum, m3
+ paddd m_sum, m5
+ jl .loop_16bit
+
+ HADDD m_sum, m2
+ movd eax, m_sum
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4
+%endif
+
+;******************************************************************************
+; SSE2 8-bit implementation that processes 16 bytes at a time:
+
+INIT_XMM sse2
+cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
+ xor indexq, indexq
+ pxor m0, m0
+ pxor m1, m1
+
+.sse2_loop:
+ movu m2, [bq + indexq*1] ; B
+ movu m3, [aq + indexq*1] ; A
+ mova m6, m2
+ mova m4, m3
+ psubusb m5, m2, m3 ; ba
+
+ movu m3, [cq + indexq*1] ; C
+ add indexq, 0x10
+ psubusb m4, m2 ; ab
+ CMP indexd, widthd
+
+ psubusb m6, m3 ; bc
+ psubusb m3, m2 ; cb
+
+ psadbw m4, m6 ; |ab - bc|
+ paddq m0, m4
+ psadbw m5, m3 ; |ba - cb|
+ paddq m1, m5
+ jl .sse2_loop
+
+ paddq m0, m1
+ movhlps m1, m0
+ paddq m0, m1
+ movd eax, m0
+ RET
diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c
new file mode 100644
index 0000000000..1147ca8ba8
--- /dev/null
+++ b/libavfilter/x86/vf_idet_init.c
@@ -0,0 +1,87 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_idet.h"
+
+#if HAVE_YASM
+
+/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
+#define FUNC_MAIN_DECL(KIND, SPAN) \
+int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
+ const uint8_t *c, int w); \
+static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
+ const uint8_t *c, int w) { \
+ int sum = 0; \
+ const int left_over = w & (SPAN - 1); \
+ w -= left_over; \
+ if (w > 0) \
+ sum += ff_idet_filter_line_##KIND(a, b, c, w); \
+ if (left_over > 0) \
+ sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); \
+ return sum; \
+}
+
+
+#define FUNC_MAIN_DECL_16bit(KIND, SPAN) \
+int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+ const uint16_t *c, int w); \
+static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+ const uint16_t *c, int w) { \
+ int sum = 0; \
+ const int left_over = w & (SPAN - 1); \
+ w -= left_over; \
+ if (w > 0) \
+ sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \
+ if (left_over > 0) \
+ sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \
+ return sum; \
+}
+
+FUNC_MAIN_DECL(sse2, 16)
+FUNC_MAIN_DECL_16bit(sse2, 8)
+#if ARCH_X86_32
+FUNC_MAIN_DECL(mmx, 8)
+FUNC_MAIN_DECL(mmxext, 8)
+FUNC_MAIN_DECL_16bit(mmx, 4)
+#endif
+
+#endif
+av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b)
+{
+#if HAVE_YASM
+ const int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpu_flags)) {
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx;
+ }
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext;
+ }
+#endif // ARCH_X86_32
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
+ }
+#endif // HAVE_YASM
+}
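
All of these variants accumulate the same temporal interlace metric; the
wrappers above only peel off whatever the SIMD span cannot cover and finish
it with the C version. A hedged scalar reference for the metric itself
(this mirrors what ff_idet_filter_line_c computes; the word-sized asm
builds each absolute value as sat(x-y) | sat(y-x) from unsigned saturating
subtracts):

    #include <stdint.h>
    #include <stdlib.h>

    /* a and c are the lines above and below, b the current line */
    static int idet_filter_line_ref(const uint8_t *a, const uint8_t *b,
                                    const uint8_t *c, int w)
    {
        int x, sum = 0;
        for (x = 0; x < w; x++)
            sum += abs(a[x] + c[x] - 2 * b[x]);
        return sum;
    }
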
diff --git a/libavfilter/x86/vf_interlace.asm b/libavfilter/x86/vf_interlace.asm
index 85811da8d1..f70c700965 100644
--- a/libavfilter/x86/vf_interlace.asm
+++ b/libavfilter/x86/vf_interlace.asm
@@ -4,20 +4,20 @@
;* Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
;* Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or modify
+;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
-;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************
@@ -37,7 +37,7 @@ cglobal lowpass_line, 5, 5, 7
pcmpeqb m6, m6
-.loop
+.loop:
mova m0, [r3+r1]
mova m1, [r3+r1+mmsize]
pavgb m0, [r4+r1]
diff --git a/libavfilter/x86/vf_interlace_init.c b/libavfilter/x86/vf_interlace_init.c
index 231ab85a1c..68ee47d9bc 100644
--- a/libavfilter/x86/vf_interlace_init.c
+++ b/libavfilter/x86/vf_interlace_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or modify
+ * FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
diff --git a/libavfilter/x86/vf_noise.c b/libavfilter/x86/vf_noise.c
new file mode 100644
index 0000000000..0a86cb084b
--- /dev/null
+++ b/libavfilter/x86/vf_noise.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_noise.h"
+
+#if HAVE_INLINE_ASM
+static void line_noise_mmx(uint8_t *dst, const uint8_t *src,
+ const int8_t *noise, int len, int shift)
+{
+ x86_reg mmx_len= len & (~7);
+ noise += shift;
+
+ __asm__ volatile(
+ "mov %3, %%"REG_a" \n\t"
+ "pcmpeqb %%mm7, %%mm7 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "packsswb %%mm7, %%mm7 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "paddsb %%mm1, %%mm0 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ :: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len)
+ : "%"REG_a
+ );
+ if (mmx_len != len)
+ ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0);
+}
+
+#if HAVE_6REGS
+static void line_noise_avg_mmx(uint8_t *dst, const uint8_t *src,
+ int len, const int8_t * const *shift)
+{
+ x86_reg mmx_len = len & (~7);
+
+ __asm__ volatile(
+ "mov %5, %%"REG_a" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "paddb (%2, %%"REG_a"), %%mm1 \n\t"
+ "paddb (%3, %%"REG_a"), %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpckhbw %%mm2, %%mm2 \n\t"
+ "punpcklbw %%mm1, %%mm1 \n\t"
+ "punpckhbw %%mm3, %%mm3 \n\t"
+ "pmulhw %%mm0, %%mm1 \n\t"
+ "pmulhw %%mm2, %%mm3 \n\t"
+ "paddw %%mm1, %%mm1 \n\t"
+ "paddw %%mm3, %%mm3 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "paddw %%mm2, %%mm3 \n\t"
+ "psrlw $8, %%mm1 \n\t"
+ "psrlw $8, %%mm3 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ "movq %%mm1, (%4, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ :: "r" (src+mmx_len), "r" (shift[0]+mmx_len), "r" (shift[1]+mmx_len), "r" (shift[2]+mmx_len),
+ "r" (dst+mmx_len), "g" (-mmx_len)
+ : "%"REG_a
+ );
+
+ if (mmx_len != len){
+ const int8_t *shift2[3] = { shift[0]+mmx_len, shift[1]+mmx_len, shift[2]+mmx_len };
+ ff_line_noise_avg_c(dst+mmx_len, src+mmx_len, len-mmx_len, shift2);
+ }
+}
+#endif /* HAVE_6REGS */
+
+static void line_noise_mmxext(uint8_t *dst, const uint8_t *src,
+ const int8_t *noise, int len, int shift)
+{
+ x86_reg mmx_len = len & (~7);
+ noise += shift;
+
+ __asm__ volatile(
+ "mov %3, %%"REG_a" \n\t"
+ "pcmpeqb %%mm7, %%mm7 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "packsswb %%mm7, %%mm7 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "paddsb %%mm1, %%mm0 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "movntq %%mm0, (%2, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ :: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len)
+ : "%"REG_a
+ );
+ if (mmx_len != len)
+ ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0);
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_noise_init_x86(NoiseContext *n)
+{
+#if HAVE_INLINE_ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (INLINE_MMX(cpu_flags)) {
+ n->line_noise = line_noise_mmx;
+#if HAVE_6REGS
+ n->line_noise_avg = line_noise_avg_mmx;
+#endif
+ }
+ if (INLINE_MMXEXT(cpu_flags)) {
+ n->line_noise = line_noise_mmxext;
+ }
+#endif
+}
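
The pxor/paddsb/pxor triple in the loops above is a classic trick for
adding signed noise to unsigned pixels with saturation: XOR with 0x80 maps
unsigned [0,255] onto signed [-128,127], the signed saturating add then
clamps at what were 0 and 255, and the second XOR maps back. A scalar
sketch of what one byte lane computes (illustrative helper name):

    #include <stdint.h>

    static uint8_t add_noise_sat(uint8_t pix, int8_t noise)
    {
        int v = pix + noise;
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }
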
diff --git a/libavfilter/x86/vf_pp7.asm b/libavfilter/x86/vf_pp7.asm
new file mode 100644
index 0000000000..7b3e5cf5e3
--- /dev/null
+++ b/libavfilter/x86/vf_pp7.asm
@@ -0,0 +1,57 @@
+;*****************************************************************************
+;* x86-optimized functions for pp7 filter
+;*
+;* Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_MMX mmx
+
+;void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src)
+cglobal pp7_dctB, 2, 2, 0, dst, src
+ movq m0, [srcq]
+ movq m1, [srcq+mmsize*1]
+ paddw m0, [srcq+mmsize*6]
+ paddw m1, [srcq+mmsize*5]
+ movq m2, [srcq+mmsize*2]
+ movq m3, [srcq+mmsize*3]
+ paddw m2, [srcq+mmsize*4]
+ paddw m3, m3
+ movq m4, m3
+ psubw m3, m0
+ paddw m4, m0
+ movq m0, m2
+ psubw m2, m1
+ paddw m0, m1
+ movq m1, m4
+ psubw m4, m0
+ paddw m1, m0
+ movq m0, m3
+ psubw m3, m2
+ psubw m3, m2
+ paddw m2, m0
+ paddw m2, m0
+ movq [dstq], m1
+ movq [dstq+mmsize*2], m4
+ movq [dstq+mmsize*1], m2
+ movq [dstq+mmsize*3], m3
+ RET
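
The register flow above is a butterfly over seven taps, run on four 16-bit
lanes at once. Transcribed to hedged scalar C for a single lane
(pp7_dctB_ref is an illustrative name; note the MMX paddw/psubw wrap
rather than widen, which this sketch ignores):

    #include <stdint.h>

    static void pp7_dctB_ref(int16_t dst[4], const int16_t src[7])
    {
        int b0 = src[0] + src[6];
        int b1 = src[1] + src[5];
        int b2 = src[2] + src[4];
        int b3 = 2 * src[3];

        dst[0] = (b3 + b0) + (b2 + b1);
        dst[2] = (b3 + b0) - (b2 + b1);
        dst[1] = 2 * (b3 - b0) + (b2 - b1);
        dst[3] = (b3 - b0) - 2 * (b2 - b1);
    }
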
diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c
new file mode 100644
index 0000000000..165b0dd5d0
--- /dev/null
+++ b/libavfilter/x86/vf_pp7_init.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_pp7.h"
+
+void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src);
+
+av_cold void ff_pp7_init_x86(PP7Context *p)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags))
+ p->dctB = ff_pp7_dctB_mmx;
+}
diff --git a/libavfilter/x86/vf_psnr.asm b/libavfilter/x86/vf_psnr.asm
new file mode 100644
index 0000000000..ef88d6f694
--- /dev/null
+++ b/libavfilter/x86/vf_psnr.asm
@@ -0,0 +1,140 @@
+;*****************************************************************************
+;* x86-optimized functions for psnr filter
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSE_LINE_FN 2 ; 8 or 16, byte or word
+INIT_XMM sse2
+%if ARCH_X86_32
+%if %1 == 8
+cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref
+%else
+cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref
+%endif
+ mov bufq, r0mp
+ mov refq, r1mp
+ mov wd, r2m
+%else
+cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2
+%endif
+ pxor m6, m6
+ pxor m7, m7
+ sub wd, mmsize*2
+ jl .end
+
+.loop:
+ movu m0, [bufq+mmsize*0]
+ movu m1, [bufq+mmsize*1]
+ movu m2, [refq+mmsize*0]
+ movu m3, [refq+mmsize*1]
+%if %1 == 8
+ add bufq, mmsize*2
+ add refq, mmsize*2
+ psubusb m4, m0, m2
+ psubusb m5, m1, m3
+ psubusb m2, m0
+ psubusb m3, m1
+ por m2, m4
+ por m3, m5
+ punpcklbw m0, m2, m6
+ punpcklbw m1, m3, m6
+ punpckhbw m2, m6
+ punpckhbw m3, m6
+%else
+ psubw m0, m2
+ psubw m1, m3
+ movu m2, [bufq+mmsize*2]
+ movu m3, [bufq+mmsize*3]
+ movu m4, [refq+mmsize*2]
+ movu m5, [refq+mmsize*3]
+ psubw m2, m4
+ psubw m3, m5
+ add bufq, mmsize*4
+ add refq, mmsize*4
+%endif
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ paddd m0, m1
+ paddd m2, m3
+%if %1 == 8
+ paddd m7, m0
+ paddd m7, m2
+%else
+ paddd m0, m2
+ punpckldq m2, m0, m6
+ punpckhdq m0, m6
+ paddq m7, m0
+ paddq m7, m2
+%endif
+ sub wd, mmsize*2
+ jge .loop
+
+.end:
+ add wd, mmsize*2
+ movhlps m0, m7
+%if %1 == 8
+ paddd m7, m0
+ pshufd m0, m7, 1
+ paddd m7, m0
+ movd eax, m7
+%else
+ paddq m7, m0
+%if ARCH_X86_32
+ movd eax, m7
+ psrldq m7, 4
+ movd edx, m7
+%else
+ movq rax, m7
+%endif
+%endif
+
+ ; deal with cases where w % 32 != 0
+ test wd, wd
+ jz .end_scalar
+.loop_scalar:
+ movzx px1d, %2 [bufq+wq*(%1/8)-(%1/8)]
+ movzx px2d, %2 [refq+wq*(%1/8)-(%1/8)]
+ sub px1d, px2d
+ imul px1d, px1d
+%if %1 == 8
+ add eax, px1d
+%elif ARCH_X86_64
+ add rax, px1q
+%else
+ add eax, px1d
+ adc edx, 0
+%endif
+ dec wd
+ jg .loop_scalar
+
+.end_scalar:
+ ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero
+ RET
+%endmacro
+
+INIT_XMM sse2
+SSE_LINE_FN 8, byte
+SSE_LINE_FN 16, word
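
Both variants accumulate a per-line sum of squared differences; the vector
loop handles 32 samples per iteration and the .loop_scalar tail finishes
the remainder. A hedged scalar reference for the 8-bit case (the signature
follows the prototypes declared in the init file below):

    #include <stdint.h>

    static uint64_t sse_line_8bit_ref(const uint8_t *buf, const uint8_t *ref, int w)
    {
        uint64_t sum = 0;
        int x;
        for (x = 0; x < w; x++) {
            int d = buf[x] - ref[x];
            sum += (uint64_t)(d * d);
        }
        return sum;
    }
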
diff --git a/libavfilter/x86/vf_psnr_init.c b/libavfilter/x86/vf_psnr_init.c
new file mode 100644
index 0000000000..c387812204
--- /dev/null
+++ b/libavfilter/x86/vf_psnr_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/psnr.h"
+
+uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
+uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
+
+void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ if (bpp <= 8) {
+ dsp->sse_line = ff_sse_line_8bit_sse2;
+ } else if (bpp <= 15) {
+ dsp->sse_line = ff_sse_line_16bit_sse2;
+ }
+ }
+}
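
For reference, the per-line sum of squared errors that the SSE2 kernels above
vectorize reduces to the following scalar loop (an illustrative sketch, not
part of the patch; the asm additionally handles the w % 32 tail and 64-bit
accumulation for 16-bit samples):

    #include <stdint.h>

    static uint64_t sse_line_8bit_c(const uint8_t *buf, const uint8_t *ref, int w)
    {
        uint64_t sum = 0;
        for (int x = 0; x < w; x++) {
            int d = buf[x] - ref[x];  /* signed per-pixel difference */
            sum += d * d;             /* accumulate squared error */
        }
        return sum;
    }
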
diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm
new file mode 100644
index 0000000000..d3a195511e
--- /dev/null
+++ b/libavfilter/x86/vf_pullup.asm
@@ -0,0 +1,178 @@
+;*****************************************************************************
+;* x86-optimized functions for pullup filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+INIT_MMX mmx
+cglobal pullup_filter_diff, 3, 5, 8, first, second, size
+ mov r3, 4
+ pxor m4, m4
+ pxor m7, m7
+
+.loop:
+ movq m0, [firstq]
+ movq m2, [firstq]
+ add firstq, sizeq
+ movq m1, [secondq]
+ add secondq, sizeq
+ psubusb m2, m1
+ psubusb m1, m0
+ movq m0, m2
+ movq m3, m1
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ paddw m4, m0
+ paddw m4, m1
+ paddw m4, m2
+ paddw m4, m3
+
+ dec r3
+ jnz .loop
+
+ movq m3, m4
+ punpcklwd m4, m7
+ punpckhwd m3, m7
+ paddd m3, m4
+ movd eax, m3
+ psrlq m3, 32
+ movd r4d, m3
+ add eax, r4d
+ RET
+
+INIT_MMX mmx
+cglobal pullup_filter_comb, 3, 5, 8, first, second, size
+ mov r3, 4
+ pxor m6, m6
+ pxor m7, m7
+ sub secondq, sizeq
+
+.loop:
+ movq m0, [firstq]
+ movq m1, [secondq]
+ punpcklbw m0, m7
+ movq m2, [secondq+sizeq]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ movq m0, [firstq]
+ movq m1, [secondq]
+ punpckhbw m0, m7
+ movq m2, [secondq+sizeq]
+ punpckhbw m1, m7
+ punpckhbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ movq m0, [secondq+sizeq]
+ movq m1, [firstq]
+ punpcklbw m0, m7
+ movq m2, [firstq+sizeq]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ movq m0, [secondq+sizeq]
+ movq m1, [firstq]
+ punpckhbw m0, m7
+ movq m2, [firstq+sizeq]
+ punpckhbw m1, m7
+ punpckhbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ add firstq, sizeq
+ add secondq, sizeq
+ dec r3
+ jnz .loop
+
+ movq m5, m6
+ punpcklwd m6, m7
+ punpckhwd m5, m7
+ paddd m5, m6
+ movd eax, m5
+ psrlq m5, 32
+ movd r4d, m5
+ add eax, r4d
+ RET
+
+INIT_MMX mmx
+cglobal pullup_filter_var, 3, 5, 8, first, second, size
+ mov r3, 3
+ pxor m4, m4
+ pxor m7, m7
+
+.loop:
+ movq m0, [firstq]
+ movq m2, [firstq]
+ movq m1, [firstq+sizeq]
+ add firstq, sizeq
+ psubusb m2, m1
+ psubusb m1, m0
+ movq m0, m2
+ movq m3, m1
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ paddw m4, m0
+ paddw m4, m1
+ paddw m4, m2
+ paddw m4, m3
+
+ dec r3
+ jnz .loop
+
+ movq m3, m4
+ punpcklwd m4, m7
+ punpckhwd m3, m7
+ paddd m3, m4
+ movd eax, m3
+ psrlq m3, 32
+ movd r4d, m3
+ add eax, r4d
+ shl eax, 2
+ RET
diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c
new file mode 100644
index 0000000000..5b36b68e51
--- /dev/null
+++ b/libavfilter/x86/vf_pullup_init.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_pullup.h"
+
+int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+
+av_cold void ff_pullup_init_x86(PullupContext *s)
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ s->diff = ff_pullup_filter_diff_mmx;
+ s->comb = ff_pullup_filter_comb_mmx;
+ s->var = ff_pullup_filter_var_mmx;
+ }
+#endif
+}
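
As a scalar model, the MMX diff metric above sums absolute differences over
four lines of eight pixels, with the plane stride passed as the third
argument (an illustrative sketch; comb and var walk the same block shape):

    #include <stdint.h>
    #include <stddef.h>

    static int pullup_diff_c(const uint8_t *a, const uint8_t *b, ptrdiff_t s)
    {
        int sum = 0;
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 8; x++) {
                int d = a[x] - b[x];
                sum += d < 0 ? -d : d;  /* |a - b| per pixel */
            }
            a += s;  /* advance one line in each field */
            b += s;
        }
        return sum;
    }
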
diff --git a/libavfilter/x86/vf_removegrain.asm b/libavfilter/x86/vf_removegrain.asm
new file mode 100644
index 0000000000..c09f89ea30
--- /dev/null
+++ b/libavfilter/x86/vf_removegrain.asm
@@ -0,0 +1,1218 @@
+;*****************************************************************************
+;* x86-optimized functions for removegrain filter
+;*
+;* Copyright (C) 2015 James Darnley
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;*****************************************************************************
+
+; column: -1 0 +1
+; row -1: a1 a2 a3
+; row 0: a4 c a5
+; row +1: a6 a7 a8
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_4: times 16 dw 4
+pw_8: times 16 dw 8
+pw_div9: times 16 dw ((1<<16)+4)/9
+
+SECTION_TEXT
+
+;*** Preprocessor helpers
+
+%define a1 srcq+stride_n-1
+%define a2 srcq+stride_n
+%define a3 srcq+stride_n+1
+%define a4 srcq-1
+%define c srcq
+%define a5 srcq+1
+%define a6 srcq+stride_p-1
+%define a7 srcq+stride_p
+%define a8 srcq+stride_p+1
+
+; %1 dest simd register
+; %2 source memory location
+; %3 zero location (simd register/memory)
+%macro LOAD 3
+ movh %1, %2
+ punpcklbw %1, %3
+%endmacro
+
+%macro LOAD_SQUARE 0
+ movu m1, [a1]
+ movu m2, [a2]
+ movu m3, [a3]
+ movu m4, [a4]
+ movu m0, [c]
+ movu m5, [a5]
+ movu m6, [a6]
+ movu m7, [a7]
+ movu m8, [a8]
+%endmacro
+
+; %1 zero location (simd register/memory)
+%macro LOAD_SQUARE_16 1
+ LOAD m1, [a1], %1
+ LOAD m2, [a2], %1
+ LOAD m3, [a3], %1
+ LOAD m4, [a4], %1
+ LOAD m0, [c], %1
+ LOAD m5, [a5], %1
+ LOAD m6, [a6], %1
+ LOAD m7, [a7], %1
+ LOAD m8, [a8], %1
+%endmacro
+
+; %1 data type
+; %2 simd register to hold maximums
+; %3 simd register to hold minimums
+; %4 temp location (simd register/memory)
+%macro SORT_PAIR 4
+ mova %4, %2
+ pmin%1 %2, %3
+ pmax%1 %3, %4
+%endmacro
+
+%macro SORT_AXIS 0
+ SORT_PAIR ub, m1, m8, m9
+ SORT_PAIR ub, m2, m7, m10
+ SORT_PAIR ub, m3, m6, m11
+ SORT_PAIR ub, m4, m5, m12
+%endmacro
+
+
+%macro SORT_AXIS_16 0
+ SORT_PAIR sw, m1, m8, m9
+ SORT_PAIR sw, m2, m7, m10
+ SORT_PAIR sw, m3, m6, m11
+ SORT_PAIR sw, m4, m5, m12
+%endmacro
+
+; The loop doesn't need to do all the iterations. It could stop when the right
+; pixels are in the right registers.
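+; As written, the nested %rep loops below expand to a full bubble-sort network
+; over m1..m8, leaving the eight neighbour pixels sorted with m1 holding the
+; minimum and m8 the maximum.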
+%macro SORT_SQUARE 0
+ %assign k 7
+ %rep 7
+ %assign i 1
+ %assign j 2
+ %rep k
+ SORT_PAIR ub, m %+ i , m %+ j , m9
+ %assign i i+1
+ %assign j j+1
+ %endrep
+ %assign k k-1
+ %endrep
+%endmacro
+
+; %1 dest simd register
+; %2 source (simd register/memory)
+; %3 temp simd register
+%macro ABS_DIFF 3
+ mova %3, %2
+ psubusb %3, %1
+ psubusb %1, %2
+ por %1, %3
+%endmacro
+
+; %1 dest simd register
+; %2 source (simd register/memory)
+; %3 temp simd register
+%macro ABS_DIFF_W 3
+ mova %3, %2
+ psubusw %3, %1
+ psubusw %1, %2
+ por %1, %3
+%endmacro
+
+; %1 simd register that holds the "false" values and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location (simd register/memory) that holds the mask
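+; In scalar terms the SSE2 path computes result = (mask & true) | (~mask & false),
+; a per-byte select; the AVX2 path does the same with a single vpblendvb.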
+%macro BLEND 3
+%if cpuflag(avx2)
+ vpblendvb %1, %1, %2, %3
+%else
+ pand %2, %3
+ pandn %3, %1
+ por %3, %2
+ SWAP %1, %3
+%endif
+%endmacro
+
+; Functions
+
+INIT_XMM sse2
+cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ movu m0, [a1]
+ mova m1, m0
+
+ movu m2, [a2]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [a3]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [a4]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [a5]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [a6]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [a7]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [a8]
+ pmaxub m0, m2
+ pminub m1, m2
+
+ movu m2, [c]
+ pminub m2, m0
+ pmaxub m2, m1
+
+ movu [dstq], m2
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+%if ARCH_X86_64
+cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+ SORT_SQUARE
+
+ CLIPUB m0, m2, m7
+
+ movu [dstq], m0
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+ SORT_SQUARE
+
+ CLIPUB m0, m3, m6
+
+ movu [dstq], m0
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+ SORT_SQUARE
+
+ CLIPUB m0, m4, m5
+
+ movu [dstq], m0
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+ SORT_AXIS
+
+ mova m9, m0
+ mova m10, m0
+ mova m11, m0
+ mova m12, m0
+
+ CLIPUB m9, m1, m8
+ CLIPUB m10, m2, m7
+ CLIPUB m11, m3, m6
+ CLIPUB m12, m4, m5
+
+ mova m8, m9 ; clip1
+ mova m7, m10 ; clip2
+ mova m6, m11 ; clip3
+ mova m5, m12 ; clip4
+
+ ABS_DIFF m9, m0, m1 ; c1
+ ABS_DIFF m10, m0, m2 ; c2
+ ABS_DIFF m11, m0, m3 ; c3
+ ABS_DIFF m12, m0, m4 ; c4
+
+ pminub m9, m10
+ pminub m9, m11
+ pminub m9, m12 ; mindiff
+
+ pcmpeqb m10, m9
+ pcmpeqb m11, m9
+ pcmpeqb m12, m9
+
+ ; Notice the order here: c1, c3, c2, c4
+ BLEND m8, m6, m11
+ BLEND m8, m7, m10
+ BLEND m8, m5, m12
+
+ movu [dstq], m8
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ ; Some register saving suggestions: the zero can be somewhere other than a
+ ; register, the center pixels could be on the stack.
+
+ pxor m15, m15
+ .loop:
+ LOAD_SQUARE_16 m15
+ SORT_AXIS_16
+
+ mova m9, m0
+ mova m10, m0
+ mova m11, m0
+ mova m12, m0
+ CLIPW m9, m1, m8 ; clip1
+ CLIPW m10, m2, m7 ; clip2
+ CLIPW m11, m3, m6 ; clip3
+ CLIPW m12, m4, m5 ; clip4
+
+ psubw m8, m1 ; d1
+ psubw m7, m2 ; d2
+ psubw m6, m3 ; d3
+ psubw m5, m4 ; d4
+
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ mova m4, m12
+ ABS_DIFF_W m1, m0, m13
+ ABS_DIFF_W m2, m0, m14
+ ABS_DIFF_W m3, m0, m13
+ ABS_DIFF_W m4, m0, m14
+ psllw m1, 1
+ psllw m2, 1
+ psllw m3, 1
+ psllw m4, 1
+ paddw m1, m8 ; c1
+ paddw m2, m7 ; c2
+ paddw m3, m6 ; c3
+ paddw m4, m5 ; c4
+ ; As the differences (d1..d4) can only be positive, there is no need to
+ ; clip to zero. Also, the maximum positive value is less than 768.
+
+ pminsw m1, m2
+ pminsw m1, m3
+ pminsw m1, m4
+
+ pcmpeqw m2, m1
+ pcmpeqw m3, m1
+ pcmpeqw m4, m1
+
+ BLEND m9, m11, m3
+ BLEND m9, m10, m2
+ BLEND m9, m12, m4
+ packuswb m9, m9
+
+ movh [dstq], m9
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+; This is just copy-pasted straight from mode 6 with the left shifts removed.
+cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ ; Can this be done without unpacking?
+
+ pxor m15, m15
+ .loop:
+ LOAD_SQUARE_16 m15
+ SORT_AXIS_16
+
+ mova m9, m0
+ mova m10, m0
+ mova m11, m0
+ mova m12, m0
+ CLIPW m9, m1, m8 ; clip1
+ CLIPW m10, m2, m7 ; clip2
+ CLIPW m11, m3, m6 ; clip3
+ CLIPW m12, m4, m5 ; clip4
+
+ psubw m8, m1 ; d1
+ psubw m7, m2 ; d2
+ psubw m6, m3 ; d3
+ psubw m5, m4 ; d4
+
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ mova m4, m12
+ ABS_DIFF_W m1, m0, m13
+ ABS_DIFF_W m2, m0, m14
+ ABS_DIFF_W m3, m0, m13
+ ABS_DIFF_W m4, m0, m14
+ paddw m1, m8 ; c1
+ paddw m2, m7 ; c2
+ paddw m3, m6 ; c3
+ paddw m4, m5 ; c4
+
+ pminsw m1, m2
+ pminsw m1, m3
+ pminsw m1, m4
+
+ pcmpeqw m2, m1
+ pcmpeqw m3, m1
+ pcmpeqw m4, m1
+
+ BLEND m9, m11, m3
+ BLEND m9, m10, m2
+ BLEND m9, m12, m4
+ packuswb m9, m9
+
+ movh [dstq], m9
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+; This is just copy-pasted straight from mode 6 with a few changes.
+cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m15, m15
+ .loop:
+ LOAD_SQUARE_16 m15
+ SORT_AXIS_16
+
+ mova m9, m0
+ mova m10, m0
+ mova m11, m0
+ mova m12, m0
+ CLIPW m9, m1, m8 ; clip1
+ CLIPW m10, m2, m7 ; clip2
+ CLIPW m11, m3, m6 ; clip3
+ CLIPW m12, m4, m5 ; clip4
+
+ psubw m8, m1 ; d1
+ psubw m7, m2 ; d2
+ psubw m6, m3 ; d3
+ psubw m5, m4 ; d4
+ psllw m8, 1
+ psllw m7, 1
+ psllw m6, 1
+ psllw m5, 1
+
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+ mova m4, m12
+ ABS_DIFF_W m1, m0, m13
+ ABS_DIFF_W m2, m0, m14
+ ABS_DIFF_W m3, m0, m13
+ ABS_DIFF_W m4, m0, m14
+ paddw m1, m8 ; c1
+ paddw m2, m7 ; c2
+ paddw m3, m6 ; c3
+ paddw m4, m5 ; c4
+ ; As the differences (d1..d4) can only be positive, there is no need to
+ ; clip to zero. Also, the maximum positive value is less than 768.
+
+ pminsw m1, m2
+ pminsw m1, m3
+ pminsw m1, m4
+
+ pcmpeqw m2, m1
+ pcmpeqw m3, m1
+ pcmpeqw m4, m1
+
+ BLEND m9, m11, m3
+ BLEND m9, m10, m2
+ BLEND m9, m12, m4
+ packuswb m9, m9
+
+ movh [dstq], m9
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+ SORT_AXIS
+
+ mova m9, m0
+ mova m10, m0
+ mova m11, m0
+ mova m12, m0
+ CLIPUB m9, m1, m8 ; clip1
+ CLIPUB m10, m2, m7 ; clip2
+ CLIPUB m11, m3, m6 ; clip3
+ CLIPUB m12, m4, m5 ; clip4
+
+ psubb m8, m1 ; d1
+ psubb m7, m2 ; d2
+ psubb m6, m3 ; d3
+ psubb m5, m4 ; d4
+
+ pminub m8, m7
+ pminub m8, m6
+ pminub m8, m5
+
+ pcmpeqb m7, m8
+ pcmpeqb m6, m8
+ pcmpeqb m5, m8
+
+ BLEND m9, m11, m6
+ BLEND m9, m10, m7
+ BLEND m9, m12, m5
+
+ movu [dstq], m9
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+%endif
+
+cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ movu m0, [c]
+
+ movu m1, [a4]
+ mova m2, m1
+ ABS_DIFF m1, m0, m7
+
+ movu m3, [a5] ; load pixel
+ mova m4, m3
+ ABS_DIFF m4, m0, m7 ; absolute difference from center
+ pminub m1, m4 ; mindiff
+ pcmpeqb m4, m1 ; if (difference == mindiff)
+ BLEND m2, m3, m4 ; return pixel
+
+ movu m5, [a1]
+ mova m6, m5
+ ABS_DIFF m6, m0, m7
+ pminub m1, m6
+ pcmpeqb m6, m1
+ BLEND m2, m5, m6
+
+ movu m3, [a3]
+ mova m4, m3
+ ABS_DIFF m4, m0, m7
+ pminub m1, m4
+ pcmpeqb m4, m1
+ BLEND m2, m3, m4
+
+ movu m5, [a2]
+ mova m6, m5
+ ABS_DIFF m6, m0, m7
+ pminub m1, m6
+ pcmpeqb m6, m1
+ BLEND m2, m5, m6
+
+ movu m3, [a6]
+ mova m4, m3
+ ABS_DIFF m4, m0, m7
+ pminub m1, m4
+ pcmpeqb m4, m1
+ BLEND m2, m3, m4
+
+ movu m5, [a8]
+ mova m6, m5
+ ABS_DIFF m6, m0, m7
+ pminub m1, m6
+ pcmpeqb m6, m1
+ BLEND m2, m5, m6
+
+ movu m3, [a7]
+ mova m4, m3
+ ABS_DIFF m4, m0, m7
+ pminub m1, m4
+ pcmpeqb m4, m1
+ BLEND m2, m3, m4
+
+ movu [dstq], m2
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m0, m0
+ .loop:
+ LOAD m1, [c], m0
+ LOAD m2, [a2], m0
+ LOAD m3, [a4], m0
+ LOAD m4, [a5], m0
+ LOAD m5, [a7], m0
+
+ psllw m1, 2
+ paddw m2, m3
+ paddw m4, m5
+ paddw m2, m4
+ psllw m2, 1
+
+ LOAD m3, [a1], m0
+ LOAD m4, [a3], m0
+ LOAD m5, [a6], m0
+ LOAD m6, [a8], m0
+ paddw m1, m2
+ paddw m3, m4
+ paddw m5, m6
+ paddw m1, m3
+ paddw m1, m5
+
+ paddw m1, [pw_8]
+ psraw m1, 4
+
+ packuswb m1, m1
+
+ movh [dstq], m1
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ movu m1, [a1]
+ movu m2, [a8]
+ mova m0, m1
+ pavgb m1, m2
+ ABS_DIFF m0, m2, m6
+
+ movu m3, [a3]
+ movu m4, [a6]
+ mova m5, m3
+ pavgb m3, m4
+ ABS_DIFF m5, m4, m7
+ pminub m0, m5
+ pcmpeqb m5, m0
+ BLEND m1, m3, m5
+
+ movu m2, [a2]
+ movu m3, [a7]
+ mova m4, m2
+ pavgb m2, m3
+ ABS_DIFF m4, m3, m6
+ pminub m0, m4
+ pcmpeqb m4, m0
+ BLEND m1, m2, m4
+
+ movu [dstq], m1
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+%if ARCH_X86_64
+cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m15, m15
+ .loop:
+ LOAD_SQUARE_16 m15
+
+ mova m9, m1
+ mova m10, m2
+ mova m11, m3
+ ABS_DIFF_W m9, m8, m12
+ ABS_DIFF_W m10, m7, m13
+ ABS_DIFF_W m11, m6, m14
+ pminsw m9, m10
+ pminsw m9, m11
+ pcmpeqw m10, m9
+ pcmpeqw m11, m9
+
+ mova m12, m2
+ mova m13, m1
+ mova m14, m6
+ paddw m12, m7
+ psllw m12, 1
+ paddw m13, m3
+ paddw m14, m8
+ paddw m12, [pw_4]
+ paddw m13, m14
+ paddw m12, m13
+ psrlw m12, 3
+
+ SORT_PAIR ub, m1, m8, m0
+ SORT_PAIR ub, m2, m7, m9
+ SORT_PAIR ub, m3, m6, m14
+ mova m4, m12
+ mova m5, m12
+ CLIPW m4, m1, m8
+ CLIPW m5, m2, m7
+ CLIPW m12, m3, m6
+
+ BLEND m4, m12, m11
+ BLEND m4, m5, m10
+ packuswb m4, m4
+
+ movh [dstq], m4
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+ SORT_AXIS
+
+ pmaxub m1, m2
+ pmaxub m3, m4
+
+ pminub m8, m7
+ pminub m5, m6
+
+ pmaxub m1, m3
+ pminub m8, m5
+
+ mova m2, m1
+ pminub m1, m8
+ pmaxub m8, m2
+
+ CLIPUB m0, m1, m8
+
+ movu [dstq], m0
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ LOAD_SQUARE
+
+ mova m9, m1
+ mova m10, m8
+ ABS_DIFF m9, m0, m11
+ ABS_DIFF m10, m0, m12
+ pmaxub m9, m10 ; m9 = d1
+
+ mova m10, m2
+ mova m11, m7
+ ABS_DIFF m10, m0, m12
+ ABS_DIFF m11, m0, m13
+ pmaxub m10, m11 ; m10 = d2
+
+ mova m11, m3
+ mova m12, m6
+ ABS_DIFF m11, m0, m13
+ ABS_DIFF m12, m0, m14
+ pmaxub m11, m12 ; m11 = d3
+
+ mova m12, m4
+ mova m13, m5
+ ABS_DIFF m12, m0, m14
+ ABS_DIFF m13, m0, m15
+ pmaxub m12, m13 ; m12 = d4
+
+ mova m13, m9
+ pminub m13, m10
+ pminub m13, m11
+ pminub m13, m12 ; m13 = mindiff
+
+ pcmpeqb m10, m13
+ pcmpeqb m11, m13
+ pcmpeqb m12, m13
+
+ mova m13, m0
+ mova m14, m1
+ pminub m1, m8
+ pmaxub m8, m14
+ CLIPUB m13, m1, m8 ; m13 = ret...d1
+
+ mova m14, m0
+ mova m15, m3
+ pminub m3, m6
+ pmaxub m6, m15
+ CLIPUB m14, m3, m6
+ pand m14, m11
+ pandn m11, m13
+ por m14, m11 ; m14 = ret...d3
+
+ mova m15, m0
+ mova m1, m2
+ pminub m2, m7
+ pmaxub m7, m1
+ CLIPUB m15, m2, m7
+ pand m15, m10
+ pandn m10, m14
+ por m15, m10 ; m15 = ret...d2
+
+ mova m1, m0
+ mova m2, m4
+ pminub m4, m5
+ pmaxub m5, m2
+ CLIPUB m1, m4, m5
+ pand m1, m12
+ pandn m12, m15
+ por m1, m12 ; m1 = ret...d4
+
+ movu [dstq], m1
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+%endif
+
+cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m0, m0
+ .loop:
+ LOAD m1, [a1], m0
+ LOAD m2, [a2], m0
+ paddw m1, m2
+
+ LOAD m3, [a3], m0
+ LOAD m4, [a4], m0
+ paddw m3, m4
+
+ LOAD m5, [a5], m0
+ LOAD m6, [a6], m0
+ paddw m5, m6
+
+ LOAD m2, [a7], m0
+ LOAD m4, [a8], m0
+ paddw m2, m4
+
+ paddw m1, m3
+ paddw m2, m5
+ paddw m1, m2
+
+ paddw m1, [pw_4]
+ psraw m1, 3
+
+ packuswb m1, m1
+
+ movh [dstq], m1
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m0, m0
+ .loop:
+ LOAD m1, [a1], m0
+ LOAD m2, [a2], m0
+ paddw m1, m2
+
+ LOAD m3, [a3], m0
+ LOAD m4, [a4], m0
+ paddw m3, m4
+
+ LOAD m5, [a5], m0
+ LOAD m6, [a6], m0
+ paddw m5, m6
+
+ LOAD m2, [a7], m0
+ LOAD m4, [a8], m0
+ paddw m2, m4
+
+ LOAD m6, [c], m0
+ paddw m1, m3
+ paddw m2, m5
+ paddw m6, [pw_4]
+
+ paddw m1, m2
+ paddw m1, m6
+
+ pmulhuw m1, [pw_div9]
+
+ packuswb m1, m1
+
+ movh [dstq], m1
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m0, m0
+ .loop:
+ movu m1, [a1]
+ movu m2, [a8]
+ pavgb m7, m1, m2
+ punpckhbw m3, m1, m0
+ punpcklbw m1, m0
+ punpckhbw m4, m2, m0
+ punpcklbw m2, m0
+ paddw m3, m4
+ paddw m1, m2
+ psrlw m3, 1
+ psrlw m1, 1
+ packuswb m1, m3
+
+ movu m2, [a2]
+ movu m3, [a7]
+ pavgb m6, m2, m3
+ punpckhbw m4, m2, m0
+ punpcklbw m2, m0
+ punpckhbw m5, m3, m0
+ punpcklbw m3, m0
+ paddw m4, m5
+ paddw m2, m3
+ psrlw m4, 1
+ psrlw m2, 1
+ packuswb m2, m4
+
+ pminub m1, m2
+ pmaxub m7, m6
+
+ movu m2, [a3]
+ movu m3, [a6]
+ pavgb m6, m2, m3
+ punpckhbw m4, m2, m0
+ punpcklbw m2, m0
+ punpckhbw m5, m3, m0
+ punpcklbw m3, m0
+ paddw m4, m5
+ paddw m2, m3
+ psrlw m4, 1
+ psrlw m2, 1
+ packuswb m2, m4
+
+ pminub m1, m2
+ pmaxub m7, m6
+
+ movu m2, [a4]
+ movu m3, [a5]
+ pavgb m6, m2, m3
+ punpckhbw m4, m2, m0
+ punpcklbw m2, m0
+ punpckhbw m5, m3, m0
+ punpcklbw m3, m0
+ paddw m4, m5
+ paddw m2, m3
+ psrlw m4, 1
+ psrlw m2, 1
+ packuswb m2, m4
+
+ pminub m1, m2
+ pmaxub m7, m6
+
+ movu m3, [c]
+ CLIPUB m3, m1, m7
+
+ movu [dstq], m3
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ .loop:
+ movu m0, [a1]
+ movu m1, [a8]
+ pavgb m0, m1
+ movu m2, [a2]
+ movu m3, [a7]
+ pavgb m2, m3
+ movu m4, [a3]
+ movu m5, [a6]
+ pavgb m4, m5
+ movu m6, [a4]
+ movu m7, [a5]
+ pavgb m6, m7
+
+ mova m1, m0
+ mova m3, m2
+ mova m5, m4
+ mova m7, m6
+ pminub m0, m2
+ pminub m4, m6
+ pmaxub m1, m3
+ pmaxub m5, m7
+ pminub m0, m4
+ pmaxub m1, m5
+
+ movu m2, [c]
+ CLIPUB m2, m0, m1
+
+ movu [dstq], m2
+ add srcq, mmsize
+ add dstq, mmsize
+ sub pixelsd, mmsize
+ jg .loop
+RET
+
+%if ARCH_X86_64
+cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m15, m15
+ .loop:
+ LOAD_SQUARE_16 m15
+ SORT_AXIS_16
+
+ mova m9, m8
+ mova m10, m7
+ mova m11, m6
+ mova m12, m5
+ psubw m9, m1 ; linediff1
+ psubw m10, m2 ; linediff2
+ psubw m11, m3 ; linediff3
+ psubw m12, m4 ; linediff4
+
+ psubw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ psubw m4, m0
+ pminsw m1, m9 ; d1
+ pminsw m2, m10 ; d2
+ pminsw m3, m11 ; d3
+ pminsw m4, m12 ; d4
+ pmaxsw m1, m2
+ pmaxsw m3, m4
+ pmaxsw m1, m3
+ pmaxsw m1, m15 ; d
+
+ mova m13, m0
+ mova m14, m0
+ mova m2, m0
+ mova m4, m0
+ psubw m13, m8
+ psubw m14, m7
+ psubw m2, m6
+ psubw m4, m5
+ pminsw m9, m13 ; u1
+ pminsw m10, m14 ; u2
+ pminsw m11, m2 ; u3
+ pminsw m12, m4 ; u4
+ pmaxsw m9, m10
+ pmaxsw m11, m12
+ pmaxsw m9, m11
+ pmaxsw m9, m15 ; u
+
+ paddw m0, m1
+ psubw m0, m9
+ packuswb m0, m0
+
+ movh [dstq], m0
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+
+cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels
+ mov r4q, strideq
+ neg r4q
+ %define stride_p strideq
+ %define stride_n r4q
+
+ pxor m15, m15
+ .loop:
+ LOAD_SQUARE_16 m15
+ mova [rsp], m0
+ SORT_AXIS_16
+
+ mova m9, m8
+ mova m10, m7
+ mova m11, m6
+ mova m12, m5
+ psubw m9, m1 ; linediff1
+ psubw m10, m2 ; linediff2
+ psubw m11, m3 ; linediff3
+ psubw m12, m4 ; linediff4
+
+ psubw m1, [rsp] ; td1
+ psubw m2, [rsp] ; td2
+ psubw m3, [rsp] ; td3
+ psubw m4, [rsp] ; td4
+ mova m0, m9
+ mova m13, m10
+ mova m14, m11
+ mova m15, m12
+ psubw m0, m1
+ psubw m13, m2
+ psubw m14, m3
+ psubw m15, m4
+ pminsw m1, m0 ; d1
+ pminsw m2, m13 ; d2
+ pminsw m3, m14 ; d3
+ pminsw m4, m15 ; d4
+ pmaxsw m1, m2
+ pmaxsw m3, m4
+
+ mova m0, [rsp]
+ mova m13, [rsp]
+ mova m14, [rsp]
+ mova m15, [rsp]
+ psubw m0, m8 ; tu1
+ psubw m13, m7 ; tu2
+ psubw m14, m6 ; tu3
+ psubw m15, m5 ; tu4
+ psubw m9, m0
+ psubw m10, m13
+ psubw m11, m14
+ psubw m12, m15
+ pminsw m9, m0 ; u1
+ pminsw m10, m13 ; u2
+ pminsw m11, m14 ; u3
+ pminsw m12, m15 ; u4
+ pmaxsw m9, m10
+ pmaxsw m11, m12
+
+ pmaxsw m1, m3 ; d without max(d,0)
+ pmaxsw m9, m11 ; u without max(u,0)
+ pxor m15, m15
+ pmaxsw m1, m15
+ pmaxsw m9, m15
+
+ mova m0, [rsp]
+ paddw m0, m1
+ psubw m0, m9
+ packuswb m0, m0
+
+ movh [dstq], m0
+ add srcq, mmsize/2
+ add dstq, mmsize/2
+ sub pixelsd, mmsize/2
+ jg .loop
+RET
+%endif
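
For orientation, mode 1 above corresponds to this scalar operation: clamp the
centre pixel to the minimum and maximum of its eight neighbours (illustrative
sketch only; the helper name is invented):

    #include <stdint.h>
    #include <stddef.h>

    static uint8_t rg_mode1_pixel(const uint8_t *src, ptrdiff_t stride)
    {
        uint8_t lo = 255, hi = 0;
        for (int dy = -1; dy <= 1; dy++)
            for (int dx = -1; dx <= 1; dx++) {
                if (!dx && !dy)
                    continue;  /* skip the centre pixel itself */
                uint8_t p = src[dy * stride + dx];
                if (p < lo) lo = p;
                if (p > hi) hi = p;
            }
        uint8_t c = src[0];
        return c < lo ? lo : c > hi ? hi : c;  /* clip to [lo, hi] */
    }
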
diff --git a/libavfilter/x86/vf_removegrain_init.c b/libavfilter/x86/vf_removegrain_init.c
new file mode 100644
index 0000000000..07314b3244
--- /dev/null
+++ b/libavfilter/x86/vf_removegrain_init.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2015 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/removegrain.h"
+
+void ff_rg_fl_mode_1_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_10_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_11_12_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_13_14_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_19_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_20_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_21_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_22_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+#if ARCH_X86_64
+void ff_rg_fl_mode_2_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_3_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_5_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_6_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_7_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_8_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_9_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_15_16_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_17_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_18_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_23_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+void ff_rg_fl_mode_24_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels);
+#endif
+
+av_cold void ff_removegrain_init_x86(RemoveGrainContext *rg)
+{
+#if CONFIG_GPL
+ int cpu_flags = av_get_cpu_flags();
+ int i;
+
+ for (i = 0; i < rg->nb_planes; i++) {
+ if (EXTERNAL_SSE2(cpu_flags))
+ switch (rg->mode[i]) {
+ case 1: rg->fl[i] = ff_rg_fl_mode_1_sse2; break;
+ case 10: rg->fl[i] = ff_rg_fl_mode_10_sse2; break;
+ case 11: /* fall through */
+ case 12: rg->fl[i] = ff_rg_fl_mode_11_12_sse2; break;
+ case 13: /* fall through */
+ case 14: rg->fl[i] = ff_rg_fl_mode_13_14_sse2; break;
+ case 19: rg->fl[i] = ff_rg_fl_mode_19_sse2; break;
+ case 20: rg->fl[i] = ff_rg_fl_mode_20_sse2; break;
+ case 21: rg->fl[i] = ff_rg_fl_mode_21_sse2; break;
+ case 22: rg->fl[i] = ff_rg_fl_mode_22_sse2; break;
+#if ARCH_X86_64
+ case 2: rg->fl[i] = ff_rg_fl_mode_2_sse2; break;
+ case 3: rg->fl[i] = ff_rg_fl_mode_3_sse2; break;
+ case 4: rg->fl[i] = ff_rg_fl_mode_4_sse2; break;
+ case 5: rg->fl[i] = ff_rg_fl_mode_5_sse2; break;
+ case 6: rg->fl[i] = ff_rg_fl_mode_6_sse2; break;
+ case 7: rg->fl[i] = ff_rg_fl_mode_7_sse2; break;
+ case 8: rg->fl[i] = ff_rg_fl_mode_8_sse2; break;
+ case 9: rg->fl[i] = ff_rg_fl_mode_9_sse2; break;
+ case 15: /* fall through */
+ case 16: rg->fl[i] = ff_rg_fl_mode_15_16_sse2; break;
+ case 17: rg->fl[i] = ff_rg_fl_mode_17_sse2; break;
+ case 18: rg->fl[i] = ff_rg_fl_mode_18_sse2; break;
+ case 23: rg->fl[i] = ff_rg_fl_mode_23_sse2; break;
+ case 24: rg->fl[i] = ff_rg_fl_mode_24_sse2; break;
+#endif /* ARCH_X86_64 */
+ }
+ }
+#endif /* CONFIG_GPL */
+}
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
new file mode 100644
index 0000000000..1cfb9e81f7
--- /dev/null
+++ b/libavfilter/x86/vf_spp.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_spp.h"
+
+#if HAVE_MMX_INLINE
+static void hardthresh_mmx(int16_t dst[64], const int16_t src[64],
+ int qp, const uint8_t *permutation)
+{
+ int bias = 0; //FIXME
+ unsigned int threshold1;
+
+ threshold1 = qp * ((1<<4) - bias) - 1;
+
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
+ "movq " #src0 ", %%mm0 \n" \
+ "movq " #src1 ", %%mm1 \n" \
+ "movq " #src2 ", %%mm2 \n" \
+ "movq " #src3 ", %%mm3 \n" \
+ "psubw %%mm4, %%mm0 \n" \
+ "psubw %%mm4, %%mm1 \n" \
+ "psubw %%mm4, %%mm2 \n" \
+ "psubw %%mm4, %%mm3 \n" \
+ "paddusw %%mm5, %%mm0 \n" \
+ "paddusw %%mm5, %%mm1 \n" \
+ "paddusw %%mm5, %%mm2 \n" \
+ "paddusw %%mm5, %%mm3 \n" \
+ "paddw %%mm6, %%mm0 \n" \
+ "paddw %%mm6, %%mm1 \n" \
+ "paddw %%mm6, %%mm2 \n" \
+ "paddw %%mm6, %%mm3 \n" \
+ "psubusw %%mm6, %%mm0 \n" \
+ "psubusw %%mm6, %%mm1 \n" \
+ "psubusw %%mm6, %%mm2 \n" \
+ "psubusw %%mm6, %%mm3 \n" \
+ "psraw $3, %%mm0 \n" \
+ "psraw $3, %%mm1 \n" \
+ "psraw $3, %%mm2 \n" \
+ "psraw $3, %%mm3 \n" \
+ \
+ "movq %%mm0, %%mm7 \n" \
+ "punpcklwd %%mm2, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm2, %%mm7 \n" /*C*/ \
+ "movq %%mm1, %%mm2 \n" \
+ "punpcklwd %%mm3, %%mm1 \n" /*B*/ \
+ "punpckhwd %%mm3, %%mm2 \n" /*D*/ \
+ "movq %%mm0, %%mm3 \n" \
+ "punpcklwd %%mm1, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm7, %%mm3 \n" /*C*/ \
+ "punpcklwd %%mm2, %%mm7 \n" /*B*/ \
+ "punpckhwd %%mm2, %%mm1 \n" /*D*/ \
+ \
+ "movq %%mm0, " #dst0 " \n" \
+ "movq %%mm7, " #dst1 " \n" \
+ "movq %%mm3, " #dst2 " \n" \
+ "movq %%mm1, " #dst3 " \n"
+
+ __asm__ volatile(
+ "movd %2, %%mm4 \n"
+ "movd %3, %%mm5 \n"
+ "movd %4, %%mm6 \n"
+ "packssdw %%mm4, %%mm4 \n"
+ "packssdw %%mm5, %%mm5 \n"
+ "packssdw %%mm6, %%mm6 \n"
+ "packssdw %%mm4, %%mm4 \n"
+ "packssdw %%mm5, %%mm5 \n"
+ "packssdw %%mm6, %%mm6 \n"
+ REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0))
+ REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+ REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+ REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+ : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed?
+ );
+ dst[0] = (src[0] + 4) >> 3;
+}
+
+static void softthresh_mmx(int16_t dst[64], const int16_t src[64],
+ int qp, const uint8_t *permutation)
+{
+ int bias = 0; //FIXME
+ unsigned int threshold1;
+
+ threshold1 = qp*((1<<4) - bias) - 1;
+
+#undef REQUANT_CORE
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
+ "movq " #src0 ", %%mm0 \n" \
+ "movq " #src1 ", %%mm1 \n" \
+ "pxor %%mm6, %%mm6 \n" \
+ "pxor %%mm7, %%mm7 \n" \
+ "pcmpgtw %%mm0, %%mm6 \n" \
+ "pcmpgtw %%mm1, %%mm7 \n" \
+ "pxor %%mm6, %%mm0 \n" \
+ "pxor %%mm7, %%mm1 \n" \
+ "psubusw %%mm4, %%mm0 \n" \
+ "psubusw %%mm4, %%mm1 \n" \
+ "pxor %%mm6, %%mm0 \n" \
+ "pxor %%mm7, %%mm1 \n" \
+ "movq " #src2 ", %%mm2 \n" \
+ "movq " #src3 ", %%mm3 \n" \
+ "pxor %%mm6, %%mm6 \n" \
+ "pxor %%mm7, %%mm7 \n" \
+ "pcmpgtw %%mm2, %%mm6 \n" \
+ "pcmpgtw %%mm3, %%mm7 \n" \
+ "pxor %%mm6, %%mm2 \n" \
+ "pxor %%mm7, %%mm3 \n" \
+ "psubusw %%mm4, %%mm2 \n" \
+ "psubusw %%mm4, %%mm3 \n" \
+ "pxor %%mm6, %%mm2 \n" \
+ "pxor %%mm7, %%mm3 \n" \
+ \
+ "paddsw %%mm5, %%mm0 \n" \
+ "paddsw %%mm5, %%mm1 \n" \
+ "paddsw %%mm5, %%mm2 \n" \
+ "paddsw %%mm5, %%mm3 \n" \
+ "psraw $3, %%mm0 \n" \
+ "psraw $3, %%mm1 \n" \
+ "psraw $3, %%mm2 \n" \
+ "psraw $3, %%mm3 \n" \
+ \
+ "movq %%mm0, %%mm7 \n" \
+ "punpcklwd %%mm2, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm2, %%mm7 \n" /*C*/ \
+ "movq %%mm1, %%mm2 \n" \
+ "punpcklwd %%mm3, %%mm1 \n" /*B*/ \
+ "punpckhwd %%mm3, %%mm2 \n" /*D*/ \
+ "movq %%mm0, %%mm3 \n" \
+ "punpcklwd %%mm1, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm7, %%mm3 \n" /*C*/ \
+ "punpcklwd %%mm2, %%mm7 \n" /*B*/ \
+ "punpckhwd %%mm2, %%mm1 \n" /*D*/ \
+ \
+ "movq %%mm0, " #dst0 " \n" \
+ "movq %%mm7, " #dst1 " \n" \
+ "movq %%mm3, " #dst2 " \n" \
+ "movq %%mm1, " #dst3 " \n"
+
+ __asm__ volatile(
+ "movd %2, %%mm4 \n"
+ "movd %3, %%mm5 \n"
+ "packssdw %%mm4, %%mm4 \n"
+ "packssdw %%mm5, %%mm5 \n"
+ "packssdw %%mm4, %%mm4 \n"
+ "packssdw %%mm5, %%mm5 \n"
+ REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0))
+ REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+ REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+ REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+ : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed?
+ );
+
+ dst[0] = (src[0] + 4) >> 3;
+}
+
+static void store_slice_mmx(uint8_t *dst, const int16_t *src,
+ int dst_stride, int src_stride,
+ int width, int height, int log2_scale,
+ const uint8_t dither[8][8])
+{
+ int y;
+
+ for (y = 0; y < height; y++) {
+ uint8_t *dst1 = dst;
+ const int16_t *src1 = src;
+ __asm__ volatile(
+ "movq (%3), %%mm3 \n"
+ "movq (%3), %%mm4 \n"
+ "movd %4, %%mm2 \n"
+ "pxor %%mm0, %%mm0 \n"
+ "punpcklbw %%mm0, %%mm3 \n"
+ "punpckhbw %%mm0, %%mm4 \n"
+ "psraw %%mm2, %%mm3 \n"
+ "psraw %%mm2, %%mm4 \n"
+ "movd %5, %%mm2 \n"
+ "1: \n"
+ "movq (%0), %%mm0 \n"
+ "movq 8(%0), %%mm1 \n"
+ "paddw %%mm3, %%mm0 \n"
+ "paddw %%mm4, %%mm1 \n"
+ "psraw %%mm2, %%mm0 \n"
+ "psraw %%mm2, %%mm1 \n"
+ "packuswb %%mm1, %%mm0 \n"
+ "movq %%mm0, (%1) \n"
+ "add $16, %0 \n"
+ "add $8, %1 \n"
+ "cmp %2, %1 \n"
+ " jb 1b \n"
+ : "+r" (src1), "+r"(dst1)
+ : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
+ );
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+#endif /* HAVE_MMX_INLINE */
+
+av_cold void ff_spp_init_x86(SPPContext *s)
+{
+#if HAVE_MMX_INLINE
+ int cpu_flags = av_get_cpu_flags();
+
+ if (cpu_flags & AV_CPU_FLAG_MMX) {
+ s->store_slice = store_slice_mmx;
+ if (av_get_int(s->dct, "bits_per_sample", NULL) <= 8) {
+ switch (s->mode) {
+ case 0: s->requantize = hardthresh_mmx; break;
+ case 1: s->requantize = softthresh_mmx; break;
+ }
+ }
+ }
+#endif
+}
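
In scalar form, the hard-threshold requantizer above keeps only coefficients
whose magnitude exceeds the dead zone and rescales them by 3 bits, matching
the filter's C fallback (a sketch with the coefficient permutation omitted):

    #include <stdint.h>
    #include <string.h>

    static void hardthresh_sketch(int16_t dst[64], const int16_t src[64], int qp)
    {
        unsigned threshold1 = qp * 16 - 1;  /* bias = 0, as in the MMX code */
        memset(dst, 0, 64 * sizeof(dst[0]));
        dst[0] = (src[0] + 4) >> 3;         /* the DC coefficient is always kept */
        for (int i = 1; i < 64; i++) {
            int level = src[i];
            if ((unsigned)(level + threshold1) > 2 * threshold1)
                dst[i] = (level + 4) >> 3;
        }
    }
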
diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm
new file mode 100644
index 0000000000..3293e66701
--- /dev/null
+++ b/libavfilter/x86/vf_ssim.asm
@@ -0,0 +1,247 @@
+;*****************************************************************************
+;* x86-optimized functions for ssim filter
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+ssim_c1: times 4 dd 416 ;(.01*.01*255*255*64 + .5)
+ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5)
+
+SECTION .text
+
+%macro SSIM_4X4_LINE 1
+%if ARCH_X86_64
+cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+%else
+cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3
+%define wd r5mp
+%endif
+ lea ref_stride3q, [ref_strideq*3]
+ lea buf_stride3q, [buf_strideq*3]
+%if notcpuflag(xop)
+ pxor m7, m7
+ mova m15, [pw_1]
+%endif
+
+.loop:
+%if cpuflag(xop)
+ pmovzxbw m0, [bufq+buf_strideq*0]
+ pmovzxbw m1, [refq+ref_strideq*0]
+ pmaddwd m4, m0, m0
+ pmaddwd m6, m0, m1
+ pmovzxbw m2, [bufq+buf_strideq*1]
+ vpmadcswd m4, m1, m1, m4
+ pmovzxbw m3, [refq+ref_strideq*1]
+ paddw m0, m2
+ vpmadcswd m4, m2, m2, m4
+ vpmadcswd m6, m2, m3, m6
+ paddw m1, m3
+ vpmadcswd m4, m3, m3, m4
+
+ pmovzxbw m2, [bufq+buf_strideq*2]
+ pmovzxbw m3, [refq+ref_strideq*2]
+ vpmadcswd m4, m2, m2, m4
+ vpmadcswd m6, m2, m3, m6
+ pmovzxbw m5, [bufq+buf_stride3q]
+ pmovzxbw m7, [refq+ref_stride3q]
+ vpmadcswd m4, m3, m3, m4
+ vpmadcswd m6, m5, m7, m6
+ paddw m0, m2
+ paddw m1, m3
+ vpmadcswd m4, m5, m5, m4
+ paddw m0, m5
+ paddw m1, m7
+ vpmadcswd m4, m7, m7, m4
+%else
+ movh m0, [bufq+buf_strideq*0] ; a1
+ movh m1, [refq+ref_strideq*0] ; b1
+ movh m2, [bufq+buf_strideq*1] ; a2
+ movh m3, [refq+ref_strideq*1] ; b2
+ punpcklbw m0, m7 ; s1 [word]
+ punpcklbw m1, m7 ; s2 [word]
+ punpcklbw m2, m7 ; s1 [word]
+ punpcklbw m3, m7 ; s2 [word]
+ pmaddwd m4, m0, m0 ; a1 * a1
+ pmaddwd m5, m1, m1 ; b1 * b1
+ pmaddwd m8, m2, m2 ; a2 * a2
+ pmaddwd m9, m3, m3 ; b2 * b2
+ paddd m4, m5 ; ss
+ paddd m8, m9 ; ss
+ pmaddwd m6, m0, m1 ; a1 * b1 = ss12
+ pmaddwd m5, m2, m3 ; a2 * b2 = ss12
+ paddw m0, m2
+ paddw m1, m3
+ paddd m6, m5 ; s12
+ paddd m4, m8 ; ss
+
+ movh m2, [bufq+buf_strideq*2] ; a3
+ movh m3, [refq+ref_strideq*2] ; b3
+ movh m5, [bufq+buf_stride3q] ; a4
+ movh m8, [refq+ref_stride3q] ; b4
+ punpcklbw m2, m7 ; s1 [word]
+ punpcklbw m3, m7 ; s2 [word]
+ punpcklbw m5, m7 ; s1 [word]
+ punpcklbw m8, m7 ; s2 [word]
+ pmaddwd m9, m2, m2 ; a3 * a3
+ pmaddwd m10, m3, m3 ; b3 * b3
+ pmaddwd m12, m5, m5 ; a4 * a4
+ pmaddwd m13, m8, m8 ; b4 * b4
+ pmaddwd m11, m2, m3 ; a3 * b3 = ss12
+ pmaddwd m14, m5, m8 ; a4 * b4 = ss12
+ paddd m9, m10
+ paddd m12, m13
+ paddw m0, m2
+ paddw m1, m3
+ paddw m0, m5
+ paddw m1, m8
+ paddd m6, m11
+ paddd m4, m9
+ paddd m6, m14
+ paddd m4, m12
+%endif
+
+ ; m0 = [word] s1 a,a,a,a,b,b,b,b
+ ; m1 = [word] s2 a,a,a,a,b,b,b,b
+ ; m4 = [dword] ss a,a,b,b
+ ; m6 = [dword] s12 a,a,b,b
+
+%if cpuflag(xop)
+ vphaddwq m0, m0 ; [dword] s1 a, 0, b, 0
+ vphaddwq m1, m1 ; [dword] s2 a, 0, b, 0
+ vphadddq m4, m4 ; [dword] ss a, 0, b, 0
+ vphadddq m6, m6 ; [dword] s12 a, 0, b, 0
+ punpckhdq m2, m0, m1 ; [dword] s1 b, s2 b, 0, 0
+ punpckldq m0, m1 ; [dword] s1 a, s2 a, 0, 0
+ punpckhdq m3, m4, m6 ; [dword] ss b, s12 b, 0, 0
+ punpckldq m4, m6 ; [dword] ss a, s12 a, 0, 0
+ punpcklqdq m1, m2, m3 ; [dword] b s1, s2, ss, s12
+ punpcklqdq m0, m4 ; [dword] a s1, s2, ss, s12
+%else
+ pmaddwd m0, m15 ; [dword] s1 a,a,b,b
+ pmaddwd m1, m15 ; [dword] s2 a,a,b,b
+ phaddd m0, m4 ; [dword] s1 a, b, ss a, b
+ phaddd m1, m6 ; [dword] s2 a, b, s12 a, b
+ punpckhdq m2, m0, m1 ; [dword] ss a, s12 a, ss b, s12 b
+ punpckldq m0, m1 ; [dword] s1 a, s2 a, s1 b, s2 b
+ punpckhqdq m1, m0, m2 ; [dword] b s1, s2, ss, s12
+ punpcklqdq m0, m2 ; [dword] a s1, s2, ss, s12
+%endif
+
+ mova [sumsq+ 0], m0
+ mova [sumsq+mmsize], m1
+
+ add bufq, mmsize/2
+ add refq, mmsize/2
+ add sumsq, mmsize*2
+ sub wd, mmsize/8
+ jg .loop
+ RET
+%endmacro
+
+%if ARCH_X86_64
+INIT_XMM ssse3
+SSIM_4X4_LINE 16
+%endif
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+SSIM_4X4_LINE 8
+%endif
+
+INIT_XMM sse4
+cglobal ssim_end_line, 3, 3, 6, sum0, sum1, w
+ pxor m0, m0
+.loop:
+ mova m1, [sum0q+mmsize*0]
+ mova m2, [sum0q+mmsize*1]
+ mova m3, [sum0q+mmsize*2]
+ mova m4, [sum0q+mmsize*3]
+ paddd m1, [sum1q+mmsize*0]
+ paddd m2, [sum1q+mmsize*1]
+ paddd m3, [sum1q+mmsize*2]
+ paddd m4, [sum1q+mmsize*3]
+ paddd m1, m2
+ paddd m2, m3
+ paddd m3, m4
+ paddd m4, [sum0q+mmsize*4]
+ paddd m4, [sum1q+mmsize*4]
+ TRANSPOSE4x4D 1, 2, 3, 4, 5
+
+ ; m1 = fs1, m2 = fs2, m3 = fss, m4 = fs12
+ pslld m3, 6
+ pslld m4, 6
+ pmulld m5, m1, m2 ; fs1 * fs2
+ pmulld m1, m1 ; fs1 * fs1
+ pmulld m2, m2 ; fs2 * fs2
+ psubd m3, m1
+ psubd m4, m5 ; covariance
+ psubd m3, m2 ; variance
+
+ ; m1 = fs1 * fs1, m2 = fs2 * fs2, m3 = variance, m4 = covariance, m5 = fs1 * fs2
+ paddd m4, m4 ; 2 * covariance
+ paddd m5, m5 ; 2 * fs1 * fs2
+ paddd m1, m2 ; fs1 * fs1 + fs2 * fs2
+ paddd m3, [ssim_c2] ; variance + ssim_c2
+ paddd m4, [ssim_c2] ; 2 * covariance + ssim_c2
+ paddd m5, [ssim_c1] ; 2 * fs1 * fs2 + ssim_c1
+ paddd m1, [ssim_c1] ; fs1 * fs1 + fs2 * fs2 + ssim_c1
+
+ ; convert to float
+ cvtdq2ps m3, m3
+ cvtdq2ps m4, m4
+ cvtdq2ps m5, m5
+ cvtdq2ps m1, m1
+ mulps m4, m5
+ mulps m3, m1
+ divps m4, m3 ; ssim_endl
+ addps m0, m4 ; ssim
+ add sum0q, mmsize*4
+ add sum1q, mmsize*4
+ sub wd, 4
+ jg .loop
+
+ ; subps the ones we added too much
+ test wd, wd
+ jz .end
+ add wd, 4
+ test wd, 2
+ jz .skip2
+ psrldq m4, 8
+.skip2:
+ test wd, 1
+ jz .skip1
+ psrldq m4, 4
+.skip1:
+ subps m0, m4
+
+.end:
+ movhlps m4, m0
+ addps m0, m4
+ movss m4, m0
+ shufps m0, m0, 1
+ addss m0, m4
+%if ARCH_X86_32
+ movss r0m, m0
+ fld r0mp
+%endif
+ RET
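
In scalar terms, the per-block term accumulated by ssim_end_line is the usual
SSIM expression over 4x4 sums, using the ssim_c1/ssim_c2 constants from the
data section (illustrative sketch; s1/s2/ss/s12 are the block sums of pixels,
squares and cross products):

    static float ssim_end1_sketch(int s1, int s2, int ss, int s12)
    {
        static const int c1 = 416;     /* .01*.01*255*255*64 + .5 */
        static const int c2 = 235963;  /* .03*.03*255*255*64*63 + .5 */
        int vars  = ss  * 64 - s1 * s1 - s2 * s2;  /* scaled variance */
        int covar = s12 * 64 - s1 * s2;            /* scaled covariance */
        return (float)(2 * s1 * s2 + c1) * (2 * covar + c2)
             / ((float)(s1 * s1 + s2 * s2 + c1) * (vars + c2));
    }
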
diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c
new file mode 100644
index 0000000000..599c928403
--- /dev/null
+++ b/libavfilter/x86/vf_ssim_init.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/ssim.h"
+
+void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride,
+ const uint8_t *ref, ptrdiff_t ref_stride,
+ int (*sums)[4], int w);
+void ff_ssim_4x4_line_xop (const uint8_t *buf, ptrdiff_t buf_stride,
+ const uint8_t *ref, ptrdiff_t ref_stride,
+ int (*sums)[4], int w);
+float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
+
+void ff_ssim_init_x86(SSIMDSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (ARCH_X86_64 && EXTERNAL_SSSE3(cpu_flags))
+ dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ dsp->ssim_end_line = ff_ssim_end_line_sse4;
+ if (EXTERNAL_XOP(cpu_flags))
+ dsp->ssim_4x4_line = ff_ssim_4x4_line_xop;
+}
diff --git a/libavfilter/x86/vf_tinterlace_init.c b/libavfilter/x86/vf_tinterlace_init.c
new file mode 100644
index 0000000000..ddb0cced36
--- /dev/null
+++ b/libavfilter/x86/vf_tinterlace_init.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/tinterlace.h"
+
+void ff_lowpass_line_sse2(uint8_t *dstp, ptrdiff_t linesize,
+ const uint8_t *srcp,
+ const uint8_t *srcp_above,
+ const uint8_t *srcp_below);
+void ff_lowpass_line_avx (uint8_t *dstp, ptrdiff_t linesize,
+ const uint8_t *srcp,
+ const uint8_t *srcp_above,
+ const uint8_t *srcp_below);
+
+av_cold void ff_tinterlace_init_x86(TInterlaceContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags))
+ s->lowpass_line = ff_lowpass_line_sse2;
+ if (EXTERNAL_AVX(cpu_flags))
+ s->lowpass_line = ff_lowpass_line_avx;
+}
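
The lowpass kernels selected above implement a (1,2,1)/4 vertical filter; a
scalar sketch of the operation (parameter naming simplified):

    #include <stdint.h>
    #include <stddef.h>

    static void lowpass_line_sketch(uint8_t *dst, ptrdiff_t width,
                                    const uint8_t *src,
                                    const uint8_t *above, const uint8_t *below)
    {
        for (ptrdiff_t i = 0; i < width; i++)
            dst[i] = (1 + src[i] + src[i] + above[i] + below[i]) >> 2;  /* rounded */
    }
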
diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 3d8b2bc180..a29620ce55 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -4,20 +4,20 @@
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -39,11 +39,7 @@ SECTION .text
pavgb m5, m3
pand m4, [pb_1]
psubusb m5, m4
-%if mmsize == 16
- psrldq m5, 1
-%else
- psrlq m5, 8
-%endif
+ RSHIFT m5, 1
punpcklbw m5, m7
mova m4, m2
psubusb m2, m3
@@ -51,13 +47,8 @@ SECTION .text
pmaxub m2, m3
mova m3, m2
mova m4, m2
-%if mmsize == 16
- psrldq m3, 1
- psrldq m4, 2
-%else
- psrlq m3, 8
- psrlq m4, 16
-%endif
+ RSHIFT m3, 1
+ RSHIFT m4, 2
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
@@ -90,17 +81,17 @@ SECTION .text
%endmacro
%macro LOAD 2
- movh m%1, %2
- punpcklbw m%1, m7
+ movh %1, %2
+ punpcklbw %1, m7
%endmacro
%macro FILTER 3
.loop%1:
pxor m7, m7
- LOAD 0, [curq+t1]
- LOAD 1, [curq+t0]
- LOAD 2, [%2]
- LOAD 3, [%3]
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
mova m4, m3
paddw m3, m2
psraw m3, 1
@@ -109,8 +100,8 @@ SECTION .text
mova [rsp+32], m1
psubw m2, m4
ABS1 m2, m4
- LOAD 3, [prevq+t1]
- LOAD 4, [prevq+t0]
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
psubw m3, m0
psubw m4, m1
ABS1 m3, m5
@@ -119,8 +110,8 @@ SECTION .text
psrlw m2, 1
psrlw m3, 1
pmaxsw m2, m3
- LOAD 3, [nextq+t1]
- LOAD 4, [nextq+t0]
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
psubw m3, m0
psubw m4, m1
ABS1 m3, m5
@@ -166,10 +157,10 @@ SECTION .text
mova m6, [rsp+48]
cmp DWORD r8m, 2
jge .end%1
- LOAD 2, [%2+t1*2]
- LOAD 4, [%3+t1*2]
- LOAD 3, [%2+t0*2]
- LOAD 5, [%3+t0*2]
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
paddw m2, m4
paddw m3, m5
psrlw m2, 1
@@ -220,8 +211,6 @@ cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
mrefs, parity, mode
%endif
- cmp DWORD wm, 0
- jle .ret
%if ARCH_X86_32
mov r4, r5mp
mov r5, r6mp
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 510a02394c..1460a642c3 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -1,26 +1,25 @@
/*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -36,16 +35,63 @@ void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
av_cold void ff_yadif_init_x86(YADIFContext *yadif)
{
int cpu_flags = av_get_cpu_flags();
+ int bit_depth = (!yadif->csp) ? 8
+ : yadif->csp->comp[0].depth_minus1 + 1;
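+ /* Dispatch on sample depth: the "16bit" kernels cover >= 15 bits, the
+ * "10bit" kernels cover 9-14 bits, and the original 8-bit kernels the rest. */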
+ if (bit_depth >= 15) {
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+ } else if (bit_depth >= 9 && bit_depth <= 14) {
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
+ } else {
#if ARCH_X86_32
- if (EXTERNAL_MMXEXT(cpu_flags))
- yadif->filter_line = ff_yadif_filter_line_mmxext;
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_mmxext;
#endif /* ARCH_X86_32 */
- if (EXTERNAL_SSE2(cpu_flags))
- yadif->filter_line = ff_yadif_filter_line_sse2;
- if (EXTERNAL_SSSE3(cpu_flags))
- yadif->filter_line = ff_yadif_filter_line_ssse3;
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_ssse3;
+ }
}
diff --git a/libavfilter/x86/yadif-10.asm b/libavfilter/x86/yadif-10.asm
new file mode 100644
index 0000000000..8853e0d2c7
--- /dev/null
+++ b/libavfilter/x86/yadif-10.asm
@@ -0,0 +1,255 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+
+SECTION .text
+
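+; Emulate pmaxuw (SSE4) on older instruction sets with unsigned
+; saturating arithmetic: (a -us b) +us b equals max(a, b), since the
+; subtraction clamps to zero whenever b >= a.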
+%macro PMAXUW 2
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2
+ paddusw %1, %2
+%endif
+%endmacro
+
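+; CHECK computes the spatial score for one candidate edge direction:
+; m5 ends up holding the rounded-down average of the two candidate
+; pixels (pavgw rounds up, so the (m2^m3)&1 carry is subtracted), and
+; m2 a score summing the absolute differences of three adjacent pixel
+; pairs (the RSHIFTs align the per-pixel differences of neighbours).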
+%macro CHECK 2
+ movu m2, [curq+t1+%1*2]
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3
+ pand m4, [pw_1]
+ psubusw m5, m4
+ RSHIFT m5, 2
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ mova m4, m2
+ RSHIFT m3, 2
+ RSHIFT m4, 4
+ paddw m2, m3
+ paddw m2, m4
+%endmacro
+
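+; CHECK1 keeps the running minimum spatial score in m0 and the matching
+; prediction in m1; the comparison mask is left in m6 for CHECK2.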
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtw m3, m2
+ pminsw m0, m2
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+; %macro CHECK2 0
+; paddw m6, [pw_1]
+; psllw m6, 14
+; paddsw m2, m6
+; mova m3, m0
+; pcmpgtw m3, m2
+; pminsw m0, m2
+; pand m5, m3
+; pandn m3, m1
+; por m3, m5
+; mova m1, m3
+; %endmacro
+
+; This version of CHECK2 (below) is required for 14-bit samples: the
+; left-shift trick in the commented-out macro above is not large enough
+; to correctly select between pixels or scores.
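+; (Illustration: the old macro biases the score with a saturating
+; paddsw of 1 << 14 = 0x4000, but a score summed from 14-bit
+; differences can be of the same order or larger, so the biased
+; comparison no longer reliably separates the candidates.)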
+
+%macro CHECK2 0
+ mova m3, m0
+ pcmpgtw m0, m2
+ pand m0, m6
+ mova m6, m0
+ pand m5, m6
+ pand m2, m0
+ pandn m6, m1
+ pandn m0, m3
+ por m6, m5
+ por m0, m2
+ mova m1, m6
+%endmacro
+
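+; With 9- to 14-bit samples there is enough headroom to do all the
+; arithmetic directly in packed words, so LOAD is a plain unaligned
+; load (the 16-bit version widens everything to dwords instead).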
+%macro LOAD 2
+ movu %1, %2
+%endmacro
+
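+; FILTER produces one block of output samples. The average of the
+; same-parity rows from %2 and %3 is the temporal prediction; the stack
+; slots hold cur[mrefs] (rsp+0), that average (rsp+16), cur[prefs]
+; (rsp+32) and the temporal difference score (rsp+48). The CHECK/
+; CHECK1/CHECK2 sequence searches the candidate edge directions for the
+; best spatial prediction in m1, which is finally clamped to the
+; average +/- the difference score (widened by the spatial interlacing
+; check using the rows two lines away, unless mode >= 2).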
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddw m3, m2
+ psraw m3, 1
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubw m2, m4
+ ABS1 m2, m4
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ ABS2 m3, m4, m5, m6
+ paddw m3, m4
+ psrlw m2, 1
+ psrlw m3, 1
+ pmaxsw m2, m3
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ ABS2 m3, m4, m5, m6
+ paddw m3, m4
+ psrlw m3, 1
+ pmaxsw m2, m3
+ mova [rsp+48], m2
+
+ paddw m1, m0
+ paddw m0, m0
+ psubw m0, m1
+ psrlw m1, 1
+ ABS1 m0, m2
+
+ movu m2, [curq+t1-1*2]
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ RSHIFT m3, 4
+ paddw m0, m2
+ paddw m0, m3
+ psubw m0, [pw_1]
+
+ CHECK -2, 0
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48]
+ cmp DWORD r8m, 2
+ jge .end%1
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 1
+ psrlw m3, 1
+ mova m4, [rsp+ 0]
+ mova m5, [rsp+16]
+ mova m7, [rsp+32]
+ psubw m2, m4
+ psubw m3, m7
+ mova m0, m5
+ psubw m5, m4
+ psubw m0, m7
+ mova m4, m2
+ pminsw m2, m3
+ pmaxsw m3, m4
+ pmaxsw m2, m5
+ pminsw m3, m5
+ pmaxsw m2, m0
+ pminsw m3, m0
+ pxor m4, m4
+ pmaxsw m6, m3
+ psubw m4, m2
+ pmaxsw m6, m4
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubw m2, m6
+ paddw m3, m6
+ pmaxsw m1, m2
+ pminsw m1, m3
+
+ movu [dstq], m1
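+    ; advance by mmsize-4 bytes (mmsize/2-2 samples): the byte-wise
+    ; RSHIFTs shift zeros into the top word lanes, so the last two
+    ; samples of a block are not valid and successive blocks overlap
+    ; by two samples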
+ add dstq, mmsize-4
+ add prevq, mmsize-4
+ add curq, mmsize-4
+ add nextq, mmsize-4
+ sub DWORD r4m, mmsize/2-2
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp
+ mov r5, r6mp
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m
+ movsxd r6, DWORD r6m
+ DECLARE_REG_TMP 5,6
+%endif
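+
+; t0 aliases the prefs argument and t1 aliases mrefs, so curq+t0
+; addresses the field line below the output line and curq+t1 the line
+; above it.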
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif
diff --git a/libavfilter/x86/yadif-16.asm b/libavfilter/x86/yadif-16.asm
new file mode 100644
index 0000000000..79d127dfaa
--- /dev/null
+++ b/libavfilter/x86/yadif-16.asm
@@ -0,0 +1,317 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+pw_8000: times 8 dw 0x8000
+pd_1: times 4 dd 1
+pd_8000: times 4 dd 0x8000
+
+SECTION .text
+
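+; Emulate pabsd (SSSE3) with the two's-complement identity
+; |x| = (x ^ s) - s, where s = (x < 0) ? -1 : 0 is built by pcmpgtd.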
+%macro PABS 2
+%if cpuflag(ssse3)
+ pabsd %1, %1
+%else
+ pxor %2, %2
+ pcmpgtd %2, %1
+ pxor %1, %2
+ psubd %1, %2
+%endif
+%endmacro
+
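+; Emulate packusdw (SSE4): bias the dwords by -0x8000 so that signed
+; packssdw saturation matches unsigned saturation, then add 0x8000
+; back to the packed words.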
+%macro PACK 1
+%if cpuflag(sse4)
+ packusdw %1, %1
+%else
+ psubd %1, [pd_8000]
+ packssdw %1, %1
+ paddw %1, [pw_8000]
+%endif
+%endmacro
+
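+; Emulate pminsd/pmaxsd (SSE4) with a pcmpgtd mask select; %3 is a
+; scratch register and is clobbered.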
+%macro PMINSD 3
+%if cpuflag(sse4)
+ pminsd %1, %2
+%else
+ mova %3, %2
+ pcmpgtd %3, %1
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+%endif
+%endmacro
+
+%macro PMAXSD 3
+%if cpuflag(sse4)
+ pmaxsd %1, %2
+%else
+ mova %3, %1
+ pcmpgtd %3, %2
+ pand %1, %3
+ pandn %3, %2
+ por %1, %3
+%endif
+%endmacro
+
+%macro PMAXUW 2
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2
+ paddusw %1, %2
+%endif
+%endmacro
+
+%macro CHECK 2
+ movu m2, [curq+t1+%1*2]
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3
+ pand m4, [pw_1]
+ psubusw m5, m4
+ RSHIFT m5, 2
+ punpcklwd m5, m7
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ mova m4, m2
+ RSHIFT m3, 2
+ RSHIFT m4, 4
+ punpcklwd m2, m7
+ punpcklwd m3, m7
+ punpcklwd m4, m7
+ paddd m2, m3
+ paddd m2, m4
+%endmacro
+
+%macro CHECK1 0
+ mova m3, m0
+ pcmpgtd m3, m2
+ PMINSD m0, m2, m6
+ mova m6, m3
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
+%macro CHECK2 0
+ paddd m6, [pd_1]
+ pslld m6, 30
+ paddd m2, m6
+ mova m3, m0
+ pcmpgtd m3, m2
+ PMINSD m0, m2, m4
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3
+%endmacro
+
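+; Here, unlike in yadif-10.asm, the 1 << 30 bias comfortably exceeds
+; any dword score built from 16-bit samples, so the shift trick from
+; the original code remains valid.
+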
+; The commented-out version of CHECK2 below uses 3 fewer instructions
+; on instruction sets older than SSE4, but I am not sure whether it is
+; any faster. A rewrite or refactor of the filter code should make it
+; possible to eliminate the final move instruction; it exists only to
+; satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+; mova m3, m0
+; pcmpgtd m0, m2
+; pand m0, m6
+; mova m6, m0
+; pand m5, m6
+; pand m2, m0
+; pandn m6, m1
+; pandn m0, m3
+; por m6, m5
+; por m0, m2
+; mova m1, m6
+; %endmacro
+
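+; LOAD reads four words and widens them to dwords (m7 is zero): full
+; 16-bit samples leave no headroom in word lanes, so the filter
+; arithmetic below is done on dwords.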
+%macro LOAD 2
+ movh %1, %2
+ punpcklwd %1, m7
+%endmacro
+
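+; FILTER mirrors the word-based loop in yadif-10.asm but operates on
+; dword lanes; each iteration therefore produces mmsize/4 output
+; samples, which PACK narrows back to words before the movh store.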
+%macro FILTER 3
+.loop%1:
+ pxor m7, m7
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddd m3, m2
+ psrad m3, 1
+ mova [rsp+ 0], m0
+ mova [rsp+16], m3
+ mova [rsp+32], m1
+ psubd m2, m4
+ PABS m2, m4
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4
+ psrld m2, 1
+ psrld m3, 1
+ PMAXSD m2, m3, m6
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4
+ psrld m3, 1
+ PMAXSD m2, m3, m6
+ mova [rsp+48], m2
+
+ paddd m1, m0
+ paddd m0, m0
+ psubd m0, m1
+ psrld m1, 1
+ PABS m0, m2
+
+ movu m2, [curq+t1-1*2]
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3
+ mova m3, m2
+ RSHIFT m3, 4
+ punpcklwd m2, m7
+ punpcklwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ psubd m0, [pd_1]
+
+ CHECK -2, 0
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48]
+ cmp DWORD r8m, 2
+ jge .end%1
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddd m2, m4
+ paddd m3, m5
+ psrld m2, 1
+ psrld m3, 1
+ mova m4, [rsp+ 0]
+ mova m5, [rsp+16]
+ mova m7, [rsp+32]
+ psubd m2, m4
+ psubd m3, m7
+ mova m0, m5
+ psubd m5, m4
+ psubd m0, m7
+ mova m4, m2
+ PMINSD m2, m3, m7
+ PMAXSD m3, m4, m7
+ PMAXSD m2, m5, m7
+ PMINSD m3, m5, m7
+ PMAXSD m2, m0, m7
+ PMINSD m3, m0, m7
+ pxor m4, m4
+ PMAXSD m6, m3, m7
+ psubd m4, m2
+ PMAXSD m6, m4, m7
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubd m2, m6
+ paddd m3, m6
+ PMAXSD m1, m2, m7
+ PMINSD m1, m3, m7
+ PACK m1
+
+ movh [dstq], m1
+ add dstq, mmsize/2
+ add prevq, mmsize/2
+ add curq, mmsize/2
+ add nextq, mmsize/2
+ sub DWORD r4m, mmsize/4
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp
+ mov r5, r6mp
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m
+ movsxd r6, DWORD r6m
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM sse4
+YADIF
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext
+YADIF
+%endif