154 files changed, 14055 insertions, 3134 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 3cec000e1d..2fa56b9aab 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -4,17 +4,21 @@ OBJS                                   += x86/constants.o               \
 # subsystems
 OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
 OBJS-$(CONFIG_AUDIODSP)                += x86/audiodsp_init.o
-OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp.o
+OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
+OBJS-$(CONFIG_FLAC_DECODER)            += x86/flacdsp_init.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += x86/flacdsp_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
 OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
 OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
+OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
+OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
@@ -33,41 +37,42 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 # decoders/encoders
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
-OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
-OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
+OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
 OBJS-$(CONFIG_MPEG4_DECODER)           += x86/xvididct_init.o
 OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
 OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
+OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
 OBJS-$(CONFIG_RV30_DECODER)            += x86/rv34dsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)            += x86/rv34dsp_init.o            \
                                           x86/rv40dsp_init.o
-OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc.o
-OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
+OBJS-$(CONFIG_SVQ1_ENCODER)            += x86/svq1enc_init.o
+OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
+OBJS-$(CONFIG_TTA_DECODER)             += x86/ttadsp_init.o
+OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
 OBJS-$(CONFIG_VP7_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP8_DECODER)             += x86/vp8dsp_init.o
 OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
 
 
 # GCC inline assembly optimizations
 # subsystems
-MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
-MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
-                                          x86/hpeldsp_mmx.o
+MMX-OBJS-$(CONFIG_DIRAC_DECODER)       += x86/dirac_dwt.o
 MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
-MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
-                                          x86/simple_idct.o
-MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/simple_idct.o
 
 # decoders/encoders
 MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o            \
                                           x86/xvididct_sse2.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
 
 
@@ -78,10 +83,17 @@ YASM-OBJS                              += x86/deinterlace.o             \
 # subsystems
 YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
 YASM-OBJS-$(CONFIG_AUDIODSP)           += x86/audiodsp.o
+YASM-OBJS-$(CONFIG_BLOCKDSP)           += x86/blockdsp.o
 YASM-OBJS-$(CONFIG_BSWAPDSP)           += x86/bswapdsp.o
 YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
+YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
+                                          x86/dwt_yasm.o
 YASM-OBJS-$(CONFIG_DNXHD_ENCODER)      += x86/dnxhdenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
+YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER)       += x86/flac_dsp_gpl.o
+endif
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
                                           x86/h264_chromamc_10bit.o
@@ -100,6 +112,9 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
+YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
+YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
+YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
 YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
@@ -112,14 +127,22 @@ YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 # decoders/encoders
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
-YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o               \
+                                          x86/hevc_res_add.o
+YASM-OBJS-$(CONFIG_MLP_DECODER)        += x86/mlpdsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
+YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o                 \
                                           x86/rv40dsp.o
+YASM-OBJS-$(CONFIG_SVQ1_ENCODER)       += x86/svq1enc.o
+YASM-OBJS-$(CONFIG_TRUEHD_DECODER)     += x86/mlpdsp.o
+YASM-OBJS-$(CONFIG_TTA_DECODER)        += x86/ttadsp.o
+YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
@@ -127,4 +150,8 @@ YASM-OBJS-$(CONFIG_VP7_DECODER)        += x86/vp8dsp.o                  \
                                           x86/vp8dsp_loopfilter.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o                  \
                                           x86/vp8dsp_loopfilter.o
-YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9dsp.o
+YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o            \
+                                          x86/vp9itxfm.o                \
+                                          x86/vp9lpf.o                  \
+                                          x86/vp9mc.o
+YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 817d5a319c..b24441614b 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -2,20 +2,20 @@
 ;* x86-optimized AC-3 DSP functions
 ;* Copyright (c) 2011 Justin Ruggles
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index cd638b9228..30a85f996e 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -2,20 +2,20 @@
  * x86-optimized AC-3 DSP functions
  * Copyright (c) 2011 Justin Ruggles
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,6 +64,11 @@ void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 
+#if ARCH_X86_32 && defined(__INTEL_COMPILER)
+#       undef HAVE_7REGS
+#       define HAVE_7REGS 0
+#endif
+
 #if HAVE_SSE_INLINE && HAVE_7REGS
 
 #define IF1(x) x
diff --git a/libavcodec/x86/audiodsp.asm b/libavcodec/x86/audiodsp.asm
index f2e831df17..273b9ef660 100644
--- a/libavcodec/x86/audiodsp.asm
+++ b/libavcodec/x86/audiodsp.asm
@@ -2,20 +2,20 @@
 ;* optimized audio functions
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -40,15 +40,11 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m1
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m2
-    paddd   m2, m0
-    pshuflw m0, m2, 0x4e
-%else
-    pshufw  m0, m2, 0x4e
-%endif
-    paddd   m2, m0
+    HADDD   m2, m0
     movd   eax, m2
+%if mmsize == 8
+    emms
+%endif
     RET
 %endmacro
 
@@ -80,17 +76,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     SPLATD    m4
     SPLATD    m5
 .loop:
-%assign %%i 1
+%assign %%i 0
 %rep %2
-    mova      m0,  [srcq+mmsize*0*%%i]
-    mova      m1,  [srcq+mmsize*1*%%i]
-    mova      m2,  [srcq+mmsize*2*%%i]
-    mova      m3,  [srcq+mmsize*3*%%i]
+    mova      m0,  [srcq+mmsize*(0+%%i)]
+    mova      m1,  [srcq+mmsize*(1+%%i)]
+    mova      m2,  [srcq+mmsize*(2+%%i)]
+    mova      m3,  [srcq+mmsize*(3+%%i)]
 %if %3
-    mova      m7,  [srcq+mmsize*4*%%i]
-    mova      m8,  [srcq+mmsize*5*%%i]
-    mova      m9,  [srcq+mmsize*6*%%i]
-    mova      m10, [srcq+mmsize*7*%%i]
+    mova      m7,  [srcq+mmsize*(4+%%i)]
+    mova      m8,  [srcq+mmsize*(5+%%i)]
+    mova      m9,  [srcq+mmsize*(6+%%i)]
+    mova      m10, [srcq+mmsize*(7+%%i)]
 %endif
     CLIPD  m0,  m4, m5, m6
     CLIPD  m1,  m4, m5, m6
@@ -102,17 +98,17 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     CLIPD  m9,  m4, m5, m6
     CLIPD  m10, m4, m5, m6
 %endif
-    mova  [dstq+mmsize*0*%%i], m0
-    mova  [dstq+mmsize*1*%%i], m1
-    mova  [dstq+mmsize*2*%%i], m2
-    mova  [dstq+mmsize*3*%%i], m3
+    mova  [dstq+mmsize*(0+%%i)], m0
+    mova  [dstq+mmsize*(1+%%i)], m1
+    mova  [dstq+mmsize*(2+%%i)], m2
+    mova  [dstq+mmsize*(3+%%i)], m3
 %if %3
-    mova  [dstq+mmsize*4*%%i], m7
-    mova  [dstq+mmsize*5*%%i], m8
-    mova  [dstq+mmsize*6*%%i], m9
-    mova  [dstq+mmsize*7*%%i], m10
+    mova  [dstq+mmsize*(4+%%i)], m7
+    mova  [dstq+mmsize*(5+%%i)], m8
+    mova  [dstq+mmsize*(6+%%i)], m9
+    mova  [dstq+mmsize*(7+%%i)], m10
 %endif
-%assign %%i %%i+1
+%assign %%i %%i+4*(%3+1)
 %endrep
     add     srcq, mmsize*4*(%2+%3)
     add     dstq, mmsize*4*(%2+%3)
@@ -135,3 +131,47 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
 %else
 VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif
+
+;-----------------------------------------------------
+;void ff_vector_clipf(float *dst, const float *src,
+;                     float min, float max, int len)
+;-----------------------------------------------------
+INIT_XMM sse
+%if UNIX64
+cglobal vector_clipf, 3,3,6, dst, src, len
+%else
+cglobal vector_clipf, 5,5,6, dst, src, min, max, len
+%endif
+%if WIN64
+    SWAP 0, 2
+    SWAP 1, 3
+%elif ARCH_X86_32
+    movss   m0, minm
+    movss   m1, maxm
+%endif
+    SPLATD  m0
+    SPLATD  m1
+        shl lend, 2
+        add srcq, lenq
+        add dstq, lenq
+        neg lenq
+.loop:
+    mova    m2,  [srcq+lenq+mmsize*0]
+    mova    m3,  [srcq+lenq+mmsize*1]
+    mova    m4,  [srcq+lenq+mmsize*2]
+    mova    m5,  [srcq+lenq+mmsize*3]
+    maxps   m2, m0
+    maxps   m3, m0
+    maxps   m4, m0
+    maxps   m5, m0
+    minps   m2, m1
+    minps   m3, m1
+    minps   m4, m1
+    minps   m5, m1
+    mova    [dstq+lenq+mmsize*0], m2
+    mova    [dstq+lenq+mmsize*1], m3
+    mova    [dstq+lenq+mmsize*2], m4
+    mova    [dstq+lenq+mmsize*3], m5
+    add     lenq, mmsize*4
+    jl .loop
+    REP_RET
diff --git a/libavcodec/x86/audiodsp.h b/libavcodec/x86/audiodsp.h
deleted file mode 100644
index 321056b8b7..0000000000
--- a/libavcodec/x86/audiodsp.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_AUDIODSP_H
-#define AVCODEC_X86_AUDIODSP_H
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len);
-
-#endif /* AVCODEC_X86_AUDIODSP_H */
diff --git a/libavcodec/x86/audiodsp_init.c b/libavcodec/x86/audiodsp_init.c
index 743f5a3699..a2ce231f32 100644
--- a/libavcodec/x86/audiodsp_init.c
+++ b/libavcodec/x86/audiodsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,7 +24,6 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/audiodsp.h"
-#include "audiodsp.h"
 
 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
@@ -39,6 +38,8 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
+void ff_vector_clipf_sse(float *dst, const float *src,
+                         float min, float max, int len);
 
 av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
 {
@@ -50,7 +51,7 @@ av_cold void ff_audiodsp_init_x86(AudioDSPContext *c)
     if (EXTERNAL_MMXEXT(cpu_flags))
         c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
 
-    if (INLINE_SSE(cpu_flags))
+    if (EXTERNAL_SSE(cpu_flags))
         c->vector_clipf = ff_vector_clipf_sse;
 
     if (EXTERNAL_SSE2(cpu_flags)) {
diff --git a/libavcodec/x86/audiodsp_mmx.c b/libavcodec/x86/audiodsp_mmx.c
deleted file mode 100644
index cb550598f9..0000000000
--- a/libavcodec/x86/audiodsp_mmx.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/x86/asm.h"
-#include "audiodsp.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_vector_clipf_sse(float *dst, const float *src,
-                         float min, float max, int len)
-{
-    x86_reg i = (len - 16) * 4;
-    __asm__ volatile (
-        "movss          %3, %%xmm4      \n\t"
-        "movss          %4, %%xmm5      \n\t"
-        "shufps $0, %%xmm4, %%xmm4      \n\t"
-        "shufps $0, %%xmm5, %%xmm5      \n\t"
-        "1:                             \n\t"
-        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
-        "movaps 16(%2, %0), %%xmm1      \n\t"
-        "movaps 32(%2, %0), %%xmm2      \n\t"
-        "movaps 48(%2, %0), %%xmm3      \n\t"
-        "maxps      %%xmm4, %%xmm0      \n\t"
-        "maxps      %%xmm4, %%xmm1      \n\t"
-        "maxps      %%xmm4, %%xmm2      \n\t"
-        "maxps      %%xmm4, %%xmm3      \n\t"
-        "minps      %%xmm5, %%xmm0      \n\t"
-        "minps      %%xmm5, %%xmm1      \n\t"
-        "minps      %%xmm5, %%xmm2      \n\t"
-        "minps      %%xmm5, %%xmm3      \n\t"
-        "movaps     %%xmm0,   (%1, %0)  \n\t"
-        "movaps     %%xmm1, 16(%1, %0)  \n\t"
-        "movaps     %%xmm2, 32(%1, %0)  \n\t"
-        "movaps     %%xmm3, 48(%1, %0)  \n\t"
-        "sub           $64, %0          \n\t"
-        "jge            1b              \n\t"
-        : "+&r" (i)
-        : "r" (dst), "r" (src), "m" (min), "m" (max)
-        : "memory");
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/blockdsp.asm b/libavcodec/x86/blockdsp.asm
new file mode 100644
index 0000000000..c793858861
--- /dev/null
+++ b/libavcodec/x86/blockdsp.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* SIMD-optimized clear block functions
+;* Copyright (c) 2002 Michael Niedermayer
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2009 Fiona Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+;----------------------------------------
+; void ff_clear_block(int16_t *blocks);
+;----------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline store loops
+%macro CLEAR_BLOCK 2
+cglobal clear_block, 1, 1, %1, blocks
+    ZERO  m0, m0
+%assign %%i 0
+%rep %2
+    mova  [blocksq+mmsize*(0+%%i)], m0
+    mova  [blocksq+mmsize*(1+%%i)], m0
+    mova  [blocksq+mmsize*(2+%%i)], m0
+    mova  [blocksq+mmsize*(3+%%i)], m0
+    mova  [blocksq+mmsize*(4+%%i)], m0
+    mova  [blocksq+mmsize*(5+%%i)], m0
+    mova  [blocksq+mmsize*(6+%%i)], m0
+    mova  [blocksq+mmsize*(7+%%i)], m0
+%assign %%i %%i+8
+%endrep
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCK 0, 2
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCK 1, 1
+
+;-----------------------------------------
+; void ff_clear_blocks(int16_t *blocks);
+;-----------------------------------------
+; %1 = number of xmm registers used
+%macro CLEAR_BLOCKS 1
+cglobal clear_blocks, 1, 2, %1, blocks, len
+    add   blocksq, 768
+    mov      lenq, -768
+    ZERO       m0, m0
+.loop
+    mova  [blocksq+lenq+mmsize*0], m0
+    mova  [blocksq+lenq+mmsize*1], m0
+    mova  [blocksq+lenq+mmsize*2], m0
+    mova  [blocksq+lenq+mmsize*3], m0
+    mova  [blocksq+lenq+mmsize*4], m0
+    mova  [blocksq+lenq+mmsize*5], m0
+    mova  [blocksq+lenq+mmsize*6], m0
+    mova  [blocksq+lenq+mmsize*7], m0
+    add   lenq, mmsize*8
+    js .loop
+    RET
+%endmacro
+
+INIT_MMX mmx
+%define ZERO pxor
+CLEAR_BLOCKS 0
+INIT_XMM sse
+%define ZERO xorps
+CLEAR_BLOCKS 1
diff --git a/libavcodec/x86/blockdsp.c b/libavcodec/x86/blockdsp.c
deleted file mode 100644
index b5294242ab..0000000000
--- a/libavcodec/x86/blockdsp.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/internal.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/blockdsp.h"
-#include "libavcodec/version.h"
-
-#if HAVE_INLINE_ASM
-
-#define CLEAR_BLOCKS(name, n)                           \
-static void name(int16_t *blocks)                       \
-{                                                       \
-    __asm__ volatile (                                  \
-        "pxor %%mm7, %%mm7              \n\t"           \
-        "mov     %1,        %%"REG_a"   \n\t"           \
-        "1:                             \n\t"           \
-        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
-        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
-        "add    $32, %%"REG_a"          \n\t"           \
-        "js      1b                     \n\t"           \
-        :: "r"(((uint8_t *) blocks) + 128 * n),         \
-           "i"(-128 * n)                                \
-        : "%"REG_a);                                    \
-}
-CLEAR_BLOCKS(clear_blocks_mmx, 6)
-CLEAR_BLOCKS(clear_block_mmx, 1)
-
-static void clear_block_sse(int16_t *block)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0          \n"
-        "movaps %%xmm0,    (%0)         \n"
-        "movaps %%xmm0,  16(%0)         \n"
-        "movaps %%xmm0,  32(%0)         \n"
-        "movaps %%xmm0,  48(%0)         \n"
-        "movaps %%xmm0,  64(%0)         \n"
-        "movaps %%xmm0,  80(%0)         \n"
-        "movaps %%xmm0,  96(%0)         \n"
-        "movaps %%xmm0, 112(%0)         \n"
-        :: "r" (block)
-        : "memory");
-}
-
-static void clear_blocks_sse(int16_t *blocks)
-{
-    __asm__ volatile (
-        "xorps  %%xmm0, %%xmm0              \n"
-        "mov        %1,         %%"REG_a"   \n"
-        "1:                                 \n"
-        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
-        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
-        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
-        "add      $128,         %%"REG_a"   \n"
-        "js         1b                      \n"
-        :: "r"(((uint8_t *) blocks) + 128 * 6), "i"(-128 * 6)
-        : "%"REG_a);
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#if FF_API_XVMC
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
-                                  AVCodecContext *avctx)
-#else
-av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
-#endif /* FF_API_XVMC */
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (!high_bit_depth) {
-        if (INLINE_MMX(cpu_flags)) {
-            c->clear_block  = clear_block_mmx;
-            c->clear_blocks = clear_blocks_mmx;
-        }
-
-#if FF_API_XVMC
-FF_DISABLE_DEPRECATION_WARNINGS
-    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
-    if (CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)
-        return;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_XVMC */
-
-        if (INLINE_SSE(cpu_flags)) {
-            c->clear_block  = clear_block_sse;
-            c->clear_blocks = clear_blocks_sse;
-        }
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/blockdsp_init.c b/libavcodec/x86/blockdsp_init.c
new file mode 100644
index 0000000000..7780184af6
--- /dev/null
+++ b/libavcodec/x86/blockdsp_init.c
@@ -0,0 +1,60 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/blockdsp.h"
+#include "libavcodec/version.h"
+
+void ff_clear_block_mmx(int16_t *block);
+void ff_clear_block_sse(int16_t *block);
+void ff_clear_blocks_mmx(int16_t *blocks);
+void ff_clear_blocks_sse(int16_t *blocks);
+
+#if FF_API_XVMC
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth,
+                                  AVCodecContext *avctx)
+#else
+av_cold void ff_blockdsp_init_x86(BlockDSPContext *c, unsigned high_bit_depth)
+#endif /* FF_API_XVMC */
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (!high_bit_depth) {
+        if (EXTERNAL_MMX(cpu_flags)) {
+            c->clear_block  = ff_clear_block_mmx;
+            c->clear_blocks = ff_clear_blocks_mmx;
+        }
+
+    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
+    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
+        return;
+
+        if (EXTERNAL_SSE(cpu_flags)) {
+            c->clear_block  = ff_clear_block_sse;
+            c->clear_blocks = ff_clear_blocks_sse;
+        }
+    }
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
index 17a6cb1be3..ec060c93b6 100644
--- a/libavcodec/x86/bswapdsp.asm
+++ b/libavcodec/x86/bswapdsp.asm
@@ -1,21 +1,23 @@
 ;******************************************************************************
 ;* optimized bswap buffer functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -24,6 +26,8 @@
 SECTION_RODATA
 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
+cextern pb_80
+
 SECTION_TEXT
 
 ; %1 = aligned/unaligned
@@ -90,6 +94,7 @@ cglobal bswap32_buf, 3,4,3
 cglobal bswap32_buf, 3,4,5
     mov      r3, r1
 %endif
+    or       r3, r0
     and      r3, 15
     jz       .start_align
     BSWAP_LOOPS  u
diff --git a/libavcodec/x86/bswapdsp_init.c b/libavcodec/x86/bswapdsp_init.c
index ba40f2dbe1..c042e56371 100644
--- a/libavcodec/x86/bswapdsp_init.c
+++ b/libavcodec/x86/bswapdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/cabac.h b/libavcodec/x86/cabac.h
index d1701bf071..3510336f95 100644
--- a/libavcodec/x86/cabac.h
+++ b/libavcodec/x86/cabac.h
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,8 +27,28 @@
 #include "libavutil/x86/asm.h"
 #include "config.h"
 
+#if   (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
+   || (                  !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)\
+   || (defined(__INTEL_COMPILER) && defined(_MSC_VER))
+#       define BROKEN_COMPILER 1
+#else
+#       define BROKEN_COMPILER 0
+#endif
+
 #if HAVE_INLINE_ASM
 
+#ifndef UNCHECKED_BITSTREAM_READER
+#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
+#endif
+
+#if UNCHECKED_BITSTREAM_READER
+#define END_CHECK(end) ""
+#else
+#define END_CHECK(end) \
+        "cmp    "end"       , %%"REG_c"                                 \n\t"\
+        "jge    1f                                                      \n\t"
+#endif
+
 #ifdef BROKEN_RELOCATIONS
 #define TABLES_ARG , "r"(tables)
 
@@ -73,8 +93,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         "jnz    2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"REG_c") , "tmp"                                     \n\t"\
@@ -92,7 +111,8 @@
         "2:                                                             \n\t"
 
 #else /* BROKEN_RELOCATIONS */
-#define TABLES_ARG
+#define TABLES_ARG NAMED_CONSTRAINTS_ARRAY_ADD(ff_h264_cabac_tables)
+#define RIP_ARG
 
 #if HAVE_FAST_CMOV
 #define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
@@ -134,8 +154,7 @@
         "test   "lowword"   , "lowword"                                 \n\t"\
         " jnz   2f                                                      \n\t"\
         "mov    "byte"      , %%"REG_c"                                 \n\t"\
-        "cmp    "end"       , %%"REG_c"                                 \n\t"\
-        "jge    1f                                                      \n\t"\
+        END_CHECK(end)\
         "add"OPSIZE" $2     , "byte"                                    \n\t"\
         "1:                                                             \n\t"\
         "movzwl (%%"REG_c")     , "tmp"                                 \n\t"\
@@ -154,8 +173,7 @@
 
 #endif /* BROKEN_RELOCATIONS */
 
-
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define get_cabac_inline get_cabac_inline_x86
 static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                  uint8_t *const state)
@@ -167,6 +185,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -178,17 +197,19 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                              AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                              AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                              "%8")
-        : "=&r"(bit), "+&r"(c->low), "+&r"(c->range), "=&q"(tmp)
+        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
         : "r"(state), "r"(c),
           "i"(offsetof(CABACContext, bytestream)),
           "i"(offsetof(CABACContext, bytestream_end))
           TABLES_ARG
+          ,"1"(c->low), "2"(c->range)
         : "%"REG_c, "memory"
     );
     return bit & 1;
 }
-#endif /* HAVE_7REGS */
+#endif /* HAVE_7REGS && !BROKEN_COMPILER */
 
+#if !BROKEN_COMPILER
 #define get_cabac_bypass_sign get_cabac_bypass_sign_x86
 static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
 {
@@ -199,7 +220,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "xor           %%edx, %%ecx     \n\t"
@@ -211,10 +232,16 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
         "movzwl         (%1), %%edx     \n\t"
         "bswap         %%edx            \n\t"
         "shrl            $15, %%edx     \n\t"
+#if UNCHECKED_BITSTREAM_READER
+        "add              $2, %1        \n\t"
+        "addl          %%edx, %%eax     \n\t"
+        "mov              %1, %c4(%2)   \n\t"
+#else
         "addl          %%edx, %%eax     \n\t"
         "cmp         %c5(%2), %1        \n\t"
         "jge              1f            \n\t"
         "add"OPSIZE"      $2, %c4(%2)   \n\t"
+#endif
         "1:                             \n\t"
         "movl          %%eax, %c3(%2)   \n\t"
 
@@ -240,7 +267,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
         "shl             $17, %k1       \n\t"
         "add           %%eax, %%eax     \n\t"
         "sub             %k1, %%eax     \n\t"
-        "cltd                           \n\t"
+        "cdq                            \n\t"
         "and           %%edx, %k1       \n\t"
         "add             %k1, %%eax     \n\t"
         "inc           %%edx            \n\t"
@@ -268,6 +295,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
     );
     return res;
 }
+#endif /* !BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_CABAC_H */
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index b323a105f2..190f6eed66 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -5,20 +5,20 @@
  * MMX-optimized DSP functions, based on H.264 optimizations by
  * Michael Niedermayer and Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -139,7 +139,7 @@ static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
 static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     int i;
-    DECLARE_ALIGNED(8, int16_t, b2)[64];
+    DECLARE_ALIGNED(16, int16_t, b2)[64];
 
     for(i=0; i<2; i++){
         DECLARE_ALIGNED(8, uint64_t, tmp);
@@ -198,7 +198,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         );
     }
 
-    ff_add_pixels_clamped_mmx(b2, dst, stride);
+    ff_add_pixels_clamped(b2, dst, stride);
 }
 
 #endif /* HAVE_MMX_INLINE */
@@ -212,10 +212,10 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
  ****************************************************************************/
 
 /* vertical filter [-1 -2 96 42 -7  0]  */
-#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
         "psllw $3, "#E"             \n\t"\
@@ -230,35 +230,35 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "psubw "#B", %%mm6          \n\t"\
         "psraw $1, "#B"             \n\t"\
         "psubw "#A", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -1  5  5 -1  0]  */
-#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "paddw "#D", %%mm6          \n\t"\
-        "pmullw %5, %%mm6           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm6\n\t"\
         "add %2, %0                 \n\t"\
         "punpcklbw %%mm7, "#F"      \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psubw "#E", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $3, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
         "add %3, %1                 \n\t"
 
 /* vertical filter [ 0 -7 42 96 -2 -1]  */
-#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP,ADD, MUL1, MUL2) \
         "movd (%0), "#F"            \n\t"\
         "movq "#C", %%mm6           \n\t"\
         "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
         "movq "#D", %%mm7           \n\t"\
-        "pmullw %5, %%mm7           \n\t"\
+        "pmullw "MANGLE(MUL1)", %%mm7\n\t"\
         "psllw $3, "#B"             \n\t"\
         "psubw "#B", %%mm6          \n\t"\
         "psraw $3, "#B"             \n\t"\
@@ -271,7 +271,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "psubw "#E", %%mm6          \n\t"\
         "psraw $1, "#E"             \n\t"\
         "psubw "#F", %%mm6          \n\t"\
-        "paddw %4, %%mm6            \n\t"\
+        "paddw "MANGLE(ADD)", %%mm6 \n\t"\
         "psraw $7, %%mm6            \n\t"\
         "packuswb %%mm6, %%mm6      \n\t"\
         OP(%%mm6, (%1), A, d)            \
@@ -300,32 +300,34 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
         "punpcklbw %%mm7, %%mm2     \n\t"\
         "punpcklbw %%mm7, %%mm3     \n\t"\
         "punpcklbw %%mm7, %%mm4     \n\t"\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
         \
         : "+a"(src), "+c"(dst)\
-        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
+        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
         : "memory"\
      );\
      if(h==16){\
         __asm__ volatile(\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
-            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
-            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
-            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
-            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
-            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
-            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, ADD, MUL1, MUL2)\
+            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, ADD, MUL1, MUL2)\
             \
            : "+a"(src), "+c"(dst)\
-           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD),  "m"(MUL1)\
+           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride)\
+             NAMED_CONSTRAINTS_ADD(ADD,MUL1,MUL2)\
            : "memory"\
         );\
      }\
@@ -338,7 +340,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
     int h=8;\
     __asm__ volatile(\
         "pxor %%mm7, %%mm7          \n\t"\
-        "movq %5, %%mm6             \n\t"\
+        "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
         "1:                         \n\t"\
         "movq    (%0), %%mm0        \n\t"\
         "movq   1(%0), %%mm2        \n\t"\
@@ -364,7 +366,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
         "paddw %%mm3, %%mm5         \n\t"\
         "psubw %%mm2, %%mm0         \n\t"\
         "psubw %%mm5, %%mm1         \n\t"\
-        "movq %6, %%mm5             \n\t"\
+        "movq "MANGLE(ff_pw_4)", %%mm5\n\t"\
         "paddw %%mm5, %%mm0         \n\t"\
         "paddw %%mm5, %%mm1         \n\t"\
         "psraw $3, %%mm0            \n\t"\
@@ -376,7 +378,8 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, const uint8_t *src, int
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
         : "+a"(src), "+c"(dst), "+m"(h)\
-        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_4,ff_pw_5)\
         : "memory"\
     );\
 }\
@@ -386,7 +389,7 @@ static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, const uint8
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
-  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
+  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_42)        \
 }\
 \
 static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h){\
@@ -459,7 +462,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, const uin
 
 #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
 
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
 static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride)
 {
@@ -472,6 +475,12 @@ static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels8_mmx(dst, src, stride, 8);
 }
 
+static void avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
 static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
 {
@@ -484,18 +493,40 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, const uint8_t *src,
     ff_avg_pixels16_mmx(dst, src, stride, 16);
 }
 
+static void avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                        ptrdiff_t stride)
+{
+    ff_avg_pixels16_mmxext(dst, src, stride, 16);
+}
+
+static void put_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_put_pixels16_sse2(dst, src, stride, 16);
+}
+
+static void avg_cavs_qpel16_mc00_sse2(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+#endif
+
 static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                      AVCodecContext *avctx)
 {
+#if HAVE_MMX_EXTERNAL
     c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
     c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
     c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+#endif
 
+#if HAVE_MMX_INLINE
     c->cavs_idct8_add = cavs_idct8_add_mmx;
     c->idct_perm      = FF_IDCT_PERM_TRANSPOSE;
-}
 #endif /* HAVE_MMX_INLINE */
+}
 
 #define DSPFUNC(PFX, IDX, NUM, EXT)                                                       \
     c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -511,15 +542,6 @@ CAVS_MC(put_,  8, mmxext)
 CAVS_MC(put_, 16, mmxext)
 CAVS_MC(avg_,  8, mmxext)
 CAVS_MC(avg_, 16, mmxext)
-
-static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
-                                        AVCodecContext *avctx)
-{
-    DSPFUNC(put, 0, 16, mmxext);
-    DSPFUNC(put, 1,  8, mmxext);
-    DSPFUNC(avg, 0, 16, mmxext);
-    DSPFUNC(avg, 1,  8, mmxext);
-}
 #endif /* HAVE_MMXEXT_INLINE */
 
 #if HAVE_AMD3DNOW_INLINE
@@ -543,18 +565,31 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
 
 av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
 {
-#if HAVE_MMX_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
-        cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+    cavsdsp_init_mmx(c, avctx);
 #if HAVE_AMD3DNOW_INLINE
     if (INLINE_AMD3DNOW(cpu_flags))
         cavsdsp_init_3dnow(c, avctx);
 #endif /* HAVE_AMD3DNOW_INLINE */
 #if HAVE_MMXEXT_INLINE
-    if (INLINE_MMXEXT(cpu_flags))
-        cavsdsp_init_mmxext(c, avctx);
-#endif /* HAVE_MMXEXT_INLINE */
+    if (INLINE_MMXEXT(cpu_flags)) {
+        DSPFUNC(put, 0, 16, mmxext);
+        DSPFUNC(put, 1,  8, mmxext);
+        DSPFUNC(avg, 0, 16, mmxext);
+        DSPFUNC(avg, 1,  8, mmxext);
+    }
+#endif
+#if HAVE_MMX_EXTERNAL
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmxext;
+        c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmxext;
+    }
+#endif
+#if HAVE_SSE2_EXTERNAL
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
+        c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+    }
+#endif
 }
diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 5b8d1b224f..aa3df00633 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,20 +1,20 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,8 +22,6 @@
 #include "libavutil/x86/asm.h" // for xmm_reg
 #include "constants.h"
 
-DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
-
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
@@ -43,11 +41,21 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x004
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pw_256)  = { 0x0100010001000100ULL, 0x0100010001000100ULL,
+                                                    0x0100010001000100ULL, 0x0100010001000100ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1024) = { 0x0400040004000400ULL, 0x0400040004000400ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL };
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_m1)   = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
 
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL,
+                                                    0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL,
+                                                    0x0303030303030303ULL, 0x0303030303030303ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg)  = { 0x8000000080000000ULL, 0x8000000080000000ULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index f38fbe3425..e75fff9b9a 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -1,20 +1,20 @@
 /*
  * MMX/SSE constants used across x86 dsp optimizations.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,12 +25,13 @@
 
 #include "libavutil/x86/asm.h"
 
-extern const uint64_t ff_wtwo;
-
+extern const xmm_reg  ff_pw_1;
+extern const xmm_reg  ff_pw_2;
 extern const xmm_reg  ff_pw_3;
 extern const xmm_reg  ff_pw_4;
 extern const xmm_reg  ff_pw_5;
 extern const xmm_reg  ff_pw_8;
+extern const xmm_reg  ff_pw_9;
 extern const uint64_t ff_pw_15;
 extern const xmm_reg  ff_pw_16;
 extern const xmm_reg  ff_pw_18;
@@ -42,10 +43,18 @@ extern const xmm_reg  ff_pw_64;
 extern const uint64_t ff_pw_96;
 extern const uint64_t ff_pw_128;
 extern const uint64_t ff_pw_255;
+extern const xmm_reg  ff_pw_512;
+extern const xmm_reg  ff_pw_1024;
+extern const xmm_reg  ff_pw_2048;
+extern const xmm_reg  ff_pw_8192;
+extern const xmm_reg  ff_pw_m1;
 
-extern const xmm_reg  ff_pb_1;
-extern const xmm_reg  ff_pb_3;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_3;
+extern const xmm_reg  ff_pb_80;
 extern const xmm_reg  ff_pb_F8;
 extern const uint64_t ff_pb_FC;
 
+extern const xmm_reg  ff_ps_neg;
+
 #endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
deleted file mode 100644
index 11d45ae61c..0000000000
--- a/libavcodec/x86/dca.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_X86_DCA_H
-#define AVCODEC_X86_DCA_H
-
-#include "config.h"
-
-#if ARCH_X86_64 && HAVE_SSE2_INLINE
-# include "libavutil/x86/asm.h"
-# include "libavutil/mem.h"
-#include "libavcodec/dcadsp.h"
-
-# define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
-                                     float *dst, const int8_t *src, int scale)
-{
-    DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
-    __asm__ volatile (
-        "cvtsi2ss        %2, %%xmm0 \n\t"
-        "mulss           %3, %%xmm0 \n\t"
-        "movq          (%1), %%xmm1 \n\t"
-        "punpcklbw   %%xmm1, %%xmm1 \n\t"
-        "movaps      %%xmm1, %%xmm2 \n\t"
-        "punpcklwd   %%xmm1, %%xmm1 \n\t"
-        "punpckhwd   %%xmm2, %%xmm2 \n\t"
-        "psrad          $24, %%xmm1 \n\t"
-        "psrad          $24, %%xmm2 \n\t"
-        "shufps  $0, %%xmm0, %%xmm0 \n\t"
-        "cvtdq2ps    %%xmm1, %%xmm1 \n\t"
-        "cvtdq2ps    %%xmm2, %%xmm2 \n\t"
-        "mulps       %%xmm0, %%xmm1 \n\t"
-        "mulps       %%xmm0, %%xmm2 \n\t"
-        "movaps      %%xmm1,  0(%0) \n\t"
-        "movaps      %%xmm2, 16(%0) \n\t"
-        :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
-        XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
-    );
-}
-
-#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */
-
-#endif /* AVCODEC_X86_DCA_H */
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index c42ee23faf..1ac237885a 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -2,20 +2,20 @@
 ;* SSE-optimized functions for the DCA decoder
 ;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -132,11 +132,16 @@ DECODE_HF
     mulps       va, %2
     mulps       vb, %2
 %if %0 == 3
+%if cpuflag(fma3)
+    fmaddps     va, m4, %3, va
+    fmaddps     vb, m0, %3, vb
+%else
     mulps       m4, %3
     mulps       m0, %3
     addps       va, m4
     addps       vb, m0
 %endif
+%endif
     ; va = va1 va2 va3 va4
     ; vb = vb1 vb2 vb3 vb4
 %if %1
@@ -148,7 +153,7 @@ DECODE_HF
     addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
     movhlps     vb, m4 ; va1+3  vb1+3
     addps       vb, m4 ; va0..4 vb0..4
-    movh    [outq + count], vb
+    movlps  [outq + count], vb
 %if %1
     sub       cf0q, 8*NUM_COEF
 %endif
@@ -198,6 +203,10 @@ cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
 INIT_XMM sse
 DCA_LFE_FIR 0
 DCA_LFE_FIR 1
+%if HAVE_FMA3_EXTERNAL
+INIT_XMM fma3
+DCA_LFE_FIR 0
+%endif
 
 %macro SETZERO 1
 %if cpuflag(sse2) && notcpuflag(avx)
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 9acb818c94..bb86c26037 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,6 +34,7 @@ void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS
                        int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
 void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
 void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
+void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs);
 
 av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
 {
@@ -54,6 +55,10 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
     if (EXTERNAL_SSE4(cpu_flags)) {
         s->decode_hf = ff_decode_hf_sse4;
     }
+
+    if (EXTERNAL_FMA3(cpu_flags)) {
+        s->lfe_fir[0]        = ff_dca_lfe_fir0_fma3;
+    }
 }
 
 
diff --git a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c
index 9d4aaf5415..3414cb0cb0 100644
--- a/libavcodec/x86/dct-test.c
+++ b/libavcodec/x86/dct-test.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,27 @@
 #include "xvididct.h"
 #include "simple_idct.h"
 
+#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
+void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
+                                int16_t *block, int16_t *qmat);
+
+static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
+    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
+    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
+    int i;
+
+    for(i=0; i<64; i++){
+        qmat[i]=4;
+        tmp[i]= dst[i];
+    }
+    ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
+
+    for(i=0; i<64; i++) {
+         dst[i] -= 512;
+    }
+}
+#endif
+
 static const struct algo fdct_tab_arch[] = {
 #if HAVE_MMX_INLINE
     { "MMX",    ff_fdct_mmx,    FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
@@ -48,6 +69,9 @@ static const struct algo idct_tab_arch[] = {
 #endif
 #if HAVE_SSE2_INLINE
     { "XVID-SSE2",   ff_xvid_idct_sse2,   FF_IDCT_PERM_SSE2,   AV_CPU_FLAG_SSE2,   1 },
+#if ARCH_X86_64 && HAVE_YASM
+    { "PR-SSE2",     ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
+#endif
 #endif
 #endif /* CONFIG_MPEG4_DECODER */
     { 0 }
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 9c147b9c00..c70f6c9c49 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -2,20 +2,20 @@
 ;* 32 point SSE-optimized DCT transform
 ;* Copyright (c) 2010 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -192,6 +192,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 
 INIT_YMM avx
 SECTION_TEXT
+%if HAVE_AVX_EXTERNAL
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
 cglobal dct32_float, 2,3,8, out, in, tmp
     ; pass 1
@@ -264,6 +265,7 @@ cglobal dct32_float, 2,3,8, out, in, tmp
 INIT_XMM
     PASS6_AND_PERMUTE
     RET
+%endif
 
 %if ARCH_X86_64
 %define SPILL SWAP
@@ -482,7 +484,9 @@ cglobal dct32_float, 2, 3, 16, out, in, tmp
 %endif
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DCT32_FUNC
+%endif
 INIT_XMM sse2
 DCT32_FUNC
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index 7bda5e81b6..30c8f12bf2 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,7 +30,7 @@ av_cold void ff_dct_init_x86(DCTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE(cpu_flags))
+    if (ARCH_X86_32 && EXTERNAL_SSE(cpu_flags))
         s->dct32 = ff_dct32_float_sse;
     if (EXTERNAL_SSE2(cpu_flags))
         s->dct32 = ff_dct32_float_sse2;
diff --git a/libavcodec/x86/deinterlace.asm b/libavcodec/x86/deinterlace.asm
index 70d000e0db..baa9249db2 100644
--- a/libavcodec/x86/deinterlace.asm
+++ b/libavcodec/x86/deinterlace.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Vitor Sessak
 ;* Copyright (c) 2002 Michael Niedermayer
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/dirac_dwt.c b/libavcodec/x86/dirac_dwt.c
new file mode 100644
index 0000000000..3c51ea6ffa
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.c
@@ -0,0 +1,202 @@
+/*
+ * MMX optimized discrete wavelet transform
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "dirac_dwt.h"
+
+#define COMPOSE_VERTICAL(ext, align) \
+void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
+void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
+void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
+void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
+\
+static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
+\
+    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
+} \
+\
+static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+                                           IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+\
+static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
+                                          IDWTELEM *b3, IDWTELEM *b4, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) \
+        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
+\
+    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
+} \
+static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
+{ \
+    int i, width_align = width&~(align-1); \
+\
+    for(i=width_align; i<width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+    } \
+\
+    ff_vertical_compose_haar##ext(b0, b1, width_align); \
+} \
+static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar0i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = tmp[x];\
+        b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
+    }\
+}\
+static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
+{\
+    int w2= w>>1;\
+    int x= w2 - (w2&(align-1));\
+    ff_horizontal_compose_haar1i##ext(b, tmp, w);\
+\
+    for (; x < w2; x++) {\
+        b[2*x  ] = (tmp[x] + 1)>>1;\
+        b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
+    }\
+}\
+\
+
+#if HAVE_YASM
+#if !ARCH_X86_64
+COMPOSE_VERTICAL(_mmx, 4)
+#endif
+COMPOSE_VERTICAL(_sse2, 8)
+
+
+void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
+
+static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
+{
+    int w2= w>>1;
+    int x= w2 - (w2&7);
+    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
+
+    for (; x < w2; x++) {
+        b[2*x  ] = (tmp[x] + 1)>>1;
+        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
+    }
+}
+#endif
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_YASM
+  int mm_flags = av_get_cpu_flags();
+
+#if !ARCH_X86_64
+    if (!(mm_flags & AV_CPU_FLAG_MMX))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar0i_mmx;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
+        d->horizontal_compose = horizontal_compose_haar1i_mmx;
+        break;
+    }
+#endif
+
+    if (!(mm_flags & AV_CPU_FLAG_SSE2))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_LEGALL5_3:
+        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
+        break;
+    case DWT_DIRAC_DD13_7:
+        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
+        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
+        break;
+    case DWT_DIRAC_HAAR0:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar0i_sse2;
+        break;
+    case DWT_DIRAC_HAAR1:
+        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
+        d->horizontal_compose = horizontal_compose_haar1i_sse2;
+        break;
+    }
+
+    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
+        return;
+
+    switch (type) {
+    case DWT_DIRAC_DD9_7:
+        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
+        break;
+    }
+#endif // HAVE_YASM
+}
diff --git a/libavcodec/x86/dirac_dwt.h b/libavcodec/x86/dirac_dwt.h
new file mode 100644
index 0000000000..126b29029f
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt.h
@@ -0,0 +1,30 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRAC_DWT_H
+#define AVCODEC_X86_DIRAC_DWT_H
+
+#include "libavcodec/dirac_dwt.h"
+
+void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
+
+void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_mmx.c b/libavcodec/x86/diracdsp_mmx.c
new file mode 100644
index 0000000000..11df5e395e
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "diracdsp_mmx.h"
+#include "fpel.h"
+
+void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+
+#define HPEL_FILTER(MMSIZE, EXT)                                                             \
+    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int);               \
+    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);                    \
+                                                                                             \
+    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,       \
+                                          const uint8_t *src, int stride, int width, int height)   \
+    {                                                                                        \
+        while( height-- )                                                                    \
+        {                                                                                    \
+            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
+            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width);                                \
+            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                               \
+                                                                                             \
+            dsth += stride;                                                                  \
+            dstv += stride;                                                                  \
+            dstc += stride;                                                                  \
+            src  += stride;                                                                  \
+        }                                                                                    \
+    }
+
+#if !ARCH_X86_64
+HPEL_FILTER(8, mmx)
+#endif
+HPEL_FILTER(16, sse2)
+
+#define PIXFUNC(PFX, IDX, EXT)                                                   \
+    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
+    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
+    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
+
+#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
+void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3)\
+        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
+    else\
+        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{\
+    if (h&3) {\
+        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
+    } else {\
+        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+    }\
+}
+
+DIRAC_PIXOP(put, ff_put, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmx)
+DIRAC_PIXOP(avg, ff_avg, mmxext)
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_put_dirac_pixels16_c(dst, src, stride, h);
+    else
+    ff_put_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3)
+        ff_avg_dirac_pixels16_c(dst, src, stride, h);
+    else
+    ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_put_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+    ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
+    ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{
+    if (h&3) {
+        ff_avg_dirac_pixels32_c(dst, src, stride, h);
+    } else {
+    ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
+    ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+    }
+}
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(mm_flags)) {
+    c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
+#if !ARCH_X86_64
+    c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
+    c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
+    c->dirac_hpel_filter = dirac_hpel_filter_mmx;
+    c->add_rect_clamped = ff_add_rect_clamped_mmx;
+    c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
+#endif
+    PIXFUNC(put, 0, mmx);
+    PIXFUNC(avg, 0, mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(mm_flags)) {
+        PIXFUNC(avg, 0, mmxext);
+    }
+
+    if (EXTERNAL_SSE2(mm_flags)) {
+        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
+        c->add_rect_clamped = ff_add_rect_clamped_sse2;
+        c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
+
+        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
+        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
+
+        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
+        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
+        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
+        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
+    }
+}
diff --git a/libavcodec/x86/diracdsp_mmx.h b/libavcodec/x86/diracdsp_mmx.h
new file mode 100644
index 0000000000..89858544f3
--- /dev/null
+++ b/libavcodec/x86/diracdsp_mmx.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DIRACDSP_H
+#define AVCODEC_X86_DIRACDSP_H
+
+#include "libavcodec/diracdsp.h"
+
+void ff_diracdsp_init_mmx(DiracDSPContext* c);
+
+DECL_DIRAC_PIXOP(put, mmx);
+DECL_DIRAC_PIXOP(avg, mmx);
+DECL_DIRAC_PIXOP(avg, mmxext);
+
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
+
+void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
+
+void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
+
+#endif
diff --git a/libavcodec/x86/diracdsp_yasm.asm b/libavcodec/x86/diracdsp_yasm.asm
new file mode 100644
index 0000000000..d3cf9f1971
--- /dev/null
+++ b/libavcodec/x86/diracdsp_yasm.asm
@@ -0,0 +1,265 @@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
+section .text
+
+%macro UNPACK_ADD 6
+    mov%5   %1, %3
+    mov%6   m5, %4
+    mova    m4, %1
+    mova    %2, m5
+    punpcklbw %1, m7
+    punpcklbw m5, m7
+    punpckhbw m4, m7
+    punpckhbw %2, m7
+    paddw   %1, m5
+    paddw   %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+    mov     src0q, srcq
+    lea     stridex3q, [3*strideq]
+    sub     src0q, stridex3q
+    pxor    m7, m7
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq], m0
+    add     dstq, mmsize
+    add     srcq, mmsize
+    add     src0q, mmsize
+    sub     widthd, mmsize
+    jg      .loop
+    RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+    dec     widthd
+    pxor    m7, m7
+    and     widthd, ~(mmsize-1)
+.loop:
+    ; 7*(src[0] + src[1])
+    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+    pmullw  m0, [pw_7]
+    pmullw  m1, [pw_7]
+
+    ; 3*( ... + src[-2] + src[3])
+    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+    paddw   m0, m2
+    paddw   m1, m3
+    pmullw  m0, [pw_3]
+    pmullw  m1, [pw_3]
+
+    ; ... - 7*(src[-1] + src[2])
+    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+    pmullw  m2, [pw_7]
+    pmullw  m3, [pw_7]
+    psubw   m0, m2
+    psubw   m1, m3
+
+    ; ... - (src[-3] + src[4])
+    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+    psubw   m0, m2
+    psubw   m1, m3
+
+    paddw   m0, [pw_16]
+    paddw   m1, [pw_16]
+    psraw   m0, 5
+    psraw   m1, 5
+    packuswb m0, m1
+    mova    [dstq + widthq], m0
+    sub     widthd, mmsize
+    jge     .loop
+    RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+    mova    m0, [pb_80]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+    mov   r7d, r5m
+    mov   r8d, wd
+    %define wspill r8d
+    %define hd r7d
+%else
+    mov    r4m, wd
+    %define wspill r4m
+    %define hd r5mp
+%endif
+
+.loopy
+    lea     src2q, [srcq+src_strideq*2]
+    lea     dst2q, [dstq+dst_strideq]
+.loopx:
+    sub      wd, mmsize
+    mova     m1, [srcq +2*wq]
+    mova     m2, [src2q+2*wq]
+    packsswb m1, [srcq +2*wq+mmsize]
+    packsswb m2, [src2q+2*wq+mmsize]
+    paddb    m1, m0
+    paddb    m2, m0
+    mova    [dstq +wq], m1
+    mova    [dst2q+wq], m2
+    jg      .loopx
+
+    lea   srcq, [srcq+src_strideq*4]
+    lea   dstq, [dstq+dst_strideq*2]
+    sub     hd, 2
+    mov     wd, wspill
+    jg      .loopy
+    RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+    mova    m0, [pw_32]
+    add     wd, (mmsize-1)
+    and     wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+    movsxd   strideq, strided
+    movsxd   idwt_strideq, idwt_strided
+    mov   r8d, wd
+    %define wspill r8d
+%else
+    mov    r5m, wd
+    %define wspill r5m
+%endif
+
+.loop:
+    sub     wd, mmsize
+    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
+    paddw   m1, m0
+    psraw   m1, 6
+    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+    paddw   m2, m0
+    psraw   m2, 6
+    paddw   m1, [idwtq+2*wq]
+    paddw   m2, [idwtq+2*wq+mmsize]
+    packuswb m1, m2
+    mova    [dstq +wq], m1
+    jg      .loop
+
+    lea   srcq, [srcq + 2*strideq]
+    add   dstq, strideq
+    lea  idwtq, [idwtq+ 2*idwt_strideq]
+    sub     hd, 1
+    mov     wd, wspill
+    jg      .loop
+    RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+    pxor        m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+    mova        m0, [srcq+i]
+    mova        m1, m0
+    punpcklbw   m0, m4
+    punpckhbw   m1, m4
+    mova        m2, [obmcq+i]
+    mova        m3, m2
+   punpcklbw   m2, m4
+    punpckhbw   m3, m4
+    pmullw      m0, m2
+    pmullw      m1, m3
+    movu        m2, [dstq+2*i]
+    movu        m3, [dstq+2*i+mmsize]
+    paddw       m0, m2
+    paddw       m1, m3
+    movu        [dstq+2*i], m0
+    movu        [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+    lea         srcq, [srcq+strideq]
+    lea         dstq, [dstq+2*strideq]
+    add         obmcq, 32
+    sub         yblend, 1
+    jg          .loop
+    RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index d39b07b9f4..9dd6d51ee6 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 ;* Copyright (c) 2014 Tiancheng "Timothy" Gu <timothygu99@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/dnxhdenc_init.c b/libavcodec/x86/dnxhdenc_init.c
index f1ff7bd986..fd6f15005a 100644
--- a/libavcodec/x86/dnxhdenc_init.c
+++ b/libavcodec/x86/dnxhdenc_init.c
@@ -4,20 +4,20 @@
  *
  * VC-3 encoder funded by the British Broadcasting Corporation
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/dwt_yasm.asm b/libavcodec/x86/dwt_yasm.asm
new file mode 100644
index 0000000000..658acc13fc
--- /dev/null
+++ b/libavcodec/x86/dwt_yasm.asm
@@ -0,0 +1,307 @@
+;******************************************************************************
+;* MMX optimized discrete wavelet trasnform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
+section .text
+
+; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
+%macro COMPOSE_53iL0 4
+    paddw   %2, %3
+    paddw   %2, %4
+    psraw   %2, 2
+    psubw   %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered  m3: pw_8  m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+    paddw   m0, %3
+    paddw   m1, %2
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+%if %0 > 3
+    movu    %1, %4
+%endif
+    psrad   m1, 4
+    psrad   m2, 4
+    packssdw m1, m2
+    paddw   m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+    mova    m2, [pw_2]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b0q+2*widthq]
+    mova    m0, [b1q+2*widthq]
+    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                  int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+    mova    m1, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    paddw   m0, [b2q+2*widthq]
+    paddw   m0, m1
+    psraw   m0, 1
+    paddw   m0, [b1q+2*widthq]
+    mova    [b1q+2*widthq], m0
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                               IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+    mova    [b2q+2*widthq], m1
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+;                                IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+    mova    m3, [pw_16]
+    mova    m4, [pw_1991]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m0, [b0q+2*widthq]
+    mova    m1, [b1q+2*widthq]
+    mova    m5, [b2q+2*widthq]
+    paddw   m0, [b4q+2*widthq]
+    paddw   m1, [b3q+2*widthq]
+    psubw   m0, m3
+    mova    m2, m1
+    punpcklwd m1, m0
+    punpckhwd m2, m0
+    pmaddwd m1, m4
+    pmaddwd m2, m4
+    psrad   m1, 5
+    psrad   m2, 5
+    packssdw m1, m2
+    psubw   m5, m1
+    mova    [b2q+2*widthq], m5
+    jg      .loop
+    REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+    mova    m3, [pw_1]
+%if ARCH_X86_64
+    mov     widthd, widthd
+%endif
+.loop:
+    sub     widthq, mmsize/2
+    mova    m1, [b1q+2*widthq]
+    mova    m0, [b0q+2*widthq]
+    mova    m2, m1
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [b0q+2*widthq], m0
+    paddw   m2, m0
+    mova    [b1q+2*widthq], m2
+    jg      .loop
+    REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+    mov     %3, [tmpq]
+%assign %%i 1
+%rep %1
+    mov     [tmpq-2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+    mov     %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+    mov     [tmpq+2*w2q+2*%%i], %3
+    %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xq, xq
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    mova    m3, [pw_1]
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    paddw   m1, m3
+    psraw   m1, 1
+    psubw   m0, m1
+    mova    [tmpq + 2*xq], m0
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .lowpass_loop
+
+    xor     xq, xq
+    and    w2q, ~(mmsize/2 - 1)
+    cmp    w2q, mmsize/2
+    jl      .end
+
+.highpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [tmpq  + 2*xq]
+    paddw   m1, m0
+
+    ; shift and interleave
+%if %2 == 1
+    paddw   m0, m3
+    paddw   m1, m3
+    psraw   m0, 1
+    psraw   m1, 1
+%endif
+    mova    m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m0
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xq, mmsize/2
+    cmp     xq, w2q
+    jl      .highpass_loop
+.end:
+    REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+    mov    w2d, wd
+    xor     xd, xd
+    shr    w2d, 1
+    lea  b_w2q, [bq+wq]
+    movu    m4, [bq+wq]
+    mova    m7, [pw_2]
+    pslldq  m4, 14
+.lowpass_loop:
+    movu    m1, [b_w2q + 2*xq]
+    mova    m0, [bq    + 2*xq]
+    mova    m2, m1
+    palignr m1, m4, 14
+    mova    m4, m2
+    COMPOSE_53iL0 m0, m1, m2, m7
+    mova    [tmpq + 2*xq], m0
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .lowpass_loop
+
+    EDGE_EXTENSION 1, 2, xw
+    ; leave the last up to 7 (sse) or 3 (mmx) values for C
+    xor     xd, xd
+    and    w2d, ~(mmsize/2 - 1)
+    cmp    w2d, mmsize/2
+    jl      .end
+
+    mova    m7, [tmpq-mmsize]
+    mova    m0, [tmpq]
+    mova    m5, [pw_1]
+    mova    m3, [pw_8]
+    mova    m4, [pw_1991]
+.highpass_loop:
+    mova    m6, m0
+    palignr m0, m7, 14
+    mova    m7, [tmpq + 2*xq + 16]
+    mova    m1, m7
+    mova    m2, m7
+    palignr m1, m6, 2
+    palignr m2, m6, 4
+    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+    mova    m0, m7
+    mova    m7, m6
+
+    ; shift and interleave
+    paddw   m6, m5
+    paddw   m1, m5
+    psraw   m6, 1
+    psraw   m1, 1
+    mova    m2, m6
+    punpcklwd m6, m1
+    punpckhwd m2, m1
+    mova    [bq+4*xq], m6
+    mova    [bq+4*xq+mmsize], m2
+
+    add     xd, mmsize/2
+    cmp     xd, w2d
+    jl      .highpass_loop
+.end:
+    REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c
index 6528b57361..112566ded0 100644
--- a/libavcodec/x86/fdct.c
+++ b/libavcodec/x86/fdct.c
@@ -13,20 +13,20 @@
  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
 
 DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
 } fdct_r_row_sse2 =
@@ -153,7 +153,7 @@ DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = {  // forward_dct
   29692,  -12299,   26722,  -31521,
 };
 
-static struct
+static const struct
 {
  DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
 } tab_frw_01234567_sse2 =
diff --git a/libavcodec/x86/fdct.h b/libavcodec/x86/fdct.h
index c94a977e8f..648cdc5350 100644
--- a/libavcodec/x86/fdct.h
+++ b/libavcodec/x86/fdct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fdctdsp_init.c b/libavcodec/x86/fdctdsp_init.c
index 4e8e4eb60d..0cb5fd625b 100644
--- a/libavcodec/x86/fdctdsp_init.c
+++ b/libavcodec/x86/fdctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm
index e4744a3b60..877997e7a6 100644
--- a/libavcodec/x86/fft.asm
+++ b/libavcodec/x86/fft.asm
@@ -6,20 +6,20 @@
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -36,6 +36,8 @@
 %define pointer resd
 %endif
 
+SECTION_RODATA 32
+
 struc FFTContext
     .nbits:    resd 1
     .reverse:  resd 1
@@ -51,13 +53,10 @@ struc FFTContext
     .imdcthalf:pointer 1
 endstruc
 
-SECTION_RODATA
-
 %define M_SQRT1_2 0.70710678118654752440
 %define M_COS_PI_1_8 0.923879532511287
 %define M_COS_PI_3_8 0.38268343236509
 
-align 32
 ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
 ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
 
@@ -69,9 +68,10 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
-ps_m1m1m1m1: times 4 dd 1<<31
 ps_m1p1: dd 1<<31, 0
 
+cextern ps_neg
+
 %assign i 16
 %rep 13
 cextern cos_ %+ i
@@ -305,6 +305,7 @@ IF%1 mova  Z(1), m5
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 align 16
 fft8_avx:
     mova      m0, Z(0)
@@ -394,6 +395,8 @@ fft32_interleave_avx:
     jg .deint_loop
     ret
 
+%endif
+
 INIT_XMM sse
 
 align 16
@@ -537,6 +540,7 @@ DEFINE_ARGS zc, w, n, o1, o3
 
 INIT_YMM avx
 
+%if HAVE_AVX_EXTERNAL
 %macro INTERL_AVX 5
     vunpckhps      %3, %2, %1
     vunpcklps      %2, %2, %1
@@ -558,6 +562,7 @@ cglobal fft_calc, 2,5,8
     FFT_DISPATCH _interleave %+ SUFFIX, r1
     REP_RET
 
+%endif
 
 INIT_XMM sse
 
@@ -681,7 +686,7 @@ cglobal imdct_calc, 3,5,3
     mov     r2, r3
     sub     r3, mmsize
     neg     r2
-    mova    m2, [ps_m1m1m1m1]
+    mova    m2, [ps_neg]
 .loop:
 %if mmsize == 8
     PSWAPD  m0, [r1 + r3]
@@ -776,9 +781,11 @@ align 8
 dispatch_tab %+ fullsuffix: pointer list_of_fft
 %endmacro ; DECL_FFT
 
+%if HAVE_AVX_EXTERNAL
 INIT_YMM avx
 DECL_FFT 6
 DECL_FFT 6, _interleave
+%endif
 INIT_XMM sse
 DECL_FFT 5
 DECL_FFT 5, _interleave
@@ -992,7 +999,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
     sub   r4, r3
 %endif
 %if notcpuflag(3dnowext) && mmsize == 8
-    movd  m7, [ps_m1m1m1m1]
+    movd  m7, [ps_neg]
 %endif
 .pre:
 %if ARCH_X86_64 == 0
@@ -1080,4 +1087,7 @@ DECL_IMDCT POSROTATESHUF_3DNOW
 %endif
 
 INIT_YMM avx
+
+%if HAVE_AVX_EXTERNAL
 DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index a604956836..398091eb1f 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index 7ca72c54a4..5682230c8e 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000000..cedf0837a7
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,101 @@
+;******************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd
+%else
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter]
+    movu [resq+iter],  m0
+    %assign iter iter+mmsize
+%endrep
+
+lea  resq,   [resq+orderq*4]
+lea  smpq,   [smpq+orderq*4]
+lea  coefsq, [coefsq+orderq*4]
+sub  length,  orderd
+movd m3,      r5m
+neg  orderq
+
+%define posj t0q
+%define negj t1q
+
+.looplen:
+    pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
+        pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
+        paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
+
+        dec    negj
+        inc    posj
+    jnz .looporder
+
+    psrad  m0,     m3              ; p >>= shift
+    psrad  m4,     m3
+    psrad  m6,     m3
+    movu   m1,    [smpq]
+    movu   m5,    [smpq+mmsize]
+    movu   m7,    [smpq+mmsize*2]
+    psubd  m1,     m0              ; smp[i] - p
+    psubd  m5,     m4
+    psubd  m7,     m6
+    movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
+
+    add resq,    3*mmsize
+    add smpq,    3*mmsize
+    sub length, (3*mmsize)/4
+jg .looplen
+RET
diff --git a/libavcodec/x86/flacdsp.asm b/libavcodec/x86/flacdsp.asm
new file mode 100644
index 0000000000..37ee87b163
--- /dev/null
+++ b/libavcodec/x86/flacdsp.asm
@@ -0,0 +1,74 @@
+;******************************************************************************
+;* FLAC DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro LPC_32 1
+INIT_XMM %1
+cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
+    sub    lend, pred_orderd
+    jle .ret
+    lea    decodedq, [decodedq+pred_orderq*4-8]
+    lea    coeffsq, [coeffsq+pred_orderq*4]
+    neg    pred_orderq
+    movd   m4, qlevelm
+ALIGN 16
+.loop_sample:
+    movd   m0, [decodedq+pred_orderq*4+8]
+    add    decodedq, 8
+    movd   m1, [coeffsq+pred_orderq*4]
+    pxor   m2, m2
+    pxor   m3, m3
+    lea    jq, [pred_orderq+1]
+    test   jq, jq
+    jz .end_order
+.loop_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    movd   m0, [decodedq+jq*4]
+    PMACSDQL m3, m1, m0, m3, m1
+    movd   m1, [coeffsq+jq*4]
+    inc    jq
+    jl .loop_order
+.end_order:
+    PMACSDQL m2, m0, m1, m2, m0
+    psrlq  m2, m4
+    movd   m0, [decodedq]
+    paddd  m0, m2
+    movd   [decodedq], m0
+    sub  lend, 2
+    jl .ret
+    PMACSDQL m3, m1, m0, m3, m1
+    psrlq  m3, m4
+    movd   m1, [decodedq+4]
+    paddd  m1, m3
+    movd   [decodedq+4], m1
+    jg .loop_sample
+.ret:
+    REP_RET
+%endmacro
+
+%if HAVE_XOP_EXTERNAL
+LPC_32 xop
+%endif
+LPC_32 sse4
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
new file mode 100644
index 0000000000..ad88e5bbd1
--- /dev/null
+++ b/libavcodec/x86/flacdsp_init.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
+                         int qlevel, int len);
+void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
+                        int qlevel, int len);
+
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+
+av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
+                                 int bps)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        if (bps > 16 && CONFIG_FLAC_DECODER)
+            c->lpc = ff_flac_lpc_32_sse4;
+        if (bps == 16 && CONFIG_FLAC_ENCODER && CONFIG_GPL)
+            c->lpc_encode = ff_flac_enc_lpc_16_sse4;
+    }
+    if (EXTERNAL_XOP(cpu_flags)) {
+        if (bps > 16 && CONFIG_FLAC_DECODER)
+            c->lpc = ff_flac_lpc_32_xop;
+    }
+#endif
+}
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 818437672f..19595f7d00 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -2,20 +2,20 @@
 ;* x86 optimized Format Conversion Utils
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -77,6 +77,44 @@ INT32_TO_FLOAT_FMUL_SCALAR 5
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3
 
+;------------------------------------------------------------------------------
+; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
+;                                    const float *mul, int len);
+;------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
+cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
+    shl     lend, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+    movss     m0, [mulq]
+    SPLATD    m0
+%if cpuflag(sse2)
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     mulq, 4
+    add     lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse
+INT32_TO_FLOAT_FMUL_ARRAY8
+INIT_XMM sse2
+INT32_TO_FLOAT_FMUL_ARRAY8
 
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 3d75df92bd..cbb96e2530 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,10 @@
 
 void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
 void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
+void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
+void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
+                                        const float *mul, int len);
 
 void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
@@ -134,12 +138,14 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
     }
     if (EXTERNAL_SSE(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
         c->float_to_int16             = ff_float_to_int16_sse;
         c->float_to_int16_interleave  = float_to_int16_interleave_sse;
         c->float_interleave           = float_interleave_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
+        c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
         c->float_to_int16             = ff_float_to_int16_sse2;
         c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
     }
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b581471296..0e3b444e2a 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,85 +25,83 @@
 
 SECTION .text
 
-INIT_MMX mmxext
+%macro PAVGB_MMX 4
+    LOAD   %3, %1
+    por    %3, %2
+    pxor   %2, %1
+    pand   %2, %4
+    psrlq  %2, 1
+    psubb  %3, %2
+    SWAP   %2, %3
+%endmacro
+
 ; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
 ;                        ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN  mmsize
 %else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN  %2
 %endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
     movsxdifnidn r2, r2d
     lea          r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+    pcmpeqd      m6, m6
+    paddb        m6, m6
+%endif
+%endif
 .loop:
-    OP           m0, [r1]
-    OP           m1, [r1+r2]
-    OP           m2, [r1+r2*2]
-    OP           m3, [r1+r4]
-    lea          r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+    LOAD         m0, [r1 + %%i]
+    LOAD         m1, [r1+r2 + %%i]
+    LOAD         m2, [r1+r2*2 + %%i]
+    LOAD         m3, [r1+r4 + %%i]
 %ifidn %1, avg
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
+%if notcpuflag(mmxext)
+    PAVGB_MMX    [r0 + %%i], m0, m4, m6
+    PAVGB_MMX    [r0+r2 + %%i], m1, m5, m6
+    PAVGB_MMX    [r0+r2*2 + %%i], m2, m4, m6
+    PAVGB_MMX    [r0+r4 + %%i], m3, m5, m6
+%else
+    pavgb        m0, [r0 + %%i]
+    pavgb        m1, [r0+r2 + %%i]
+    pavgb        m2, [r0+r2*2 + %%i]
+    pavgb        m3, [r0+r4 + %%i]
+%endif
 %endif
-    OP         [r0], m0
-    OP      [r0+r2], m1
-    OP    [r0+r2*2], m2
-    OP      [r0+r4], m3
+    SAVE       [r0 + %%i], m0
+    SAVE    [r0+r2 + %%i], m1
+    SAVE  [r0+r2*2 + %%i], m2
+    SAVE    [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
     sub         r3d, 4
+    lea          r1, [r1+r2*4]
     lea          r0, [r0+r2*4]
     jne       .loop
     RET
 %endmacro
 
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
 
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
 
 INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-;                           ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
-    lea          r4, [r2*3]
-.loop:
-    movu         m0, [r1]
-    movu         m1, [r1+r2]
-    movu         m2, [r1+r2*2]
-    movu         m3, [r1+r4]
-    lea          r1, [r1+r2*4]
-    pavgb        m0, [r0]
-    pavgb        m1, [r0+r2]
-    pavgb        m2, [r0+r2*2]
-    pavgb        m3, [r0+r4]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova  [r0+r2*2], m2
-    mova    [r0+r4], m3
-    sub         r3d, 4
-    lea          r0, [r0+r2*4]
-    jnz       .loop
-    REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 88d1415ade..4d93959a96 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,6 +28,8 @@ void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
 void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index eef05ecc74..0000000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movq  %0, %%mm0           \n\t"
-             "movq  %1, %%mm1           \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, %0           \n\t"
-             "movq  8%0, %%mm0          \n\t"
-             "movq  8%1, %%mm1          \n\t"
-             PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movq  %%mm2, 8%0          \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq     %%mm0, (%2)           \n\t"
-        "movq     %%mm1, (%2, %3)       \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h)
-{
-    __asm__ volatile (
-        "lea   (%3, %3), %%"REG_a"      \n\t"
-        ".p2align     3                 \n\t"
-        "1:                             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "movq  (%1    ), %%mm0          \n\t"
-        "movq 8(%1    ), %%mm4          \n\t"
-        "movq  (%1, %3), %%mm1          \n\t"
-        "movq 8(%1, %3), %%mm5          \n\t"
-        "movq     %%mm0,  (%2)          \n\t"
-        "movq     %%mm4, 8(%2)          \n\t"
-        "movq     %%mm1,  (%2, %3)      \n\t"
-        "movq     %%mm5, 8(%2, %3)      \n\t"
-        "add  %%"REG_a", %1             \n\t"
-        "add  %%"REG_a", %2             \n\t"
-        "subl        $4, %0             \n\t"
-        "jnz         1b                 \n\t"
-        : "+g"(h), "+r"(pixels),  "+r"(block)
-        : "r"((x86_reg)line_size)
-        : "%"REG_a, "memory"
-        );
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/h263_loopfilter.asm b/libavcodec/x86/h263_loopfilter.asm
index 673f795daa..2fcd1a26e5 100644
--- a/libavcodec/x86/h263_loopfilter.asm
+++ b/libavcodec/x86/h263_loopfilter.asm
@@ -1,20 +1,22 @@
 ;******************************************************************************
 ;* MMX-optimized H.263 loop filter
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h263dsp_init.c b/libavcodec/x86/h263dsp_init.c
index d4fab981bf..ab81063233 100644
--- a/libavcodec/x86/h263dsp_init.c
+++ b/libavcodec/x86/h263dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2013 Diego Biurrun <diego@biurrun.de>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index cc41f00461..107ae51cbc 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
 ;*               2005-2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm
index 7b003515cc..c358482092 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -252,8 +252,10 @@ cglobal %1_h264_chroma_mc2_10, 6,7
 %define CHROMAMC_AVG  NOTHING
 INIT_XMM sse2
 CHROMA_MC8 put
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 put
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 put
 CHROMA_MC2 put
@@ -261,8 +263,10 @@ CHROMA_MC2 put
 %define CHROMAMC_AVG  AVG
 INIT_XMM sse2
 CHROMA_MC8 avg
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 CHROMA_MC8 avg
+%endif
 INIT_MMX mmxext
 CHROMA_MC4 avg
 CHROMA_MC2 avg
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index d2067c86e7..14c8205bab 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -7,20 +7,20 @@
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -384,8 +384,10 @@ cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
 
 INIT_XMM sse2
 DEBLOCK_LUMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
+%endif
 
 %else
 
@@ -499,8 +501,10 @@ INIT_MMX mmxext
 DEBLOCK_LUMA v8, 8
 INIT_XMM sse2
 DEBLOCK_LUMA v, 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA v, 16
+%endif
 
 %endif ; ARCH
 
@@ -772,8 +776,10 @@ cglobal deblock_h_luma_intra_8, 2,4,8,0x80
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA v
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA v
+%endif
 %if ARCH_X86_64 == 0
 INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
@@ -836,7 +842,11 @@ cglobal deblock_h_chroma_8, 5,7
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call ff_chroma_inter_body_mmxext
+    LOAD_MASK  r2d, r3d
+    movd       m6, [r4] ; tc0
+    punpcklbw  m6, m6
+    pand       m7, m6
+    DEBLOCK_P0_Q0
     movq  m0, buf0
     movq  m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index d049c62bf2..d8ace17ec9 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -7,20 +7,20 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -418,9 +418,11 @@ cglobal deblock_h_luma_10, 5,7,15
 
 INIT_XMM sse2
 DEBLOCK_LUMA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_64
 %endif
+%endif
 
 %macro SWAPMOVA 2
 %ifid %1
@@ -715,8 +717,10 @@ cglobal deblock_h_luma_intra_10, 4,7,16
 
 INIT_XMM sse2
 DEBLOCK_LUMA_INTRA_64
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA_INTRA_64
+%endif
 
 %endif
 
@@ -802,10 +806,12 @@ DEBLOCK_LUMA_INTRA
 INIT_XMM sse2
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_LUMA
 DEBLOCK_LUMA_INTRA
 %endif
+%endif
 
 ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
 ; out: %1=p0', %2=q0'
@@ -918,5 +924,7 @@ DEBLOCK_CHROMA
 %endif
 INIT_XMM sse2
 DEBLOCK_CHROMA
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEBLOCK_CHROMA
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index bb881c35df..ef65cf86b7 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -2,20 +2,20 @@
  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -38,7 +38,7 @@
 
 //FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
 //as that would make optimization work hard)
-#if HAVE_7REGS
+#if HAVE_7REGS && !BROKEN_COMPILER
 #define decode_significance decode_significance_x86
 static int decode_significance_x86(CABACContext *c, int max_coeff,
                                    uint8_t *significant_coeff_ctx_base,
@@ -55,6 +55,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff,
     __asm__ volatile(
         "lea   "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -130,6 +131,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     __asm__ volatile(
         "lea    "MANGLE(ff_h264_cabac_tables)", %0      \n\t"
         : "=&r"(tables)
+        : NAMED_CONSTRAINTS_ARRAY(ff_h264_cabac_tables)
     );
 #endif
 
@@ -198,7 +200,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
     );
     return coeff_count;
 }
-#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
+#endif /* HAVE_7REGS && BROKEN_COMPILER */
 
 #endif /* HAVE_INLINE_ASM */
 #endif /* AVCODEC_X86_H264_I386_H */
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 313791a5d9..7fafe195f1 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -9,20 +9,20 @@
 ;*          Holger Lubitz <hal@duncan.ol.sub.de>
 ;*          Min Chen <chenm001.163.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index b7d51051d3..5c3acb1d38 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -83,8 +83,10 @@ cglobal h264_idct_add_10, 3,3
 
 INIT_XMM sse2
 IDCT_ADD_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
@@ -117,9 +119,11 @@ add4x4_idct %+ SUFFIX:
 INIT_XMM sse2
 ALIGN 16
 ADD4x4IDCT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 ALIGN 16
 ADD4x4IDCT
+%endif
 
 %macro ADD16_OP 2
     cmp          byte [r4+%2], 0
@@ -155,8 +159,10 @@ cglobal h264_idct_add16_10, 5,6
 
 INIT_XMM sse2
 IDCT_ADD16_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16_10
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
@@ -220,8 +226,10 @@ cglobal h264_idct8_dc_add_10,3,4,7
 
 INIT_XMM sse2
 IDCT8_DC_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_DC_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
@@ -293,8 +301,10 @@ cglobal h264_idct_add16intra_10,5,7,8
 
 INIT_XMM sse2
 IDCT_ADD16INTRA_10
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD16INTRA_10
+%endif
 
 %assign last_block 36
 ;-----------------------------------------------------------------------------
@@ -330,8 +340,10 @@ cglobal h264_idct_add8_10,5,8,7
 
 INIT_XMM sse2
 IDCT_ADD8
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT_ADD8
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
@@ -537,8 +549,10 @@ h264_idct8_add1_10 %+ SUFFIX:
 
 INIT_XMM sse2
 IDCT8_ADD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
@@ -577,5 +591,7 @@ cglobal h264_idct8_add4_10, 0,7,16
 
 INIT_XMM sse2
 IDCT8_ADD4
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 IDCT8_ADD4
+%endif
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index df657a443c..c88d91b49e 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -5,20 +5,20 @@
 ;* Copyright (c) 2010 Loren Merritt
 ;* Copyright (c) 2010 Ronald S. Bultje
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -2497,10 +2497,7 @@ cglobal pred4x4_tm_vp8_8, 3,3
     pshufb     mm3, mm6
     pshufb     mm4, mm6
     pshufb     mm5, mm6
-    psubw      mm2, mm7
-    psubw      mm3, mm7
-    psubw      mm4, mm7
-    psubw      mm5, mm7
+    psubw      mm0, mm7
     paddw      mm2, mm0
     paddw      mm3, mm0
     paddw      mm4, mm0
diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm
index 55790a956e..b60a21037f 100644
--- a/libavcodec/x86/h264_intrapred_10bit.asm
+++ b/libavcodec/x86/h264_intrapred_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,6 +26,7 @@
 
 SECTION_RODATA
 
+cextern pw_512
 cextern pw_16
 cextern pw_8
 cextern pw_4
@@ -35,7 +36,6 @@ cextern pw_1
 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3:        times 8 dw -3
 pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_512:       times 8 dw 512
 pd_17:        times 4 dd 17
 pd_16:        times 4 dd 16
 
@@ -82,8 +82,10 @@ INIT_XMM sse2
 PRED4x4_DR
 INIT_XMM ssse3
 PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DR
+%endif
 
 ;------------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
@@ -119,8 +121,10 @@ INIT_XMM sse2
 PRED4x4_VR
 INIT_XMM ssse3
 PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VR
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
@@ -159,28 +163,14 @@ INIT_XMM sse2
 PRED4x4_HD
 INIT_XMM ssse3
 PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_HD
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
 ;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
-    movhlps %2, %1
-    paddd   %1, %2
-    pshuflw %2, %1, 0xE
-    paddd   %1, %2
-%else
-    pshufw  %2, %1, 0xE
-    paddd   %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
-    pmaddwd %1, [pw_1]
-    HADDD   %1, %2
-%endmacro
 
 INIT_MMX mmxext
 cglobal pred4x4_dc_10, 3, 3
@@ -228,8 +218,10 @@ cglobal pred4x4_down_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_DL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
@@ -255,8 +247,10 @@ cglobal pred4x4_vertical_left_10, 3, 3
 
 INIT_XMM sse2
 PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED4x4_VL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
@@ -565,8 +559,10 @@ cglobal pred8x8l_top_dc_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_TOP_DC
+%endif
 
 ;-------------------------------------------------------------------------------
 ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
@@ -622,8 +618,10 @@ cglobal pred8x8l_dc_10, 4, 6, 6
 
 INIT_XMM sse2
 PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DC
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
@@ -656,8 +654,10 @@ cglobal pred8x8l_vertical_10, 4, 4, 6
 
 INIT_XMM sse2
 PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
@@ -711,8 +711,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
@@ -778,8 +780,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_LEFT
 INIT_XMM ssse3
 PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_LEFT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
@@ -851,8 +855,10 @@ INIT_XMM sse2
 PRED8x8L_DOWN_RIGHT
 INIT_XMM ssse3
 PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_DOWN_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
@@ -920,8 +926,10 @@ INIT_XMM sse2
 PRED8x8L_VERTICAL_RIGHT
 INIT_XMM ssse3
 PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_VERTICAL_RIGHT
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
@@ -980,8 +988,10 @@ INIT_XMM sse2
 PRED8x8L_HORIZONTAL_UP
 INIT_XMM ssse3
 PRED8x8L_HORIZONTAL_UP
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 PRED8x8L_HORIZONTAL_UP
+%endif
 
 
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 0e572b1226..528b92e497 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 9ca6d7e644..b4cb9b119c 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -2,20 +2,20 @@
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  * Copyright (c) 2011 Daniel Kang
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,8 +29,8 @@
 #include "fpel.h"
 
 #if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
 void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_put_pixels4_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
@@ -49,9 +49,9 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext   ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext   ff_put_pixels4_mmx
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
@@ -339,7 +339,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, const uin
     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
 }\
@@ -349,7 +349,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, const uin
     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
 }\
@@ -359,7 +359,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, const uin
     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
 }\
@@ -369,7 +369,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin
     DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
     uint8_t * const halfHV= temp;\
     int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
-    assert(((int)temp & 7) == 0);\
+    av_assert2(((int)temp & 7) == 0);\
     ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm
index f92c4aab2b..d65660dfbc 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -28,7 +28,7 @@ SECTION_RODATA 32
 
 cextern pw_16
 cextern pw_1
-cextern pb_0
+pb_0: times 32 db 0 ; we do not use cextern here as old llvm-gcc fails to align it correctly
 
 pw_pixel_max: times 8 dw ((1 << 10)-1)
 
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index bc6c72541b..2d287ba443 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -6,20 +6,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index d1873af5b4..b4fb9db309 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -135,6 +135,13 @@ WEIGHT_FUNC_HALF_MM 8, 8
     add  off_regd, 1
     or   off_regd, 1
     add        r4, 1
+    cmp        r5, 128
+     jne .normal
+    sar        r5, 1
+    sar        r6, 1
+    sar  off_regd, 1
+    sub        r4, 1
+.normal
 %if cpuflag(ssse3)
     movd       m4, r5d
     movd       m0, r6d
diff --git a/libavcodec/x86/h264_weight_10bit.asm b/libavcodec/x86/h264_weight_10bit.asm
index 961ec8ca45..5d9496224a 100644
--- a/libavcodec/x86/h264_weight_10bit.asm
+++ b/libavcodec/x86/h264_weight_10bit.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index 8ec8a79aba..e08af2759e 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 134d594ca9..35db20014a 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -210,6 +210,7 @@ H264_BIWEIGHT_10_SSE(4,  10)
 av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                  const int chroma_format_idc)
 {
+#if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
@@ -365,4 +366,5 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 #endif /* HAVE_ALIGNED_STACK */
         }
     }
+#endif
 }
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 45b8703251..f92cb2c0a5 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -5,20 +5,20 @@
 ;*
 ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -26,13 +26,14 @@
 
 SECTION_RODATA
 
-pw_pixel_max: times 8 dw ((1 << 10)-1)
-pw_m1:        times 8 dw -1
-pw_m2:        times 8 dw -2
-pd_1 :        times 4 dd  1
+pw_pixel_max_12: times 8 dw ((1 << 12)-1)
+pw_pixel_max_10: times 8 dw ((1 << 10)-1)
+pw_m2:           times 8 dw -2
+pd_1 :           times 4 dd  1
 
 cextern pw_4
 cextern pw_8
+cextern pw_m1
 
 SECTION .text
 INIT_XMM sse2
@@ -57,10 +58,10 @@ INIT_XMM sse2
     movd             m4, %5
     movd             m6, %6
     movd             m5, %7
-    movd             m7, %8
+    movd             m3, %8
 
     punpcklbw        m4, m6
-    punpcklbw        m5, m7
+    punpcklbw        m5, m3
     punpcklwd        m4, m5
 
     punpckhdq        m2, m0, m4
@@ -76,16 +77,10 @@ INIT_XMM sse2
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
 %macro TRANSPOSE8x4B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m6, m0, m2
-    punpcklwd        m0, m2
+    packuswb         m0, m2
+    packuswb         m1, m3
+    SBUTTERFLY bw, 0, 1, 2
+    SBUTTERFLY wd, 0, 1, 2
 
     movd             %1, m0
     pshufd           m0, m0, 0x39
@@ -95,13 +90,13 @@ INIT_XMM sse2
     pshufd           m0, m0, 0x39
     movd             %4, m0
 
-    movd             %5, m6
-    pshufd           m6, m6, 0x39
-    movd             %6, m6
-    pshufd           m6, m6, 0x39
-    movd             %7, m6
-    pshufd           m6, m6, 0x39
-    movd             %8, m6
+    movd             %5, m1
+    pshufd           m1, m1, 0x39
+    movd             %6, m1
+    pshufd           m1, m1, 0x39
+    movd             %7, m1
+    pshufd           m1, m1, 0x39
+    movd             %8, m1
 %endmacro
 
 ; in: 8 rows of 4 words in %4..%11
@@ -120,10 +115,10 @@ INIT_XMM sse2
     movq             m4, %5
     movq             m6, %6
     movq             m5, %7
-    movq             m7, %8
+    movq             m3, %8
 
     punpcklwd        m4, m6
-    punpcklwd        m5, m7
+    punpcklwd        m5, m3
     punpckhdq        m6, m4, m5
     punpckldq        m4, m5
 
@@ -136,32 +131,23 @@ INIT_XMM sse2
 
 ; in: 4 rows of 8 words in m0..m3
 ; out: 8 rows of 4 words in %1..%8
-%macro TRANSPOSE8x4W_STORE 8
-    pxor             m5, m5; zeros reg
-    CLIPW            m0, m5, [pw_pixel_max]
-    CLIPW            m1, m5, [pw_pixel_max]
-    CLIPW            m2, m5, [pw_pixel_max]
-    CLIPW            m3, m5, [pw_pixel_max]
+%macro TRANSPOSE8x4W_STORE 9
+    TRANSPOSE4x4W     0, 1, 2, 3, 4
 
-    punpckhwd        m4, m0, m1
-    punpcklwd        m0, m1
-    punpckhwd        m5, m2, m3
-    punpcklwd        m2, m3
-    punpckhdq        m6, m0, m2
-    punpckldq        m0, m2
+    pxor             m5, m5; zeros reg
+    CLIPW            m0, m5, %9
+    CLIPW            m1, m5, %9
+    CLIPW            m2, m5, %9
+    CLIPW            m3, m5, %9
 
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m6
-    movhps           %4, m6
-
-    punpckhdq        m6, m4, m5
-    punpckldq        m4, m5
-
-    movq             %5, m4
-    movhps           %6, m4
-    movq             %7, m6
-    movhps           %8, m6
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 bytes in %1..%8
@@ -212,40 +198,20 @@ INIT_XMM sse2
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 bytes in %1..%8
 %macro TRANSPOSE8x8B_STORE 8
-    packuswb         m0, m0
-    packuswb         m1, m1
-    packuswb         m2, m2
-    packuswb         m3, m3
-    packuswb         m4, m4
-    packuswb         m5, m5
-    packuswb         m6, m6
-    packuswb         m7, m7
-
-    punpcklbw        m0, m1
-    punpcklbw        m2, m3
-
-    punpckhwd        m8, m0, m2
-    punpcklwd        m0, m2
-
-    punpcklbw        m4, m5
-    punpcklbw        m6, m7
-
-    punpckhwd        m9, m4, m6
-    punpcklwd        m4, m6
+    packuswb         m0, m4
+    packuswb         m1, m5
+    packuswb         m2, m6
+    packuswb         m3, m7
+    TRANSPOSE2x4x4B   0, 1, 2, 3, 4
 
-    punpckhdq       m10, m0, m4; 2, 3
-    punpckldq        m0, m4;   0, 1
-
-    punpckldq       m11, m8, m9;  4, 5
-    punpckhdq        m8, m9;   6, 7
     movq             %1, m0
     movhps           %2, m0
-    movq             %3, m10
-    movhps           %4, m10
-    movq             %5, m11
-    movhps           %6, m11
-    movq             %7, m8
-    movhps           %8, m8
+    movq             %3, m1
+    movhps           %4, m1
+    movq             %5, m2
+    movhps           %6, m2
+    movq             %7, m3
+    movhps           %8, m3
 %endmacro
 
 ; in: 8 rows of 8 words in %1..%8
@@ -264,18 +230,18 @@ INIT_XMM sse2
 
 ; in: 8 rows of 8 words in m0..m8
 ; out: 8 rows of 8 words in %1..%8
-%macro TRANSPOSE8x8W_STORE 8
+%macro TRANSPOSE8x8W_STORE 9
     TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
 
     pxor             m8, m8
-    CLIPW            m0, m8, [pw_pixel_max]
-    CLIPW            m1, m8, [pw_pixel_max]
-    CLIPW            m2, m8, [pw_pixel_max]
-    CLIPW            m3, m8, [pw_pixel_max]
-    CLIPW            m4, m8, [pw_pixel_max]
-    CLIPW            m5, m8, [pw_pixel_max]
-    CLIPW            m6, m8, [pw_pixel_max]
-    CLIPW            m7, m8, [pw_pixel_max]
+    CLIPW            m0, m8, %9
+    CLIPW            m1, m8, %9
+    CLIPW            m2, m8, %9
+    CLIPW            m3, m8, %9
+    CLIPW            m4, m8, %9
+    CLIPW            m5, m8, %9
+    CLIPW            m6, m8, %9
+    CLIPW            m7, m8, %9
 
     movdqu           %1, m0
     movdqu           %2, m1
@@ -318,13 +284,14 @@ ALIGN 16
     paddw            m5, m4;
 
     ;tc calculations
-    movd             m6, [r2]; tc0
-    add              r2, 4;
+    movq             m6, [tcq]; tc0
     punpcklwd        m6, m6
-    movd             m7, [r2]; tc1
-    punpcklwd        m7, m7
-    shufps           m6, m7, 0; tc0, tc1
+    pshufd           m6, m6, 0xA0; tc0, tc1
+%if cpuflag(ssse3)
+    psignw           m4, m6, [pw_m1]; -tc0, -tc1
+%else
     pmullw           m4, m6, [pw_m1]; -tc0, -tc1
+%endif
     ;end tc calculations
 
     paddw            m5, [pw_4]; +4
@@ -356,17 +323,17 @@ ALIGN 16
 %if %1 > 8
     shl             betaq, %1 - 8
 %endif
-    movd            m13, betaq
+    movd            m13, betad
     SPLATW          m13, m13, 0
     ;end beta calculations
 
     paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3
 
-    pshufhw         m14, m9,  q0033 ;0b00001111;  0d3 0d3 0d0 0d0 in high
-    pshuflw         m14, m14, q0033 ;0b00001111;  1d3 1d3 1d0 1d0 in low
+    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
+    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low
 
-    pshufhw          m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
-    pshuflw          m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
+    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
+    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
 
     paddw           m14, m9; 0d0+0d3, 1d0+1d3
 
@@ -380,7 +347,7 @@ ALIGN 16
     psraw           m15, m13, 2;   beta >> 2
     psllw            m8, m9, 1;
     pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
-    movmskps        r14, m15;
+    movmskps        r6, m15;
     ;end weak / strong decision
 
     ; weak filter nd_p/q calculation
@@ -388,19 +355,15 @@ ALIGN 16
     psrld            m8, 16
     paddw            m8, m10
     movd            r7d, m8
-    and              r7, 0xffff; 1dp0 + 1dp3
     pshufd           m8, m8, 0x4E
     movd            r8d, m8
-    and              r8, 0xffff; 0dp0 + 0dp3
 
     pshufd           m8, m11, 0x31
     psrld            m8, 16
     paddw            m8, m11
     movd            r9d, m8
-    and              r9, 0xffff; 1dq0 + 1dq3
     pshufd           m8, m8, 0x4E
     movd           r10d, m8
-    and             r10, 0xffff; 0dq0 + 0dq3
     ; end calc for weak filter
 
     ; filtering mask
@@ -422,14 +385,13 @@ ALIGN 16
     shl             r11, %1 - 8
 %endif
     movd             m8, r11d; tc0
-    add             tcq, 4;
-    mov             r3d, [tcq];
+    mov             r3d, [tcq+4];
 %if %1 > 8
     shl              r3, %1 - 8
 %endif
-    movd             m9, r3d; tc1
     add            r11d, r3d; tc0 + tc1
     jz             .bypassluma
+    movd             m9, r3d; tc1
     punpcklwd        m8, m8
     punpcklwd        m9, m9
     shufps           m8, m9, 0; tc0, tc1
@@ -453,7 +415,7 @@ ALIGN 16
     psraw           m13, 3; beta >> 3
     pcmpgtw         m13, m12;
     movmskps        r11, m13;
-    and             r14, r11; strong mask , beta_2 and beta_3 comparisons
+    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
     ;----beta_3 comparison end-----
     ;----tc25 comparison---
     psubw           m12, m3, m4;      p0 - q0
@@ -464,23 +426,23 @@ ALIGN 16
 
     pcmpgtw          m8, m12; tc25 comparisons
     movmskps        r11, m8;
-    and             r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
+    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
     ;----tc25 comparison end---
-    mov             r11, r14;
+    mov             r11, r6;
     shr             r11, 1;
-    and             r14, r11; strong mask, bits 2 and 0
+    and             r6, r11; strong mask, bits 2 and 0
 
     pmullw          m14, m9, [pw_m2]; -tc * 2
     paddw            m9, m9
 
-    and             r14, 5; 0b101
-    mov             r11, r14; strong mask
-    shr             r14, 2;
-    movd            m12, r14d; store to xmm for mask generation
-    shl             r14, 1
+    and             r6, 5; 0b101
+    mov             r11, r6; strong mask
+    shr             r6, 2;
+    movd            m12, r6d; store to xmm for mask generation
+    shl             r6, 1
     and             r11, 1
     movd            m10, r11d; store to xmm for mask generation
-    or              r14, r11; final strong mask, bits 1 and 0
+    or              r6, r11; final strong mask, bits 1 and 0
     jz      .weakfilter
 
     shufps          m10, m12, 0
@@ -565,16 +527,16 @@ ALIGN 16
     MASKED_COPY      m3, m12
 
 .weakfilter:
-    not             r14; strong mask -> weak mask
-    and             r14, r13; final weak filtering mask, bits 0 and 1
+    not             r6; strong mask -> weak mask
+    and             r6, r13; final weak filtering mask, bits 0 and 1
     jz             .store
 
     ; weak filtering mask
-    mov             r11, r14
+    mov             r11, r6
     shr             r11, 1
     movd            m12, r11d
-    and             r14, 1
-    movd            m11, r14d
+    and             r6, 1
+    movd            m11, r6d
     shufps          m11, m12, 0
     pcmpeqd         m11, [pd_1]; filtering mask
 
@@ -609,7 +571,11 @@ ALIGN 16
     pminsw          m12, m9;  av_clip(delta0, -tc, tc)
 
     psraw            m9, 1;   tc -> tc / 2
+%if cpuflag(ssse3)
+    psignw          m14, m9, [pw_m1]; -tc / 2
+%else
     pmullw          m14, m9, [pw_m1]; -tc / 2
+%endif
 
     pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
     psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
@@ -620,7 +586,7 @@ ALIGN 16
     paddw           m15, m2; p1'
 
     ;beta calculations
-    movd            m10, betaq
+    movd            m10, betad
     SPLATW          m10, m10, 0
 
     movd            m13, r7d; 1dp0 + 1dp3
@@ -658,117 +624,161 @@ ALIGN 16
     MASKED_COPY      m4, m8
 %endmacro
 
-INIT_XMM sse2
 ;-----------------------------------------------------------------------------
-; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
-    sub              r0, 2
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8B_LOAD  PASS8ROWS(r4, r0, r1, r5)
+%macro LOOP_FILTER_CHROMA 0
+cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 2
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 8
-    TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     RET
 
-cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r4, r0
-    add              r0, r5
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(r4, r0, r1, r5)
+cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
     CHROMA_DEBLOCK_BODY 10
-    TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
+    RET
+
+cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
+    sub            pixq, 4
+    lea       r3strideq, [3*strideq]
+    mov           pix0q, pixq
+    add            pixq, r3strideq
+    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
+    CHROMA_DEBLOCK_BODY 12
+    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
     RET
 
 ;-----------------------------------------------------------------------------
-; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
+; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
 ;                                   uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
-    mov              r5, r0; pix
-    sub              r5, r1
-    sub              r5, r1
-    movh             m0, [r5];      p1
-    movh             m1, [r5 + r1]; p0
-    movh             m2, [r0];      q0
-    movh             m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
+    mov           pix0q, pixq
+    sub           pix0q, strideq
+    sub           pix0q, strideq
+    movq             m0, [pix0q];    p1
+    movq             m1, [pix0q+strideq]; p0
+    movq             m2, [pixq];    q0
+    movq             m3, [pixq+strideq]; q1
     pxor             m5, m5; zeros reg
     punpcklbw        m0, m5
     punpcklbw        m1, m5
     punpcklbw        m2, m5
     punpcklbw        m3, m5
     CHROMA_DEBLOCK_BODY  8
-    packuswb          m1, m2
-    movh       [r5 + r1], m1
-    movhps          [r0], m1
+    packuswb         m1, m2
+    movh[pix0q+strideq], m1
+    movhps       [pixq], m1
     RET
 
-cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
-    mov             r5, r0; pix
-    sub             r5, r1
-    sub             r5, r1
-    movdqu          m0, [r5];      p1
-    movdqu          m1, [r5+r1];   p0
-    movdqu          m2, [r0];      q0
-    movdqu          m3, [r0 + r1]; q1
+cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
     CHROMA_DEBLOCK_BODY 10
     pxor            m5, m5; zeros reg
-    CLIPW           m1, m5, [pw_pixel_max]
-    CLIPW           m2, m5, [pw_pixel_max]
-    movdqu   [r5 + r1], m1
-    movdqu        [r0], m2
+    CLIPW           m1, m5, [pw_pixel_max_10]
+    CLIPW           m2, m5, [pw_pixel_max_10]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
+    RET
+
+cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
+    mov          pix0q, pixq
+    sub          pix0q, strideq
+    sub          pix0q, strideq
+    movu            m0, [pix0q];    p1
+    movu            m1, [pix0q+strideq]; p0
+    movu            m2, [pixq];    q0
+    movu            m3, [pixq+strideq]; q1
+    CHROMA_DEBLOCK_BODY 12
+    pxor            m5, m5; zeros reg
+    CLIPW           m1, m5, [pw_pixel_max_12]
+    CLIPW           m2, m5, [pw_pixel_max_12]
+    movu [pix0q+strideq], m1
+    movu        [pixq], m2
     RET
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_CHROMA
+INIT_XMM avx
+LOOP_FILTER_CHROMA
 
 %if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
-    sub              r0, 4
-    lea              r5, [3 * r1]
-    mov              r6, r0
-    add              r0, r5
-    TRANSPOSE8x8B_LOAD  PASS8ROWS(r6, r0, r1, r5)
+cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 4
+    lea           pix0q, [3 * r1]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
     LUMA_DEBLOCK_BODY 8, v
 .store:
-    TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
 .bypassluma:
     RET
 
-cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
+cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     sub            pixq, 8
-    lea              r5, [3 * strideq]
-    mov              r6, pixq
-    add            pixq, r5
-    TRANSPOSE8x8W_LOAD  PASS8ROWS(r6, pixq, strideq, r5)
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
     LUMA_DEBLOCK_BODY 10, v
 .store:
-    TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
+.bypassluma:
+    RET
+
+cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    sub            pixq, 8
+    lea           pix0q, [3 * strideq]
+    mov     src3strideq, pixq
+    add            pixq, pix0q
+    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
+    LUMA_DEBLOCK_BODY 12, v
+.store:
+    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
 .bypassluma:
     RET
 
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
-;                                 int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
 ;-----------------------------------------------------------------------------
-cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea     src3strideq, [3 * strideq]
     mov           pix0q, pixq
     sub           pix0q, src3strideq
     sub           pix0q, strideq
-    movdqu           m0, [pix0q];               p3
-    movdqu           m1, [pix0q +     strideq]; p2
-    movdqu           m2, [pix0q + 2 * strideq]; p1
-    movdqu           m3, [pix0q + src3strideq]; p0
-    movdqu           m4, [pixq];                q0
-    movdqu           m5, [pixq +     strideq];  q1
-    movdqu           m6, [pixq + 2 * strideq];  q2
-    movdqu           m7, [pixq + src3strideq];  q3
+    movq             m0, [pix0q];               p3
+    movq             m1, [pix0q +     strideq]; p2
+    movq             m2, [pix0q + 2 * strideq]; p1
+    movq             m3, [pix0q + src3strideq]; p0
+    movq             m4, [pixq];                q0
+    movq             m5, [pixq +     strideq];  q1
+    movq             m6, [pixq + 2 * strideq];  q2
+    movq             m7, [pixq + src3strideq];  q3
     pxor             m8, m8
     punpcklbw        m0, m8
     punpcklbw        m1, m8
@@ -783,16 +793,16 @@ cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0
     packuswb          m1, m2
     packuswb          m3, m4
     packuswb          m5, m6
-    movh   [r5 +     r1], m1
-    movhps [r5 + 2 * r1], m1
-    movh   [r5 +     r6], m3
-    movhps [r0         ], m3
-    movh   [r0 +     r1], m5
-    movhps [r0 + 2 * r1], m5
+    movh   [pix0q +     strideq], m1
+    movhps [pix0q + 2 * strideq], m1
+    movh   [pix0q + src3strideq], m3
+    movhps [pixq               ], m3
+    movh   [pixq  +     strideq], m5
+    movhps [pixq  + 2 * strideq], m5
 .bypassluma:
     RET
 
-cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
+cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
     lea                  src3strideq, [3 * strideq]
     mov                        pix0q, pixq
     sub                        pix0q, src3strideq
@@ -808,12 +818,43 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     LUMA_DEBLOCK_BODY             10, h
 .store:
     pxor                          m8, m8; zeros reg
-    CLIPW                         m1, m8, [pw_pixel_max]
-    CLIPW                         m2, m8, [pw_pixel_max]
-    CLIPW                         m3, m8, [pw_pixel_max]
-    CLIPW                         m4, m8, [pw_pixel_max]
-    CLIPW                         m5, m8, [pw_pixel_max]
-    CLIPW                         m6, m8, [pw_pixel_max]
+    CLIPW                         m1, m8, [pw_pixel_max_10]
+    CLIPW                         m2, m8, [pw_pixel_max_10]
+    CLIPW                         m3, m8, [pw_pixel_max_10]
+    CLIPW                         m4, m8, [pw_pixel_max_10]
+    CLIPW                         m5, m8, [pw_pixel_max_10]
+    CLIPW                         m6, m8, [pw_pixel_max_10]
+    movdqu     [pix0q +     strideq], m1;  p2
+    movdqu     [pix0q + 2 * strideq], m2;  p1
+    movdqu     [pix0q + src3strideq], m3;  p0
+    movdqu     [pixq               ], m4;  q0
+    movdqu     [pixq  +     strideq], m5;  q1
+    movdqu     [pixq  + 2 * strideq], m6;  q2
+.bypassluma:
+    RET
+
+cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
+    lea                  src3strideq, [3 * strideq]
+    mov                        pix0q, pixq
+    sub                        pix0q, src3strideq
+    sub                        pix0q, strideq
+    movdqu                        m0, [pix0q];               p3
+    movdqu                        m1, [pix0q +     strideq]; p2
+    movdqu                        m2, [pix0q + 2 * strideq]; p1
+    movdqu                        m3, [pix0q + src3strideq]; p0
+    movdqu                        m4, [pixq];                q0
+    movdqu                        m5, [pixq  +     strideq]; q1
+    movdqu                        m6, [pixq  + 2 * strideq]; q2
+    movdqu                        m7, [pixq  + src3strideq]; q3
+    LUMA_DEBLOCK_BODY             12, h
+.store:
+    pxor                          m8, m8; zeros reg
+    CLIPW                         m1, m8, [pw_pixel_max_12]
+    CLIPW                         m2, m8, [pw_pixel_max_12]
+    CLIPW                         m3, m8, [pw_pixel_max_12]
+    CLIPW                         m4, m8, [pw_pixel_max_12]
+    CLIPW                         m5, m8, [pw_pixel_max_12]
+    CLIPW                         m6, m8, [pw_pixel_max_12]
     movdqu     [pix0q +     strideq], m1;  p2
     movdqu     [pix0q + 2 * strideq], m2;  p1
     movdqu     [pix0q + src3strideq], m3;  p0
@@ -822,4 +863,13 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
     movdqu     [pixq  + 2 * strideq], m6;  q2
 .bypassluma:
     RET
+
+%endmacro
+
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
+INIT_XMM avx
+LOOP_FILTER_LUMA
 %endif
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
new file mode 100644
index 0000000000..481726d217
--- /dev/null
+++ b/libavcodec/x86/hevc_idct.asm
@@ -0,0 +1,122 @@
+; /*
+; * SIMD optimized idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; * Copyright (c) 2014 James Almer
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT 32
+
+; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
+; %1 = HxW
+; %2 = number of loops
+; %3 = bitdepth
+%macro IDCT_DC 3
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq]
+    add               tmpw, ((1 << 14-%3) + 1)
+    sar               tmpw, (15-%3)
+    movd               xm0, tmpd
+    SPLATW              m0, xm0
+    DEFINE_ARGS coeff, cnt
+    mov               cntd, %2
+.loop:
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+    add  coeffq, mmsize*8
+    dec  cntd
+    jg  .loop
+    RET
+%endmacro
+
+; %1 = HxW
+; %2 = bitdepth
+%macro IDCT_DC_NL 2 ; No loop
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpq, word [coeffq]
+    add               tmpw, ((1 << 14-%2) + 1)
+    sar               tmpw, (15-%2)
+    movd                m0, tmpd
+    SPLATW              m0, xm0
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
+%endmacro
+
+; 8-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,      8
+IDCT_DC     8,  2,  8
+
+INIT_XMM sse2
+IDCT_DC_NL  8,      8
+IDCT_DC    16,  4,  8
+IDCT_DC    32, 16,  8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2,  8
+IDCT_DC    32,  8,  8
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 10-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     10
+IDCT_DC     8,  2, 10
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     10
+IDCT_DC    16,  4, 10
+IDCT_DC    32, 16, 10
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 10
+IDCT_DC    32,  8, 10
+%endif ;HAVE_AVX2_EXTERNAL
+
+; 12-bit
+INIT_MMX mmxext
+IDCT_DC_NL  4,     12
+IDCT_DC     8,  2, 12
+
+INIT_XMM sse2
+IDCT_DC_NL  8,     12
+IDCT_DC    16,  4, 12
+IDCT_DC    32, 16, 12
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+IDCT_DC    16,  2, 12
+IDCT_DC    32,  8, 12
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
new file mode 100644
index 0000000000..eaa070c342
--- /dev/null
+++ b/libavcodec/x86/hevc_mc.asm
@@ -0,0 +1,1385 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_8:                   times 8 dw (1 << 9)
+pw_10:                  times 8 dw (1 << 11)
+pw_12:                  times 8 dw (1 << 13)
+pw_bi_8:                times 8 dw (1 << 8)
+pw_bi_10:               times 8 dw (1 << 10)
+pw_bi_12:               times 8 dw (1 << 12)
+max_pixels_10:          times 8  dw ((1 << 10)-1)
+max_pixels_12:          times 8  dw ((1 << 12)-1)
+zero:                   times 4  dd 0
+one_per_32:             times 4  dd 1
+
+SECTION .text
+%macro EPEL_TABLE 4
+hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+                        times %2 d%3 10, -2
+                        times %2 d%3 -4, 54
+                        times %2 d%3 16, -2
+                        times %2 d%3 -6, 46
+                        times %2 d%3 28, -4
+                        times %2 d%3 -4, 36
+                        times %2 d%3 36, -4
+                        times %2 d%3 -4, 28
+                        times %2 d%3 46, -6
+                        times %2 d%3 -2, 16
+                        times %2 d%3 54, -4
+                        times %2 d%3 -2, 10
+                        times %2 d%3 58, -2
+%endmacro
+
+
+
+EPEL_TABLE  8, 8, b, sse4
+EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE 12, 4, w, sse4
+
+%macro QPEL_TABLE 4
+hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
+                        times %2 d%3 -10, 58
+                        times %2 d%3  17, -5
+                        times %2 d%3   1,  0
+                        times %2 d%3  -1,  4
+                        times %2 d%3 -11, 40
+                        times %2 d%3  40,-11
+                        times %2 d%3   4, -1
+                        times %2 d%3   0,  1
+                        times %2 d%3  -5, 17
+                        times %2 d%3  58,-10
+                        times %2 d%3   4, -1
+%endmacro
+
+QPEL_TABLE  8, 8, b, sse4
+QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE 12, 4, w, sse4
+
+%define MAX_PB_SIZE  64
+
+%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
+
+%if ARCH_X86_64
+
+%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
+%if %1 <= 4
+    movq              %3, [%2]                                              ; load data from source2
+%elif %1 <= 8
+    movdqa            %3, [%2]                                              ; load data from source2
+%elif %1 <= 12
+    movdqa            %3, [%2]                                              ; load data from source2
+    movq              %4, [%2+16]                                           ; load data from source2
+%else
+    movdqa            %3, [%2]                                              ; load data from source2
+    movdqa            %4, [%2+16]                                           ; load data from source2
+%endif
+%endmacro
+
+%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+    movd              %4, [%3]                                               ; load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+    movq              %4, [%3]                                               ; load data from source
+%else
+    movdqu            %4, [%3]                                               ; load data from source
+%endif
+%endmacro
+
+%macro SIMPLE_8LOAD 5    ;width, bitd, tab, r1, r2
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+    movq              %4, [%3]                                              ; load data from source2
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+    movdqa            %4, [%3]                                              ; load data from source2
+%elif %1 <= 12
+    movdqa            %4, [%3]                                              ; load data from source2
+    movq              %5, [%3+16]                                           ; load data from source2
+%else
+    movdqa            %4, [%3]                                              ; load data from source2
+    movdqa            %5, [%3+16]                                           ; load data from source2
+%endif
+%endmacro
+
+%macro EPEL_FILTER 2-4                            ; bit depth, filter index
+%ifdef PIC
+    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+%else
+    %define rfilterq hevc_epel_filters_sse4_%1
+%endif
+    sub              %2q, 1
+    shl              %2q, 5                      ; multiply by 32
+%if %0 == 2
+    movdqa           m14, [rfilterq + %2q]        ; get 2 first values of filters
+    movdqa           m15, [rfilterq + %2q+16]     ; get 2 last values of filters
+%else
+    movdqa           %3, [rfilterq + %2q]        ; get 2 first values of filters
+    movdqa           %4, [rfilterq + %2q+16]     ; get 2 last values of filters
+%endif
+%endmacro
+
+%macro EPEL_HV_FILTER 1
+%ifdef PIC
+    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+%else
+    %define rfilterq hevc_epel_filters_sse4_%1
+%endif
+    sub              mxq, 1
+    sub              myq, 1
+    shl              mxq, 5                      ; multiply by 32
+    shl              myq, 5                      ; multiply by 32
+    movdqa           m14, [rfilterq + mxq]        ; get 2 first values of filters
+    movdqa           m15, [rfilterq + mxq+16]     ; get 2 last values of filters
+    lea           r3srcq, [srcstrideq*3]
+
+%ifdef PIC
+    lea         rfilterq, [hevc_epel_filters_sse4_10]
+%else
+    %define rfilterq hevc_epel_filters_sse4_10
+%endif
+    movdqa           m12, [rfilterq + myq]        ; get 2 first values of filters
+    movdqa           m13, [rfilterq + myq+16]     ; get 2 last values of filters
+%endmacro
+
+%macro QPEL_FILTER 2
+%ifdef PIC
+    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
+%else
+    %define rfilterq hevc_qpel_filters_sse4_%1
+%endif
+    lea              %2q, [%2q*8-8]
+    movdqa           m12, [rfilterq + %2q*8]       ; get 4 first values of filters
+    movdqa           m13, [rfilterq + %2q*8 + 16]  ; get 4 first values of filters
+    movdqa           m14, [rfilterq + %2q*8 + 32]  ; get 4 first values of filters
+    movdqa           m15, [rfilterq + %2q*8 + 48]  ; get 4 first values of filters
+%endmacro
+
+%macro EPEL_LOAD 4
+%ifdef PIC
+    lea rfilterq, [%2]
+%else
+    %define rfilterq %2
+%endif
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
+
+    %%load            m0, [rfilterq ]
+%ifnum %3
+    %%load            m1, [rfilterq+  %3]
+    %%load            m2, [rfilterq+2*%3]
+    %%load            m3, [rfilterq+3*%3]
+%else
+    %%load            m1, [rfilterq+  %3q]
+    %%load            m2, [rfilterq+2*%3q]
+    %%load            m3, [rfilterq+r3srcq]
+%endif
+
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 10
+    SBUTTERFLY        bw, 2, 3, 10
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 10
+    SBUTTERFLY        wd, 2, 3, 10
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+%endif
+%endif
+%endmacro
+
+
+%macro QPEL_H_LOAD 4
+%assign %%stride (%1+7)/8
+%if %1 == 8
+%if %3 <= 4
+%define %%load movd
+%elif %3 == 8
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
+%else
+%if %3 == 2
+%define %%load movd
+%elif %3 == 4
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
+%endif
+    %%load            m0, [%2-3*%%stride]        ;load data from source
+    %%load            m1, [%2-2*%%stride]
+    %%load            m2, [%2-%%stride  ]
+    %%load            m3, [%2           ]
+    %%load            m4, [%2+%%stride  ]
+    %%load            m5, [%2+2*%%stride]
+    %%load            m6, [%2+3*%%stride]
+    %%load            m7, [%2+4*%%stride]
+
+%if %1 == 8
+%if %3 > 8
+    SBUTTERFLY        wd, 0, 1, %4
+    SBUTTERFLY        wd, 2, 3, %4
+    SBUTTERFLY        wd, 4, 5, %4
+    SBUTTERFLY        wd, 6, 7, %4
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%else
+%if %3 > 4
+    SBUTTERFLY        dq, 0, 1, %4
+    SBUTTERFLY        dq, 2, 3, %4
+    SBUTTERFLY        dq, 4, 5, %4
+    SBUTTERFLY        dq, 6, 7, %4
+%else
+    punpckldq         m0, m1
+    punpckldq         m2, m3
+    punpckldq         m4, m5
+    punpckldq         m6, m7
+%endif
+%endif
+%endmacro
+
+%macro QPEL_V_LOAD 5
+    lea              %5q, [%2]
+    sub              %5q, r3srcq
+    movdqu            m0, [%5q            ]      ;load x- 3*srcstride
+    movdqu            m1, [%5q+   %3q     ]      ;load x- 2*srcstride
+    movdqu            m2, [%5q+ 2*%3q     ]      ;load x-srcstride
+    movdqu            m3, [%2       ]      ;load x
+    movdqu            m4, [%2+   %3q]      ;load x+stride
+    movdqu            m5, [%2+ 2*%3q]      ;load x+2*stride
+    movdqu            m6, [%2+r3srcq]      ;load x+3*stride
+    movdqu            m7, [%2+ 4*%3q]      ;load x+4*stride
+%if %1 == 8
+%if %4 > 8
+    SBUTTERFLY        bw, 0, 1, 8
+    SBUTTERFLY        bw, 2, 3, 8
+    SBUTTERFLY        bw, 4, 5, 8
+    SBUTTERFLY        bw, 6, 7, 8
+%else
+    punpcklbw         m0, m1
+    punpcklbw         m2, m3
+    punpcklbw         m4, m5
+    punpcklbw         m6, m7
+%endif
+%else
+%if %4 > 4
+    SBUTTERFLY        wd, 0, 1, 8
+    SBUTTERFLY        wd, 2, 3, 8
+    SBUTTERFLY        wd, 4, 5, 8
+    SBUTTERFLY        wd, 6, 7, 8
+%else
+    punpcklwd         m0, m1
+    punpcklwd         m2, m3
+    punpcklwd         m4, m5
+    punpcklwd         m6, m7
+%endif
+%endif
+%endmacro
+
+%macro PEL_12STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_12STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_12STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_12STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_12STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_12STORE16 3
+    PEL_12STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endmacro
+
+%macro PEL_10STORE2 3
+    movd           [%1], %2
+%endmacro
+%macro PEL_10STORE4 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_10STORE6 3
+    movq           [%1], %2
+    psrldq            %2, 8
+    movd         [%1+8], %2
+%endmacro
+%macro PEL_10STORE8 3
+    movdqa         [%1], %2
+%endmacro
+%macro PEL_10STORE12 3
+    movdqa         [%1], %2
+    movq        [%1+16], %3
+%endmacro
+%macro PEL_10STORE16 3
+    PEL_10STORE8      %1, %2, %3
+    movdqa       [%1+16], %3
+%endmacro
+
+%macro PEL_8STORE2 3
+    pextrw          [%1], %2, 0
+%endmacro
+%macro PEL_8STORE4 3
+    movd            [%1], %2
+%endmacro
+%macro PEL_8STORE6 3
+    movd            [%1], %2
+    pextrw        [%1+4], %2, 2
+%endmacro
+%macro PEL_8STORE8 3
+    movq           [%1], %2
+%endmacro
+%macro PEL_8STORE12 3
+    movq            [%1], %2
+    psrldq            %2, 8
+    movd          [%1+8], %2
+%endmacro
+%macro PEL_8STORE16 3
+    movdqa          [%1], %2
+%endmacro
+
+%macro LOOP_END 3
+    add              %1q, 2*MAX_PB_SIZE          ; dst += dststride
+    add              %2q, %3q                    ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+%endmacro
+
+
+%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
+%if %2 == 8
+%if %1 > 8
+    punpckhbw         m1, m0, m2
+    psllw             m1, 14-%2
+%endif
+    punpcklbw         m0, m2
+%endif
+    psllw             m0, 14-%2
+%endmacro
+
+
+%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
+%if %1 == 8
+    pmaddubsw         m0, %3   ;x1*c1+x2*c2
+    pmaddubsw         m2, %4   ;x3*c3+x4*c4
+    paddw             m0, m2
+%if %2 > 8
+    pmaddubsw         m1, %3
+    pmaddubsw         m3, %4
+    paddw             m1, m3
+%endif
+%else
+    pmaddwd           m0, %3
+    pmaddwd           m2, %4
+    paddd             m0, m2
+%if %2 > 4
+    pmaddwd           m1, %3
+    pmaddwd           m3, %4
+    paddd             m1, m3
+%endif
+%if %1 != 8
+    psrad             m0, %1-8
+    psrad             m1, %1-8
+%endif
+    packssdw          m0, m1
+%endif
+%endmacro
+
+%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
+%ifdef PIC
+    lea         rfilterq, [hevc_qpel_filters_sse4_%2]
+%else
+    %define rfilterq hevc_qpel_filters_sse4_%2
+%endif
+
+%if %2 == 8
+    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
+    pmaddubsw         m2, [rfilterq + %3q*8+16]   ;x3*c3+x4*c4
+    pmaddubsw         m4, [rfilterq + %3q*8+32]   ;x5*c5+x6*c6
+    pmaddubsw         m6, [rfilterq + %3q*8+48]   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%else
+    pmaddwd           m0, [rfilterq + %3q*8   ]
+    pmaddwd           m2, [rfilterq + %3q*8+16]
+    pmaddwd           m4, [rfilterq + %3q*8+32]
+    pmaddwd           m6, [rfilterq + %3q*8+48]
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, [rfilterq + %3q*8   ]
+    pmaddwd           m3, [rfilterq + %3q*8+16]
+    pmaddwd           m5, [rfilterq + %3q*8+32]
+    pmaddwd           m7, [rfilterq + %3q*8+48]
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+    p%4               m0, m1
+%endif
+%endmacro
+
+%macro QPEL_COMPUTE 2     ; width, bitdepth
+%if %2 == 8
+    pmaddubsw         m0, m12   ;x1*c1+x2*c2
+    pmaddubsw         m2, m13   ;x3*c3+x4*c4
+    pmaddubsw         m4, m14   ;x5*c5+x6*c6
+    pmaddubsw         m6, m15   ;x7*c7+x8*c8
+    paddw             m0, m2
+    paddw             m4, m6
+    paddw             m0, m4
+%if %1 > 8
+    pmaddubsw         m1, m12
+    pmaddubsw         m3, m13
+    pmaddubsw         m5, m14
+    pmaddubsw         m7, m15
+    paddw             m1, m3
+    paddw             m5, m7
+    paddw             m1, m5
+%endif
+%else
+    pmaddwd           m0, m12
+    pmaddwd           m2, m13
+    pmaddwd           m4, m14
+    pmaddwd           m6, m15
+    paddd             m0, m2
+    paddd             m4, m6
+    paddd             m0, m4
+%if %2 != 8
+    psrad             m0, %2-8
+%endif
+%if %1 > 4
+    pmaddwd           m1, m12
+    pmaddwd           m3, m13
+    pmaddwd           m5, m14
+    pmaddwd           m7, m15
+    paddd             m1, m3
+    paddd             m5, m7
+    paddd             m1, m5
+%if %2 != 8
+    psrad             m1, %2-8
+%endif
+%endif
+%endif
+%endmacro
+
+%macro BI_COMPUTE 7     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+    paddsw            %3, %5
+%if %1 > 8
+    paddsw            %4, %6
+%endif
+    UNI_COMPUTE       %1, %2, %3, %4, %7
+%endmacro
+
+%macro UNI_COMPUTE 5
+    pmulhrsw          %3, %5
+%if %1 > 8 || (%2 > 8 && %1 > 4)
+    pmulhrsw          %4, %5
+%endif
+%if %2 == 8
+    packuswb          %3, %4
+%else
+    pminsw            %3, [max_pixels_%2]
+    pmaxsw            %3, [zero]
+%if %1 > 8
+    pminsw            %4, [max_pixels_%2]
+    pmaxsw            %4, [zero]
+%endif
+%endif
+%endmacro
+
+INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
+; ******************************
+; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
+;                         uint8_t *_src, ptrdiff_t _srcstride,
+;                         int height, int mx, int my)
+; ******************************
+
+%macro HEVC_PUT_HEVC_PEL_PIXELS 2
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
+    pxor               m2, m2
+.loop
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    MC_PIXEL_COMPUTE  %1, %2
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+.loop
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
+    pxor              m2, m2
+    movdqa            m5, [pw_bi_%2]
+.loop
+    SIMPLE_LOAD       %1, %2, srcq, m0
+    SIMPLE_BILOAD     %1, src2q, m3, m4
+    MC_PIXEL_COMPUTE  %1, %2
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+%endmacro
+
+
+; ******************************
+; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int width, int height, int mx, int my,
+;                       int16_t* mcbuffer)
+; ******************************
+
+
+%macro HEVC_PUT_HEVC_EPEL 2
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 6, dst, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    EPEL_FILTER       %2, mx, m4, m5
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    PEL_10STORE%1      dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
+    movdqa            m6, [pw_%2]
+    EPEL_FILTER       %2, mx, m4, m5
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 7, dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m6, [pw_bi_%2]
+    EPEL_FILTER       %2, mx, m4, m5
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+; ******************************
+; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+;                      uint8_t *_src, ptrdiff_t _srcstride,
+;                      int width, int height, int mx, int my,
+;                      int16_t* mcbuffer)
+; ******************************
+
+cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 6, dst, src, srcstride, height, r3src, my, rfilter
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5
+.loop
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
+    lea           r3srcq, [srcstrideq*3]
+    movdqa            m6, [pw_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5
+.loop
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    UNI_COMPUTE       %1, %2, m0, m1, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+    lea           r3srcq, [srcstrideq*3]
+    movdqa            m6, [pw_bi_%2]
+    sub             srcq, srcstrideq
+    EPEL_FILTER       %2, my, m4, m5
+.loop
+    EPEL_LOAD         %2, srcq, srcstride, %1
+    EPEL_COMPUTE      %2, %1, m4, m5
+    SIMPLE_BILOAD     %1, src2q, m2, m3
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+
+; ******************************
+; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int width, int height, int mx, int my)
+; ******************************
+
+%macro HEVC_PUT_HEVC_EPEL_HV 2
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 12 , dst, src, srcstride, height, mx, my, r3src, rfilter
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+    PEL_10STORE%1     dstq, m0, m1
+    movdqa            m4, m5
+    movdqa            m5, m6
+    movdqa            m6, m7
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+    movdqa            m4, m5
+    movdqa            m5, m6
+    movdqa            m6, m7
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+%assign %%stride ((%2 + 7)/8)
+    sub             srcq, srcstrideq
+    EPEL_HV_FILTER    %2
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m4, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m5, m0
+    add             srcq, srcstrideq
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m6, m0
+    add             srcq, srcstrideq
+.loop
+    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
+    EPEL_COMPUTE      %2, %1, m14, m15
+    SWAP              m7, m0
+    punpcklwd         m0, m4, m5
+    punpcklwd         m2, m6, m7
+%if %1 > 4
+    punpckhwd         m1, m4, m5
+    punpckhwd         m3, m6, m7
+%endif
+    EPEL_COMPUTE      14, %1, m12, m13
+    SIMPLE_BILOAD     %1, src2q, m8, m9
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+    movdqa            m4, m5
+    movdqa            m5, m6
+    movdqa            m6, m7
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+; ******************************
+; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int width, int height, int mx, int my)
+; ******************************
+
+%macro HEVC_PUT_HEVC_QPEL 2
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 15, dst, src, srcstride, height, mx, rfilter
+    QPEL_FILTER       %2, mx
+.loop
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END          dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
+    movdqa            m9, [pw_%2]
+    QPEL_FILTER       %2, mx
+.loop
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
+    movdqa            m9, [pw_bi_%2]
+    QPEL_FILTER       %2, mx
+.loop
+    QPEL_H_LOAD       %2, srcq, %1, 10
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+
+; ******************************
+; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int width, int height, int mx, int my)
+; ******************************
+
+cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 15, dst, src, srcstride, height, r3src, my, rfilter
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    PEL_10STORE%1     dstq, m0, m1
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
+    movdqa            m9, [pw_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    UNI_COMPUTE       %1, %2, m0, m1, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+    movdqa            m9, [pw_bi_%2]
+    lea           r3srcq, [srcstrideq*3]
+    QPEL_FILTER       %2, my
+.loop
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
+    QPEL_COMPUTE      %1, %2
+%if %2 > 8
+    packssdw          m0, m1
+%endif
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+
+; ******************************
+; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
+;                       uint8_t *_src, ptrdiff_t _srcstride,
+;                       int height, int mx, int my)
+; ******************************
+%macro HEVC_PUT_HEVC_QPEL_HV 2
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
+    lea              mxq, [mxq*8-8]
+    lea              myq, [myq*8-8]
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    PEL_10STORE%1     dstq, m0, m1
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    LOOP_END         dst, src, srcstride
+    RET
+
+cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+    lea              mxq, [mxq*8-8]
+    lea              myq, [myq*8-8]
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
+    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+    lea              mxq, [mxq*8-8]
+    lea              myq, [myq*8-8]
+    lea           r3srcq, [srcstrideq*3]
+    sub             srcq, r3srcq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m8, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP              m9, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m10, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m11, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m12, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m13, m0
+    add             srcq, srcstrideq
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m14, m0
+    add             srcq, srcstrideq
+.loop
+    QPEL_H_LOAD       %2, srcq, %1, 15
+    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
+    SWAP             m15, m0
+    punpcklwd         m0, m8, m9
+    punpcklwd         m2, m10, m11
+    punpcklwd         m4, m12, m13
+    punpcklwd         m6, m14, m15
+%if %1 > 4
+    punpckhwd         m1, m8, m9
+    punpckhwd         m3, m10, m11
+    punpckhwd         m5, m12, m13
+    punpckhwd         m7, m14, m15
+%endif
+    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
+    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
+    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    PEL_%2STORE%1   dstq, m0, m1
+
+%if %1 <= 4
+    movq              m8, m9
+    movq              m9, m10
+    movq             m10, m11
+    movq             m11, m12
+    movq             m12, m13
+    movq             m13, m14
+    movq             m14, m15
+%else
+    movdqa            m8, m9
+    movdqa            m9, m10
+    movdqa           m10, m11
+    movdqa           m11, m12
+    movdqa           m12, m13
+    movdqa           m13, m14
+    movdqa           m14, m15
+%endif
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, srcstrideq             ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+%macro WEIGHTING_FUNCS 2
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
+    mov             r4d, denomm
+%define SHIFT  r4d
+%else
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
+%define SHIFT  denomd
+%endif
+    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
+%if %1 <= 4
+    pxor             m1, m1
+%endif
+    movd             m2, wxm        ; WX
+    movd             m4, SHIFT      ; shift
+%if %1 <= 4
+    punpcklwd        m2, m1
+%else
+    punpcklwd        m2, m2
+%endif
+    dec           SHIFT
+    movdqu           m5, [one_per_32]
+    movd             m6, SHIFT
+    pshufd           m2, m2, 0
+    mov           SHIFT, oxm
+    pslld            m5, m6
+%if %2 != 8
+    shl           SHIFT, %2-8       ; ox << (bitd - 8)
+%endif
+    movd             m3, SHIFT      ; OX
+    pshufd           m3, m3, 0
+%if WIN64 || ARCH_X86_32
+    mov           SHIFT, heightm
+%endif
+.loop
+   SIMPLE_LOAD        %1, 10, srcq, m0
+%if %1 <= 4
+    punpcklwd         m0, m1
+    pmaddwd           m0, m2
+    paddd             m0, m5
+    psrad             m0, m4
+    paddd             m0, m3
+%else
+    pmulhw            m6, m0, m2
+    pmullw            m0, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    paddd             m0, m5
+    paddd             m1, m5
+    psrad             m0, m4
+    psrad             m1, m4
+    paddd             m0, m3
+    paddd             m1, m3
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+    pminsw            m0, [max_pixels_%2]
+    pmaxsw            m0, [zero]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    dec          heightd                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+
+cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
+    mov              r6d, denomm
+%if %1 <= 4
+    pxor              m1, m1
+%endif
+    movd              m2, wx0m         ; WX0
+    lea              r6d, [r6d+14-%2]  ; shift = 14 - bitd + denom
+    movd              m3, wx1m         ; WX1
+    movd              m0, r6d          ; shift
+%if %1 <= 4
+    punpcklwd         m2, m1
+    punpcklwd         m3, m1
+%else
+    punpcklwd         m2, m2
+    punpcklwd         m3, m3
+%endif
+    inc              r6d
+    movd              m5, r6d          ; shift+1
+    pshufd            m2, m2, 0
+    mov              r6d, ox0m
+    pshufd            m3, m3, 0
+    add              r6d, ox1m
+%if %2 != 8
+    shl              r6d, %2-8         ; ox << (bitd - 8)
+%endif
+    inc              r6d
+    movd              m4, r6d          ; offset
+    pshufd            m4, m4, 0
+    mov              r6d, heightm
+    pslld             m4, m0
+
+.loop
+   SIMPLE_LOAD        %1, 10, srcq,  m0
+   SIMPLE_LOAD        %1, 10, src2q, m8
+%if %1 <= 4
+    punpcklwd         m0, m1
+    punpcklwd         m8, m1
+    pmaddwd           m0, m3
+    pmaddwd           m8, m2
+    paddd             m0, m4
+    paddd             m0, m8
+    psrad             m0, m5
+%else
+    pmulhw            m6, m0, m3
+    pmullw            m0, m3
+    pmulhw            m7, m8, m2
+    pmullw            m8, m2
+    punpckhwd         m1, m0, m6
+    punpcklwd         m0, m6
+    punpckhwd         m9, m8, m7
+    punpcklwd         m8, m7
+    paddd             m0, m8
+    paddd             m1, m9
+    paddd             m0, m4
+    paddd             m1, m4
+    psrad             m0, m5
+    psrad             m1, m5
+%endif
+    packssdw          m0, m1
+%if %2 == 8
+    packuswb          m0, m0
+%else
+    pminsw            m0, [max_pixels_%2]
+    pmaxsw            m0, [zero]
+%endif
+    PEL_%2STORE%1   dstq, m0, m1
+    add             dstq, dststrideq             ; dst += dststride
+    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
+    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
+    dec              r6d                         ; cmp height
+    jnz               .loop                      ; height loop
+    RET
+%endmacro
+
+WEIGHTING_FUNCS 2, 8
+WEIGHTING_FUNCS 4, 8
+WEIGHTING_FUNCS 6, 8
+WEIGHTING_FUNCS 8, 8
+
+WEIGHTING_FUNCS 2, 10
+WEIGHTING_FUNCS 4, 10
+WEIGHTING_FUNCS 6, 10
+WEIGHTING_FUNCS 8, 10
+
+WEIGHTING_FUNCS 2, 12
+WEIGHTING_FUNCS 4, 12
+WEIGHTING_FUNCS 6, 12
+WEIGHTING_FUNCS 8, 12
+
+HEVC_PUT_HEVC_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_PEL_PIXELS  6, 8
+HEVC_PUT_HEVC_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
+
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+
+HEVC_PUT_HEVC_PEL_PIXELS 2, 12
+HEVC_PUT_HEVC_PEL_PIXELS 4, 12
+HEVC_PUT_HEVC_PEL_PIXELS 6, 12
+HEVC_PUT_HEVC_PEL_PIXELS 8, 12
+
+HEVC_PUT_HEVC_EPEL 2,  8
+HEVC_PUT_HEVC_EPEL 4,  8
+HEVC_PUT_HEVC_EPEL 6,  8
+HEVC_PUT_HEVC_EPEL 8,  8
+HEVC_PUT_HEVC_EPEL 12, 8
+HEVC_PUT_HEVC_EPEL 16, 8
+
+
+HEVC_PUT_HEVC_EPEL 2, 10
+HEVC_PUT_HEVC_EPEL 4, 10
+HEVC_PUT_HEVC_EPEL 6, 10
+HEVC_PUT_HEVC_EPEL 8, 10
+
+HEVC_PUT_HEVC_EPEL 2, 12
+HEVC_PUT_HEVC_EPEL 4, 12
+HEVC_PUT_HEVC_EPEL 6, 12
+HEVC_PUT_HEVC_EPEL 8, 12
+
+HEVC_PUT_HEVC_EPEL_HV 2,  8
+HEVC_PUT_HEVC_EPEL_HV 4,  8
+HEVC_PUT_HEVC_EPEL_HV 6,  8
+HEVC_PUT_HEVC_EPEL_HV 8,  8
+
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
+
+HEVC_PUT_HEVC_EPEL_HV 2, 12
+HEVC_PUT_HEVC_EPEL_HV 4, 12
+HEVC_PUT_HEVC_EPEL_HV 6, 12
+HEVC_PUT_HEVC_EPEL_HV 8, 12
+
+HEVC_PUT_HEVC_QPEL 4,  8
+HEVC_PUT_HEVC_QPEL 8,  8
+HEVC_PUT_HEVC_QPEL 12, 8
+HEVC_PUT_HEVC_QPEL 16, 8
+
+HEVC_PUT_HEVC_QPEL 4, 10
+HEVC_PUT_HEVC_QPEL 8, 10
+
+HEVC_PUT_HEVC_QPEL 4, 12
+HEVC_PUT_HEVC_QPEL 8, 12
+
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_QPEL_HV 2, 12
+HEVC_PUT_HEVC_QPEL_HV 4, 12
+HEVC_PUT_HEVC_QPEL_HV 6, 12
+HEVC_PUT_HEVC_QPEL_HV 8, 12
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
new file mode 100644
index 0000000000..24b88eebab
--- /dev/null
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -0,0 +1,388 @@
+; /*
+; * Provide SIMD optimizations for transform_add functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+max_pixels_10:          times 16  dw ((1 << 10)-1)
+
+
+SECTION .text
+
+;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+%macro TR_ADD_MMX_4_8 0
+    mova              m2, [r1]
+    mova              m4, [r1+8]
+    pxor              m3, m3
+    psubw             m3, m2
+    packuswb          m2, m2
+    packuswb          m3, m3
+    pxor              m5, m5
+    psubw             m5, m4
+    packuswb          m4, m4
+    packuswb          m5, m5
+
+    movh              m0, [r0     ]
+    movh              m1, [r0+r2  ]
+    paddusb           m0, m2
+    paddusb           m1, m4
+    psubusb           m0, m3
+    psubusb           m1, m5
+    movh       [r0     ], m0
+    movh       [r0+r2  ], m1
+%endmacro
+
+
+INIT_MMX mmxext
+; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add4_8, 3, 4, 6
+    TR_ADD_MMX_4_8
+    add               r1, 16
+    lea               r0, [r0+r2*2]
+    TR_ADD_MMX_4_8
+    RET
+
+%macro TR_ADD_SSE_8_8 0
+    pxor              m3, m3
+    mova              m4, [r1]
+    mova              m6, [r1+16]
+    mova              m0, [r1+32]
+    mova              m2, [r1+48]
+    psubw             m5, m3, m4
+    psubw             m7, m3, m6
+    psubw             m1, m3, m0
+    packuswb          m4, m0
+    packuswb          m5, m1
+    psubw             m3, m2
+    packuswb          m6, m2
+    packuswb          m7, m3
+
+    movq                m0, [r0     ]
+    movq                m1, [r0+r2  ]
+    movhps              m0, [r0+r2*2]
+    movhps              m1, [r0+r3  ]
+    paddusb             m0, m4
+    paddusb             m1, m6
+    psubusb             m0, m5
+    psubusb             m1, m7
+    movq         [r0     ], m0
+    movq         [r0+r2  ], m1
+    movhps       [r0+2*r2], m0
+    movhps       [r0+r3  ], m1
+%endmacro
+
+%macro TR_ADD_SSE_16_32_8 3
+    mova             xm2, [r1+%1   ]
+    mova             xm6, [r1+%1+16]
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [r1+%1+32], 1
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
+%if cpuflag(avx)
+    psubw             m1, m0, m2
+    psubw             m5, m0, m6
+%else
+    mova              m1, m0
+    mova              m5, m0
+    psubw             m1, m2
+    psubw             m5, m6
+%endif
+    packuswb          m2, m6
+    packuswb          m1, m5
+
+    mova             xm4, [r1+%1+mmsize*2   ]
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [r1+%1+96 ], 1
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
+%if cpuflag(avx)
+    psubw             m3, m0, m4
+    psubw             m5, m0, m6
+%else
+    mova              m3, m0
+    mova              m5, m0
+    psubw             m3, m4
+    psubw             m5, m6
+%endif
+    packuswb          m4, m6
+    packuswb          m3, m5
+
+    paddusb           m2, [%2]
+    paddusb           m4, [%3]
+    psubusb           m2, m1
+    psubusb           m4, m3
+    mova            [%2], m2
+    mova            [%3], m4
+%endmacro
+
+
+%macro TRANSFORM_ADD_8 0
+; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add8_8, 3, 4, 8
+    lea               r3, [r2*3]
+    TR_ADD_SSE_8_8
+    add               r1, 64
+    lea               r0, [r0+r2*4]
+    TR_ADD_SSE_8_8
+    RET
+
+; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add16_8, 3, 4, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%rep 3
+    add                r1, 128
+    lea                r0, [r0+r2*4]
+    TR_ADD_SSE_16_32_8  0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+%endrep
+    RET
+
+; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor               m0, m0
+    TR_ADD_SSE_16_32_8  0, r0,    r0+16
+    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%rep 15
+    add                r1, 128
+    lea                r0, [r0+r2*2]
+    TR_ADD_SSE_16_32_8  0, r0,    r0+16
+    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+%endrep
+    RET
+%endmacro
+
+INIT_XMM sse2
+TRANSFORM_ADD_8
+INIT_XMM avx
+TRANSFORM_ADD_8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
+    add                r1, 256
+    lea                r0, [r0+r2*4]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+    RET
+%endif
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+%macro TR_ADD_SSE_8_10 4
+    mova              m0, [%4]
+    mova              m1, [%4+16]
+    mova              m2, [%4+32]
+    mova              m3, [%4+48]
+    paddw             m0, [%1+0   ]
+    paddw             m1, [%1+%2  ]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3  ]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+    mova       [%1+%2*2], m2
+    mova       [%1+%3  ], m3
+%endmacro
+
+%macro TR_ADD_MMX4_10 3
+    mova              m0, [%1+0   ]
+    mova              m1, [%1+%2  ]
+    paddw             m0, [%3]
+    paddw             m1, [%3+8]
+    CLIPW             m0, m2, m3
+    CLIPW             m1, m2, m3
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+%endmacro
+
+%macro TRANS_ADD_SSE_16_10 3
+    mova              m0, [%3]
+    mova              m1, [%3+16]
+    mova              m2, [%3+32]
+    mova              m3, [%3+48]
+    paddw             m0, [%1      ]
+    paddw             m1, [%1+16   ]
+    paddw             m2, [%1+%2   ]
+    paddw             m3, [%1+%2+16]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova      [%1      ], m0
+    mova      [%1+16   ], m1
+    mova      [%1+%2   ], m2
+    mova      [%1+%2+16], m3
+%endmacro
+
+%macro TRANS_ADD_SSE_32_10 2
+    mova              m0, [%2]
+    mova              m1, [%2+16]
+    mova              m2, [%2+32]
+    mova              m3, [%2+48]
+
+    paddw             m0, [%1   ]
+    paddw             m1, [%1+16]
+    paddw             m2, [%1+32]
+    paddw             m3, [%1+48]
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova         [%1   ], m0
+    mova         [%1+16], m1
+    mova         [%1+32], m2
+    mova         [%1+48], m3
+%endmacro
+
+%macro TRANS_ADD16_AVX2 4
+    mova              m0, [%4]
+    mova              m1, [%4+32]
+    mova              m2, [%4+64]
+    mova              m3, [%4+96]
+
+    paddw             m0, [%1+0   ]
+    paddw             m1, [%1+%2  ]
+    paddw             m2, [%1+%2*2]
+    paddw             m3, [%1+%3  ]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova       [%1+0   ], m0
+    mova       [%1+%2  ], m1
+    mova       [%1+%2*2], m2
+    mova       [%1+%3  ], m3
+%endmacro
+
+%macro TRANS_ADD32_AVX2 3
+    mova              m0, [%3]
+    mova              m1, [%3+32]
+    mova              m2, [%3+64]
+    mova              m3, [%3+96]
+
+    paddw             m0, [%1      ]
+    paddw             m1, [%1+32   ]
+    paddw             m2, [%1+%2   ]
+    paddw             m3, [%1+%2+32]
+
+    CLIPW             m0, m4, m5
+    CLIPW             m1, m4, m5
+    CLIPW             m2, m4, m5
+    CLIPW             m3, m4, m5
+    mova      [%1      ], m0
+    mova      [%1+32   ], m1
+    mova      [%1+%2   ], m2
+    mova      [%1+%2+32], m3
+%endmacro
+
+
+INIT_MMX mmxext
+cglobal hevc_transform_add4_10,3,4, 6
+    pxor              m2, m2
+    mova              m3, [max_pixels_10]
+    TR_ADD_MMX4_10     r0, r2, r1
+    add               r1, 16
+    lea               r0, [r0+2*r2]
+    TR_ADD_MMX4_10     r0, r2, r1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal hevc_transform_add8_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+    lea               r3, [r2*3]
+
+    TR_ADD_SSE_8_10      r0, r2, r3, r1
+    lea               r0, [r0+r2*4]
+    add               r1, 64
+    TR_ADD_SSE_8_10      r0, r2, r3, r1
+    RET
+
+cglobal hevc_transform_add16_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    TRANS_ADD_SSE_16_10 r0, r2, r1
+%rep 7
+    lea                 r0, [r0+r2*2]
+    add                 r1, 64
+    TRANS_ADD_SSE_16_10 r0, r2, r1
+%endrep
+    RET
+
+cglobal hevc_transform_add32_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    TRANS_ADD_SSE_32_10 r0, r1
+%rep 31
+    lea                 r0, [r0+r2]
+    add                 r1, 64
+    TRANS_ADD_SSE_32_10 r0, r1
+%endrep
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+
+cglobal hevc_transform_add16_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+    lea               r3, [r2*3]
+
+    TRANS_ADD16_AVX2  r0, r2, r3, r1
+%rep 3
+    lea               r0, [r0+r2*4]
+    add               r1, 128
+    TRANS_ADD16_AVX2  r0, r2, r3, r1
+%endrep
+    RET
+
+cglobal hevc_transform_add32_10,3,4,6
+    pxor              m4, m4
+    mova              m5, [max_pixels_10]
+
+    TRANS_ADD32_AVX2  r0, r2, r1
+%rep 15
+    lea               r0, [r0+r2*2]
+    add               r1, 128
+    TRANS_ADD32_AVX2  r0, r2, r1
+%endrep
+    RET
+%endif ;HAVE_AVX_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
new file mode 100644
index 0000000000..8dea1428f0
--- /dev/null
+++ b/libavcodec/x86/hevcdsp.h
@@ -0,0 +1,156 @@
+/*
+ * HEVC video decoder
+ *
+ * Copyright (C) 2012 - 2013 Guillaume Martres
+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
+ *
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_HEVCDSP_H
+#define AVCODEC_X86_HEVCDSP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+
+#define idct_dc_proto(size, bitd, opt) \
+                void ff_hevc_idct##size##_dc_add_##bitd##_##opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+
+#define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
+dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
+dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
+dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \
+dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \
+dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt
+
+
+#define PEL_PROTOTYPE(name, D, opt) \
+void ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); \
+void ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width); \
+void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, int denom, int wx0, int wx1, int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// MC functions
+///////////////////////////////////////////////////////////////////////////////
+
+#define EPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##6,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define QPEL_PROTOTYPES(fname, bitd, opt) \
+        PEL_PROTOTYPE(fname##4,  bitd, opt); \
+        PEL_PROTOTYPE(fname##8,  bitd, opt); \
+        PEL_PROTOTYPE(fname##12, bitd, opt); \
+        PEL_PROTOTYPE(fname##16, bitd, opt); \
+        PEL_PROTOTYPE(fname##24, bitd, opt); \
+        PEL_PROTOTYPE(fname##32, bitd, opt); \
+        PEL_PROTOTYPE(fname##48, bitd, opt); \
+        PEL_PROTOTYPE(fname##64, bitd, opt)
+
+#define WEIGHTING_PROTOTYPE(width, bitd, opt) \
+void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, int height, int denom,  int _wx, int _ox); \
+void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, int16_t *_src2, int height, int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)
+
+#define WEIGHTING_PROTOTYPES(bitd, opt) \
+        WEIGHTING_PROTOTYPE(2, bitd, opt); \
+        WEIGHTING_PROTOTYPE(4, bitd, opt); \
+        WEIGHTING_PROTOTYPE(6, bitd, opt); \
+        WEIGHTING_PROTOTYPE(8, bitd, opt); \
+        WEIGHTING_PROTOTYPE(12, bitd, opt); \
+        WEIGHTING_PROTOTYPE(16, bitd, opt); \
+        WEIGHTING_PROTOTYPE(24, bitd, opt); \
+        WEIGHTING_PROTOTYPE(32, bitd, opt); \
+        WEIGHTING_PROTOTYPE(48, bitd, opt); \
+        WEIGHTING_PROTOTYPE(64, bitd, opt)
+
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL_PIXELS EPEL_PIXELS
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(pel_pixels ,  8, sse4);
+EPEL_PROTOTYPES(pel_pixels , 10, sse4);
+EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+///////////////////////////////////////////////////////////////////////////////
+// EPEL
+///////////////////////////////////////////////////////////////////////////////
+EPEL_PROTOTYPES(epel_h ,  8, sse4);
+EPEL_PROTOTYPES(epel_h , 10, sse4);
+EPEL_PROTOTYPES(epel_h , 12, sse4);
+
+EPEL_PROTOTYPES(epel_v ,  8, sse4);
+EPEL_PROTOTYPES(epel_v , 10, sse4);
+EPEL_PROTOTYPES(epel_v , 12, sse4);
+
+EPEL_PROTOTYPES(epel_hv ,  8, sse4);
+EPEL_PROTOTYPES(epel_hv , 10, sse4);
+EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// QPEL
+///////////////////////////////////////////////////////////////////////////////
+QPEL_PROTOTYPES(qpel_h ,  8, sse4);
+QPEL_PROTOTYPES(qpel_h , 10, sse4);
+QPEL_PROTOTYPES(qpel_h , 12, sse4);
+
+QPEL_PROTOTYPES(qpel_v,  8, sse4);
+QPEL_PROTOTYPES(qpel_v, 10, sse4);
+QPEL_PROTOTYPES(qpel_v, 12, sse4);
+
+QPEL_PROTOTYPES(qpel_hv,  8, sse4);
+QPEL_PROTOTYPES(qpel_hv, 10, sse4);
+QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+
+WEIGHTING_PROTOTYPES(8, sse4);
+WEIGHTING_PROTOTYPES(10, sse4);
+WEIGHTING_PROTOTYPES(12, sse4);
+
+///////////////////////////////////////////////////////////////////////////////
+// TRANSFORM_ADD
+///////////////////////////////////////////////////////////////////////////////
+void ff_hevc_transform_add4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+void ff_hevc_transform_add16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
+#endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 04203c22a0..eaa97e1434 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -2,29 +2,31 @@
  * Copyright (c) 2013 Seppo Tomperi
  * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
  *
- * This file is part of Libav.
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
-
 #include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-
+#include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
 #include "libavcodec/hevcdsp.h"
+#include "libavcodec/x86/hevcdsp.h"
 
 #define LFC_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
@@ -32,40 +34,627 @@ void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix,
 #define LFL_FUNC(DIR, DEPTH, OPT) \
 void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
 
-#define LFC_FUNCS(type, depth) \
-    LFC_FUNC(h, depth, sse2)   \
-    LFC_FUNC(v, depth, sse2)
+#define LFC_FUNCS(type, depth, opt) \
+    LFC_FUNC(h, depth, opt)  \
+    LFC_FUNC(v, depth, opt)
+
+#define LFL_FUNCS(type, depth, opt) \
+    LFL_FUNC(h, depth, opt)  \
+    LFL_FUNC(v, depth, opt)
+
+LFC_FUNCS(uint8_t,   8, sse2)
+LFC_FUNCS(uint8_t,  10, sse2)
+LFC_FUNCS(uint8_t,  12, sse2)
+LFC_FUNCS(uint8_t,   8, avx)
+LFC_FUNCS(uint8_t,  10, avx)
+LFC_FUNCS(uint8_t,  12, avx)
+LFL_FUNCS(uint8_t,   8, sse2)
+LFL_FUNCS(uint8_t,  10, sse2)
+LFL_FUNCS(uint8_t,  12, sse2)
+LFL_FUNCS(uint8_t,   8, ssse3)
+LFL_FUNCS(uint8_t,  10, ssse3)
+LFL_FUNCS(uint8_t,  12, ssse3)
+LFL_FUNCS(uint8_t,   8, avx)
+LFL_FUNCS(uint8_t,  10, avx)
+LFL_FUNCS(uint8_t,  12, avx)
+
+#define IDCT_FUNCS(W, opt) \
+void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)
+
+IDCT_FUNCS(4x4,   mmxext);
+IDCT_FUNCS(8x8,   mmxext);
+IDCT_FUNCS(8x8,   sse2);
+IDCT_FUNCS(16x16, sse2);
+IDCT_FUNCS(32x32, sse2);
+IDCT_FUNCS(16x16, avx2);
+IDCT_FUNCS(32x32, avx2);
+
+#define mc_rep_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                                                 \
+                                                uint8_t *_src, ptrdiff_t _srcstride, int height,                \
+                                                intptr_t mx, intptr_t my, int width)                            \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    int16_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) {                                                                             \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst = _dst + i;                                                                                         \
+        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);            \
+    }                                                                                                           \
+}
+#define mc_rep_uni_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,                        \
+                                                    uint8_t *_src, ptrdiff_t _srcstride, int height,            \
+                                                    intptr_t mx, intptr_t my, int width)                        \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t *src;                                                                                               \
+    uint8_t *dst;                                                                                               \
+    for (i = 0; i < W; i += step) {                                                                             \
+        src = _src + (i * ((bitd + 7) / 8));                                                                    \
+        dst = _dst + (i * ((bitd + 7) / 8));                                                                    \
+        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                     \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+#define mc_rep_bi_func(name, bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src,          \
+                                                   ptrdiff_t _srcstride, int16_t* _src2,                        \
+                                                   int height, intptr_t mx, intptr_t my, int width)             \
+{                                                                                                               \
+    int i;                                                                                                      \
+    uint8_t  *src;                                                                                              \
+    uint8_t  *dst;                                                                                              \
+    int16_t  *src2;                                                                                             \
+    for (i = 0; i < W ; i += step) {                                                                            \
+        src  = _src + (i * ((bitd + 7) / 8));                                                                   \
+        dst  = _dst + (i * ((bitd + 7) / 8));                                                                   \
+        src2 = _src2 + i;                                                                                       \
+        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                \
+                                                          height, mx, my, width);                               \
+    }                                                                                                           \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt)        \
+    mc_rep_func(name, bitd, step, W, opt);            \
+    mc_rep_uni_func(name, bitd, step, W, opt);        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                                                  \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,                \
+                                                 intptr_t mx, intptr_t my, int width)                           \
+{                                                                                                               \
+    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width);               \
+    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)),              \
+                                                    _srcstride, height, mx, my, width);                         \
+}
+#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,                         \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height,            \
+                                                     intptr_t mx, intptr_t my, int width)                       \
+{                                                                                                               \
+    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
+    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,            \
+                                                        src + (step1 * ((bitd + 7) / 8)), _srcstride,           \
+                                                        height, mx, my, width);                                 \
+}
+#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
+void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,            \
+                                                    ptrdiff_t _srcstride, int16_t* src2,                        \
+                                                    int height, intptr_t mx, intptr_t my, int width)            \
+{                                                                                                               \
+    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
+    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride,             \
+                                                       src + (step1 * ((bitd + 7) / 8)), _srcstride,            \
+                                                       src2 + step1, height, mx, my, width);                    \
+}
+
+#define mc_rep_funcs(name, bitd, step, W, opt)        \
+    mc_rep_func(name, bitd, step, W, opt);            \
+    mc_rep_uni_func(name, bitd, step, W, opt);        \
+    mc_rep_bi_func(name, bitd, step, W, opt)
+
+#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
+    mc_rep_func2(name, bitd, step1, step2, W, opt);     \
+    mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \
+    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
+
+#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
-#define LFL_FUNCS(type, depth) \
-    LFL_FUNC(h, depth, ssse3)  \
-    LFL_FUNC(v, depth, ssse3)
+mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
+mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
+mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
+mc_rep_funcs(pel_pixels, 8,  8, 24, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 64, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 48, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 32, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 24, sse4);
+mc_rep_funcs(pel_pixels,10,  8, 16, sse4);
+mc_rep_funcs(pel_pixels,10,  4, 12, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 64, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 48, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 32, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 24, sse4);
+mc_rep_funcs(pel_pixels,12,  8, 16, sse4);
+mc_rep_funcs(pel_pixels,12,  4, 12, sse4);
+
+mc_rep_funcs(epel_h, 8, 16, 64, sse4);
+mc_rep_funcs(epel_h, 8, 16, 48, sse4);
+mc_rep_funcs(epel_h, 8, 16, 32, sse4);
+mc_rep_funcs(epel_h, 8,  8, 24, sse4);
+mc_rep_funcs(epel_h,10,  8, 64, sse4);
+mc_rep_funcs(epel_h,10,  8, 48, sse4);
+mc_rep_funcs(epel_h,10,  8, 32, sse4);
+mc_rep_funcs(epel_h,10,  8, 24, sse4);
+mc_rep_funcs(epel_h,10,  8, 16, sse4);
+mc_rep_funcs(epel_h,10,  4, 12, sse4);
+mc_rep_funcs(epel_h,12,  8, 64, sse4);
+mc_rep_funcs(epel_h,12,  8, 48, sse4);
+mc_rep_funcs(epel_h,12,  8, 32, sse4);
+mc_rep_funcs(epel_h,12,  8, 24, sse4);
+mc_rep_funcs(epel_h,12,  8, 16, sse4);
+mc_rep_funcs(epel_h,12,  4, 12, sse4);
+mc_rep_funcs(epel_v, 8, 16, 64, sse4);
+mc_rep_funcs(epel_v, 8, 16, 48, sse4);
+mc_rep_funcs(epel_v, 8, 16, 32, sse4);
+mc_rep_funcs(epel_v, 8,  8, 24, sse4);
+mc_rep_funcs(epel_v,10,  8, 64, sse4);
+mc_rep_funcs(epel_v,10,  8, 48, sse4);
+mc_rep_funcs(epel_v,10,  8, 32, sse4);
+mc_rep_funcs(epel_v,10,  8, 24, sse4);
+mc_rep_funcs(epel_v,10,  8, 16, sse4);
+mc_rep_funcs(epel_v,10,  4, 12, sse4);
+mc_rep_funcs(epel_v,12,  8, 64, sse4);
+mc_rep_funcs(epel_v,12,  8, 48, sse4);
+mc_rep_funcs(epel_v,12,  8, 32, sse4);
+mc_rep_funcs(epel_v,12,  8, 24, sse4);
+mc_rep_funcs(epel_v,12,  8, 16, sse4);
+mc_rep_funcs(epel_v,12,  4, 12, sse4);
+mc_rep_funcs(epel_hv, 8,  8, 64, sse4);
+mc_rep_funcs(epel_hv, 8,  8, 48, sse4);
+mc_rep_funcs(epel_hv, 8,  8, 32, sse4);
+mc_rep_funcs(epel_hv, 8,  8, 24, sse4);
+mc_rep_funcs(epel_hv, 8,  8, 16, sse4);
+mc_rep_funcs2(epel_hv,8,  8,  4, 12, sse4);
+mc_rep_funcs(epel_hv,10,  8, 64, sse4);
+mc_rep_funcs(epel_hv,10,  8, 48, sse4);
+mc_rep_funcs(epel_hv,10,  8, 32, sse4);
+mc_rep_funcs(epel_hv,10,  8, 24, sse4);
+mc_rep_funcs(epel_hv,10,  8, 16, sse4);
+mc_rep_funcs(epel_hv,10,  4, 12, sse4);
+mc_rep_funcs(epel_hv,12,  8, 64, sse4);
+mc_rep_funcs(epel_hv,12,  8, 48, sse4);
+mc_rep_funcs(epel_hv,12,  8, 32, sse4);
+mc_rep_funcs(epel_hv,12,  8, 24, sse4);
+mc_rep_funcs(epel_hv,12,  8, 16, sse4);
+mc_rep_funcs(epel_hv,12,  4, 12, sse4);
+
+mc_rep_funcs(qpel_h, 8, 16, 64, sse4);
+mc_rep_funcs(qpel_h, 8, 16, 48, sse4);
+mc_rep_funcs(qpel_h, 8, 16, 32, sse4);
+mc_rep_funcs(qpel_h, 8,  8, 24, sse4);
+mc_rep_funcs(qpel_h,10,  8, 64, sse4);
+mc_rep_funcs(qpel_h,10,  8, 48, sse4);
+mc_rep_funcs(qpel_h,10,  8, 32, sse4);
+mc_rep_funcs(qpel_h,10,  8, 24, sse4);
+mc_rep_funcs(qpel_h,10,  8, 16, sse4);
+mc_rep_funcs(qpel_h,10,  4, 12, sse4);
+mc_rep_funcs(qpel_h,12,  8, 64, sse4);
+mc_rep_funcs(qpel_h,12,  8, 48, sse4);
+mc_rep_funcs(qpel_h,12,  8, 32, sse4);
+mc_rep_funcs(qpel_h,12,  8, 24, sse4);
+mc_rep_funcs(qpel_h,12,  8, 16, sse4);
+mc_rep_funcs(qpel_h,12,  4, 12, sse4);
+mc_rep_funcs(qpel_v, 8, 16, 64, sse4);
+mc_rep_funcs(qpel_v, 8, 16, 48, sse4);
+mc_rep_funcs(qpel_v, 8, 16, 32, sse4);
+mc_rep_funcs(qpel_v, 8,  8, 24, sse4);
+mc_rep_funcs(qpel_v,10,  8, 64, sse4);
+mc_rep_funcs(qpel_v,10,  8, 48, sse4);
+mc_rep_funcs(qpel_v,10,  8, 32, sse4);
+mc_rep_funcs(qpel_v,10,  8, 24, sse4);
+mc_rep_funcs(qpel_v,10,  8, 16, sse4);
+mc_rep_funcs(qpel_v,10,  4, 12, sse4);
+mc_rep_funcs(qpel_v,12,  8, 64, sse4);
+mc_rep_funcs(qpel_v,12,  8, 48, sse4);
+mc_rep_funcs(qpel_v,12,  8, 32, sse4);
+mc_rep_funcs(qpel_v,12,  8, 24, sse4);
+mc_rep_funcs(qpel_v,12,  8, 16, sse4);
+mc_rep_funcs(qpel_v,12,  4, 12, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 64, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 48, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 32, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 24, sse4);
+mc_rep_funcs(qpel_hv, 8,  8, 16, sse4);
+mc_rep_funcs2(qpel_hv,8,  8,  4, 12, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 64, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 48, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 32, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 24, sse4);
+mc_rep_funcs(qpel_hv,10,  8, 16, sse4);
+mc_rep_funcs(qpel_hv,10,  4, 12, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 64, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 48, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 32, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 24, sse4);
+mc_rep_funcs(qpel_hv,12,  8, 16, sse4);
+mc_rep_funcs(qpel_hv,12,  4, 12, sse4);
+
+#define mc_rep_uni_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\
+                                               int height, int denom,  int _wx, int _ox)                                \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) {                                                                                     \
+        src= _src + i;                                                                                                  \
+        dst= _dst + (i * ((bitd + 7) / 8));                                                                             \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                                  \
+                                                     height, denom, _wx, _ox);                                          \
+    }                                                                                                                   \
+}
+
+mc_rep_uni_w(8, 6, 12, sse4);
+mc_rep_uni_w(8, 8, 16, sse4);
+mc_rep_uni_w(8, 8, 24, sse4);
+mc_rep_uni_w(8, 8, 32, sse4);
+mc_rep_uni_w(8, 8, 48, sse4);
+mc_rep_uni_w(8, 8, 64, sse4);
+
+mc_rep_uni_w(10, 6, 12, sse4);
+mc_rep_uni_w(10, 8, 16, sse4);
+mc_rep_uni_w(10, 8, 24, sse4);
+mc_rep_uni_w(10, 8, 32, sse4);
+mc_rep_uni_w(10, 8, 48, sse4);
+mc_rep_uni_w(10, 8, 64, sse4);
+
+mc_rep_uni_w(12, 6, 12, sse4);
+mc_rep_uni_w(12, 8, 16, sse4);
+mc_rep_uni_w(12, 8, 24, sse4);
+mc_rep_uni_w(12, 8, 32, sse4);
+mc_rep_uni_w(12, 8, 48, sse4);
+mc_rep_uni_w(12, 8, 64, sse4);
+
+#define mc_rep_bi_w(bitd, step, W, opt) \
+void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
+                                              int16_t *_src2, int height,                                               \
+                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)                      \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    int16_t *src2;                                                                                                      \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < W; i += step) {                                                                                     \
+        src  = _src  + i;                                                                                               \
+        src2 = _src2 + i;                                                                                               \
+        dst  = _dst  + (i * ((bitd + 7) / 8));                                                                          \
+        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                             \
+                                                    height, denom, _wx0, _wx1, _ox0, _ox1);                             \
+    }                                                                                                                   \
+}
+
+mc_rep_bi_w(8, 6, 12, sse4);
+mc_rep_bi_w(8, 8, 16, sse4);
+mc_rep_bi_w(8, 8, 24, sse4);
+mc_rep_bi_w(8, 8, 32, sse4);
+mc_rep_bi_w(8, 8, 48, sse4);
+mc_rep_bi_w(8, 8, 64, sse4);
+
+mc_rep_bi_w(10, 6, 12, sse4);
+mc_rep_bi_w(10, 8, 16, sse4);
+mc_rep_bi_w(10, 8, 24, sse4);
+mc_rep_bi_w(10, 8, 32, sse4);
+mc_rep_bi_w(10, 8, 48, sse4);
+mc_rep_bi_w(10, 8, 64, sse4);
+
+mc_rep_bi_w(12, 6, 12, sse4);
+mc_rep_bi_w(12, 8, 16, sse4);
+mc_rep_bi_w(12, 8, 24, sse4);
+mc_rep_bi_w(12, 8, 32, sse4);
+mc_rep_bi_w(12, 8, 48, sse4);
+mc_rep_bi_w(12, 8, 64, sse4);
+
+#define mc_uni_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
+                                                      uint8_t *_src, ptrdiff_t _srcstride,          \
+                                                      int height, int denom,                        \
+                                                      int _wx, int _ox,                             \
+                                                      intptr_t mx, intptr_t my, int width)          \
+{                                                                                                   \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                            \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);     \
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, height, denom, _wx, _ox);\
+}
+
+#define mc_uni_w_funcs(name, bitd, opt)       \
+        mc_uni_w_func(name, bitd, 4, opt);    \
+        mc_uni_w_func(name, bitd, 8, opt);    \
+        mc_uni_w_func(name, bitd, 12, opt);   \
+        mc_uni_w_func(name, bitd, 16, opt);   \
+        mc_uni_w_func(name, bitd, 24, opt);   \
+        mc_uni_w_func(name, bitd, 32, opt);   \
+        mc_uni_w_func(name, bitd, 48, opt);   \
+        mc_uni_w_func(name, bitd, 64, opt)
+
+mc_uni_w_funcs(pel_pixels, 8, sse4);
+mc_uni_w_func(pel_pixels, 8, 6, sse4);
+mc_uni_w_funcs(epel_h, 8, sse4);
+mc_uni_w_func(epel_h, 8, 6, sse4);
+mc_uni_w_funcs(epel_v, 8, sse4);
+mc_uni_w_func(epel_v, 8, 6, sse4);
+mc_uni_w_funcs(epel_hv, 8, sse4);
+mc_uni_w_func(epel_hv, 8, 6, sse4);
+mc_uni_w_funcs(qpel_h, 8, sse4);
+mc_uni_w_funcs(qpel_v, 8, sse4);
+mc_uni_w_funcs(qpel_hv, 8, sse4);
+
+mc_uni_w_funcs(pel_pixels, 10, sse4);
+mc_uni_w_func(pel_pixels, 10, 6, sse4);
+mc_uni_w_funcs(epel_h, 10, sse4);
+mc_uni_w_func(epel_h, 10, 6, sse4);
+mc_uni_w_funcs(epel_v, 10, sse4);
+mc_uni_w_func(epel_v, 10, 6, sse4);
+mc_uni_w_funcs(epel_hv, 10, sse4);
+mc_uni_w_func(epel_hv, 10, 6, sse4);
+mc_uni_w_funcs(qpel_h, 10, sse4);
+mc_uni_w_funcs(qpel_v, 10, sse4);
+mc_uni_w_funcs(qpel_hv, 10, sse4);
+
+mc_uni_w_funcs(pel_pixels, 12, sse4);
+mc_uni_w_func(pel_pixels, 12, 6, sse4);
+mc_uni_w_funcs(epel_h, 12, sse4);
+mc_uni_w_func(epel_h, 12, 6, sse4);
+mc_uni_w_funcs(epel_v, 12, sse4);
+mc_uni_w_func(epel_v, 12, 6, sse4);
+mc_uni_w_funcs(epel_hv, 12, sse4);
+mc_uni_w_func(epel_hv, 12, 6, sse4);
+mc_uni_w_funcs(qpel_h, 12, sse4);
+mc_uni_w_funcs(qpel_v, 12, sse4);
+mc_uni_w_funcs(qpel_hv, 12, sse4);
+
+#define mc_bi_w_func(name, bitd, W, opt) \
+void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
+                                                     uint8_t *_src, ptrdiff_t _srcstride,            \
+                                                     int16_t *_src2,                                 \
+                                                     int height, int denom,                          \
+                                                     int _wx0, int _wx1, int _ox0, int _ox1,         \
+                                                     intptr_t mx, intptr_t my, int width)            \
+{                                                                                                    \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                             \
+    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
+    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, _src2,            \
+                                             height, denom, _wx0, _wx1, _ox0, _ox1);                 \
+}
+
+#define mc_bi_w_funcs(name, bitd, opt)       \
+        mc_bi_w_func(name, bitd, 4, opt);    \
+        mc_bi_w_func(name, bitd, 8, opt);    \
+        mc_bi_w_func(name, bitd, 12, opt);   \
+        mc_bi_w_func(name, bitd, 16, opt);   \
+        mc_bi_w_func(name, bitd, 24, opt);   \
+        mc_bi_w_func(name, bitd, 32, opt);   \
+        mc_bi_w_func(name, bitd, 48, opt);   \
+        mc_bi_w_func(name, bitd, 64, opt)
+
+mc_bi_w_funcs(pel_pixels, 8, sse4);
+mc_bi_w_func(pel_pixels, 8, 6, sse4);
+mc_bi_w_funcs(epel_h, 8, sse4);
+mc_bi_w_func(epel_h, 8, 6, sse4);
+mc_bi_w_funcs(epel_v, 8, sse4);
+mc_bi_w_func(epel_v, 8, 6, sse4);
+mc_bi_w_funcs(epel_hv, 8, sse4);
+mc_bi_w_func(epel_hv, 8, 6, sse4);
+mc_bi_w_funcs(qpel_h, 8, sse4);
+mc_bi_w_funcs(qpel_v, 8, sse4);
+mc_bi_w_funcs(qpel_hv, 8, sse4);
+
+mc_bi_w_funcs(pel_pixels, 10, sse4);
+mc_bi_w_func(pel_pixels, 10, 6, sse4);
+mc_bi_w_funcs(epel_h, 10, sse4);
+mc_bi_w_func(epel_h, 10, 6, sse4);
+mc_bi_w_funcs(epel_v, 10, sse4);
+mc_bi_w_func(epel_v, 10, 6, sse4);
+mc_bi_w_funcs(epel_hv, 10, sse4);
+mc_bi_w_func(epel_hv, 10, 6, sse4);
+mc_bi_w_funcs(qpel_h, 10, sse4);
+mc_bi_w_funcs(qpel_v, 10, sse4);
+mc_bi_w_funcs(qpel_hv, 10, sse4);
+
+mc_bi_w_funcs(pel_pixels, 12, sse4);
+mc_bi_w_func(pel_pixels, 12, 6, sse4);
+mc_bi_w_funcs(epel_h, 12, sse4);
+mc_bi_w_func(epel_h, 12, 6, sse4);
+mc_bi_w_funcs(epel_v, 12, sse4);
+mc_bi_w_func(epel_v, 12, 6, sse4);
+mc_bi_w_funcs(epel_hv, 12, sse4);
+mc_bi_w_func(epel_hv, 12, 6, sse4);
+mc_bi_w_funcs(qpel_h, 12, sse4);
+mc_bi_w_funcs(qpel_v, 12, sse4);
+mc_bi_w_funcs(qpel_hv, 12, sse4);
+#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
+
+
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )           \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
+#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)           \
+        PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt ); \
+        PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt ); \
+        PEL_LINK(pointer, 4, my , mx , fname##12,  bitd, opt ); \
+        PEL_LINK(pointer, 5, my , mx , fname##16,  bitd, opt ); \
+        PEL_LINK(pointer, 6, my , mx , fname##24,  bitd, opt ); \
+        PEL_LINK(pointer, 7, my , mx , fname##32,  bitd, opt ); \
+        PEL_LINK(pointer, 8, my , mx , fname##48,  bitd, opt ); \
+        PEL_LINK(pointer, 9, my , mx , fname##64,  bitd, opt )
 
-LFC_FUNCS(uint8_t, 8)
-LFC_FUNCS(uint8_t, 10)
-LFL_FUNCS(uint8_t, 8)
-LFL_FUNCS(uint8_t, 10)
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (bit_depth == 8) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
+            c->transform_add[0]    =  ff_hevc_transform_add4_8_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
+            }
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
+
+            c->transform_add[1]    = ff_hevc_transform_add8_8_sse2;
+            c->transform_add[2]    = ff_hevc_transform_add16_8_sse2;
+            c->transform_add[3]    = ff_hevc_transform_add32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
         }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels,  8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,      8, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,     8, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
+            }
+            c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
+            c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+
+            c->transform_add[3]    = ff_hevc_transform_add32_8_avx2;
+        }
     } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
+        }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
+            }
+
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
+
+            c->transform_add[1]    = ff_hevc_transform_add8_10_sse2;
+            c->transform_add[2]    = ff_hevc_transform_add16_10_sse2;
+            c->transform_add[3]    = ff_hevc_transform_add32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     10, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    10, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
+            }
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
+
+            c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
+            c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
+
+        }
+    } else if (bit_depth == 12) {
+        if (EXTERNAL_MMXEXT(cpu_flags)) {
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
+        }
+        if (EXTERNAL_SSE2(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
+            }
+
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
+        }
+        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
+            EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v,     12, sse4);
+            EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv,    12, sse4);
+
+            QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     12, sse4);
+            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    12, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
+            c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
+            if (ARCH_X86_64) {
+                c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
+                c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
+            }
+        }
+        if (EXTERNAL_AVX2(cpu_flags)) {
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
+        }
     }
 }
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 073f7f908e..2cef8e698c 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -1,20 +1,27 @@
 ;******************************************************************************
+;*
+;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
+;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
+;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
+;* Copyright (c) 2013 Daniel Kang
+;*
 ;* SIMD-optimized halfpel functions
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -22,26 +29,49 @@
 
 SECTION_RODATA
 cextern pb_1
+cextern pw_2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7
+
+cextern pw_8192
 
 SECTION_TEXT
 
 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
 cglobal put_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     add          r1, r4
     add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -99,6 +129,9 @@ INIT_MMX mmxext
 PUT_PIXELS_16
 INIT_MMX 3dnow
 PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
 
 
 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -191,20 +224,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
 cglobal put_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
     PAVGB        m2, m1
@@ -221,6 +258,9 @@ INIT_MMX mmxext
 PUT_PIXELS8_Y2
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
 
 
 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -334,26 +374,48 @@ AVG_PIXELS8
 
 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
 cglobal avg_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
+%if notcpuflag(mmxext)
+    pcmpeqd      m5, m5
+    paddb        m5, m5
+%endif
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m2, [r1+r2+1]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
+%else
+    PAVGB        m0, [r1+1], m3, m5
+    PAVGB        m2, [r1+r2+1], m4, m5
+%endif
     add          r0, r4
     add          r1, r4
-    PAVGB        m0, [r0]
-    PAVGB        m2, [r0+r2]
+    PAVGB        m0, [r0], m3, m5
+    PAVGB        m2, [r0+r2], m4, m5
     mova       [r0], m0
     mova    [r0+r2], m2
     add          r0, r4
@@ -362,40 +424,45 @@ cglobal avg_pixels8_x2, 4,5
     REP_RET
 %endmacro
 
+INIT_MMX mmx
+AVG_PIXELS8_X2
 INIT_MMX mmxext
 AVG_PIXELS8_X2
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
 
 
 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
 cglobal avg_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m0, m3
-    PAVGB        m1, m4
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m2, m3
-    PAVGB        m1, m4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -408,11 +475,16 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro AVG_PIXELS8_XY2 0
-cglobal avg_pixels8_xy2, 4,5
+; Note this is not correctly rounded, and is therefore used for
+; not-bitexact output
+%macro AVG_APPROX_PIXELS8_XY2 0
+cglobal avg_approx_pixels8_xy2, 4,5
     mova         m6, [pb_1]
     lea          r4, [r2*2]
     mova         m0, [r1]
@@ -449,6 +521,160 @@ cglobal avg_pixels8_xy2, 4,5
 %endmacro
 
 INIT_MMX mmxext
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
 INIT_MMX 3dnow
-AVG_PIXELS8_XY2
+AVG_APPROX_PIXELS8_XY2
+
+
+; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
+%macro SET_PIXELS_XY2 1
+%if cpuflag(sse2)
+cglobal %1_pixels16_xy2, 4,5,8
+%else
+cglobal %1_pixels8_xy2, 4,5
+%endif
+    pxor        m7, m7
+    mova        m6, [pw_2]
+    movu        m0, [r1]
+    movu        m4, [r1+1]
+    mova        m1, m0
+    mova        m5, m4
+    punpcklbw   m0, m7
+    punpcklbw   m4, m7
+    punpckhbw   m1, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m0
+    paddusw     m5, m1
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m0, [r1+r4]
+    movu        m2, [r1+r4+1]
+    mova        m1, m0
+    mova        m3, m2
+    punpcklbw   m0, m7
+    punpcklbw   m2, m7
+    punpckhbw   m1, m7
+    punpckhbw   m3, m7
+    paddusw     m0, m2
+    paddusw     m1, m3
+    paddusw     m4, m6
+    paddusw     m5, m6
+    paddusw     m4, m0
+    paddusw     m5, m1
+    psrlw       m4, 2
+    psrlw       m5, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m4, m5
+    PAVGB       m4, m3
+%else
+    packuswb    m4, m5
+%endif
+    mova   [r0+r4], m4
+    add         r4, r2
+
+    movu        m2, [r1+r4]
+    movu        m4, [r1+r4+1]
+    mova        m3, m2
+    mova        m5, m4
+    punpcklbw   m2, m7
+    punpcklbw   m4, m7
+    punpckhbw   m3, m7
+    punpckhbw   m5, m7
+    paddusw     m4, m2
+    paddusw     m5, m3
+    paddusw     m0, m6
+    paddusw     m1, m6
+    paddusw     m0, m4
+    paddusw     m1, m5
+    psrlw       m0, 2
+    psrlw       m1, 2
+%ifidn %1, avg
+    mova        m3, [r0+r4]
+    packuswb    m0, m1
+    PAVGB       m0, m3
+%else
+    packuswb    m0, m1
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+SET_PIXELS_XY2 avg
+INIT_MMX 3dnow
+SET_PIXELS_XY2 avg
+INIT_XMM sse2
+SET_PIXELS_XY2 put
+SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; sse2
+cglobal %1_pixels16_xy2, 4,5,%2
+    mova        m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+    mova        m4, [pb_interleave8]
+%endif
+    mova        m5, [pb_1]
+    movu        m0, [r1]
+    movu        m1, [r1+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    xor         r4, r4
+    add         r1, r2
+.loop:
+    movu        m2, [r1+r4]
+    movu        m3, [r1+r4+1]
+    pmaddubsw   m2, m5
+    pmaddubsw   m3, m5
+    paddusw     m0, m2
+    paddusw     m1, m3
+    pmulhrsw    m0, [pw_8192]
+    pmulhrsw    m1, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m0, m1
+    pshufb      m0, m4
+    pavgb       m0, m6
+%else
+    packuswb    m0, m1
+    pshufb      m0, m4
+%endif
+    mova   [r0+r4], m0
+    add         r4, r2
+
+    movu        m0, [r1+r4]
+    movu        m1, [r1+r4+1]
+    pmaddubsw   m0, m5
+    pmaddubsw   m1, m5
+    paddusw     m2, m0
+    paddusw     m3, m1
+    pmulhrsw    m2, [pw_8192]
+    pmulhrsw    m3, [pw_8192]
+%ifidn %1, avg
+    mova        m6, [r0+r4]
+    packuswb    m2, m3
+    pshufb      m2, m4
+    pavgb       m2, m6
+%else
+    packuswb    m2, m3
+    pshufb      m2, m4
+%endif
+    mova   [r0+r4], m2
+    add         r4, r2
+    sub        r3d, 2
+    jnz .loop
+    REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index 47b0b8b825..5fae990a4f 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -27,12 +27,27 @@ void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
 
 void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
+
 void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
 void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+                               ptrdiff_t line_size, int h);
 
 #endif /* AVCODEC_X86_HPELDSP_H */
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 1cc3bacd15..bcae22fa44 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -40,6 +40,14 @@ void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
+void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                             ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
@@ -74,10 +82,12 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                      ptrdiff_t line_size, int h);
+void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
 
 #define avg_pixels8_mmx         ff_avg_pixels8_mmx
 #define avg_pixels8_x2_mmx      ff_avg_pixels8_x2_mmx
@@ -156,32 +166,49 @@ CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
     CALL_2X_PIXELS(avg_pixels16           ## CPUEXT, ff_avg_pixels8           ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
     CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8)
+    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
+    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
 
 HPELDSP_AVG_PIXELS16(_3dnow)
 HPELDSP_AVG_PIXELS16(_mmxext)
 
 #endif /* HAVE_YASM */
 
+#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
+    if (HAVE_MMX_EXTERNAL)                                                  \
+    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;
+
+#if HAVE_MMX_INLINE
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
     do {                                                                        \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
         c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
     } while (0)
+#else
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
+    } while (0)
+#endif
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
 {
-#if HAVE_MMX_INLINE
     SET_HPEL_FUNCS(put,        [0], 16, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
     SET_HPEL_FUNCS(avg,        [0], 16, mmx);
     SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
     SET_HPEL_FUNCS(put,        [1],  8, mmx);
     SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
-    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
-#endif /* HAVE_MMX_INLINE */
+    if (HAVE_MMX_EXTERNAL) {
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx;
+    }
+#if HAVE_MMX_INLINE
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx;
+#endif
 }
 
 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
@@ -193,6 +220,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -200,6 +228,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
 
     if (!(flags & CODEC_FLAG_BITEXACT)) {
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
@@ -207,8 +236,8 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
     }
 
     if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
@@ -227,6 +256,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
     c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
     c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
 
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
@@ -234,6 +264,7 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
 
     if (!(flags & CODEC_FLAG_BITEXACT)){
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
@@ -241,8 +272,8 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow;
+        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow;
     }
 
     if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
@@ -259,11 +290,27 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
         // these functions are slower than mmx on AMD, but faster on Intel
         c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
         c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+        c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
+        c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
+        c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
         c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+        c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
+        c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
+        c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
     }
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags)
+{
+#if HAVE_SSSE3_EXTERNAL
+    c->put_pixels_tab[0][3]            = ff_put_pixels16_xy2_ssse3;
+    c->avg_pixels_tab[0][3]            = ff_avg_pixels16_xy2_ssse3;
+    c->put_pixels_tab[1][3]            = ff_put_pixels8_xy2_ssse3;
+    c->avg_pixels_tab[1][3]            = ff_avg_pixels8_xy2_ssse3;
+#endif
+}
+
 av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -279,4 +326,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
 
     if (EXTERNAL_SSE2(cpu_flags))
         hpeldsp_init_sse2(c, flags, cpu_flags);
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        hpeldsp_init_ssse3(c, flags, cpu_flags);
 }
diff --git a/libavcodec/x86/hpeldsp_mmx.c b/libavcodec/x86/hpeldsp_mmx.c
deleted file mode 100644
index c93c78e40e..0000000000
--- a/libavcodec/x86/hpeldsp_mmx.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2001 Fabrice Bellard
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "hpeldsp.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
-            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
-}
-
-#endif /* HAVE_MMX_INLINE */
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
index d854e8a2fc..c8a68fdf56 100644
--- a/libavcodec/x86/hpeldsp_rnd_template.c
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 436abc8b75..cc48556f9b 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -1,21 +1,22 @@
 ;******************************************************************************
 ;* SIMD-optimized HuffYUV functions
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Christophe Gisquet
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -33,64 +34,72 @@ SECTION_TEXT
 ; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
 ;                                     const uint8_t *diff, int w,
 ;                                     int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
-    movq    mm0, [topq]
-    movq    mm2, mm0
-    movd    mm4, [left_topq]
-    psllq   mm2, 8
-    movq    mm1, mm0
-    por     mm4, mm2
-    movd    mm3, [leftq]
-    psubb   mm0, mm4 ; t-tl
+%macro HFYU_MEDIAN 0
+cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+    movu    m0, [topq]
+    mova    m2, m0
+    movd    m4, [left_topq]
+    LSHIFT  m2, 1
+    mova    m1, m0
+    por     m4, m2
+    movd    m3, [leftq]
+    psubb   m0, m4 ; t-tl
     add    dstq, wq
     add    topq, wq
     add   diffq, wq
     neg      wq
     jmp .skip
 .loop:
-    movq    mm4, [topq+wq]
-    movq    mm0, mm4
-    psllq   mm4, 8
-    por     mm4, mm1
-    movq    mm1, mm0 ; t
-    psubb   mm0, mm4 ; t-tl
+    movu    m4, [topq+wq]
+    mova    m0, m4
+    LSHIFT  m4, 1
+    por     m4, m1
+    mova    m1, m0 ; t
+    psubb   m0, m4 ; t-tl
 .skip:
-    movq    mm2, [diffq+wq]
+    movu    m2, [diffq+wq]
 %assign i 0
-%rep 8
-    movq    mm4, mm0
-    paddb   mm4, mm3 ; t-tl+l
-    movq    mm5, mm3
-    pmaxub  mm3, mm1
-    pminub  mm5, mm1
-    pminub  mm3, mm4
-    pmaxub  mm3, mm5 ; median
-    paddb   mm3, mm2 ; +residual
+%rep mmsize
+    mova    m4, m0
+    paddb   m4, m3 ; t-tl+l
+    mova    m5, m3
+    pmaxub  m3, m1
+    pminub  m5, m1
+    pminub  m3, m4
+    pmaxub  m3, m5 ; median
+    paddb   m3, m2 ; +residual
 %if i==0
-    movq    mm7, mm3
-    psllq   mm7, 56
+    mova    m7, m3
+    LSHIFT  m7, mmsize-1
 %else
-    movq    mm6, mm3
-    psrlq   mm7, 8
-    psllq   mm6, 56
-    por     mm7, mm6
+    mova    m6, m3
+    RSHIFT  m7, 1
+    LSHIFT  m6, mmsize-1
+    por     m7, m6
 %endif
-%if i<7
-    psrlq   mm0, 8
-    psrlq   mm1, 8
-    psrlq   mm2, 8
+%if i<mmsize-1
+    RSHIFT  m0, 1
+    RSHIFT  m1, 1
+    RSHIFT  m2, 1
 %endif
 %assign i i+1
 %endrep
-    movq [dstq+wq], mm7
-    add      wq, 8
+    movu [dstq+wq], m7
+    add      wq, mmsize
     jl .loop
     movzx   r2d, byte [dstq-1]
     mov [leftq], r2d
     movzx   r2d, byte [topq-1]
     mov [left_topq], r2d
     RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmxext
+HFYU_MEDIAN
+%endif
+INIT_XMM sse2
+HFYU_MEDIAN
 
 
 %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
@@ -163,3 +172,82 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
     ADD_HFYU_LEFT_LOOP 0, 1
 .src_unaligned:
     ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov  sizeq, wq
+    and  sizeq, -2*mmsize
+    jz  .2
+    add   dstq, sizeq
+    add   srcq, sizeq
+    neg  sizeq
+.1:
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova   [dstq + sizeq], m0
+    mova   [dstq + sizeq + mmsize], m1
+    add  sizeq, 2*mmsize
+    jl .1
+.2:
+    and     wq, 2*mmsize-1
+    jz    .end
+    add   dstq, wq
+    add   srcq, wq
+    neg     wq
+.3
+    mov  sizeb, [srcq + wq]
+    add [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES
+%endif
+INIT_XMM sse2
+ADD_BYTES
+
+; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
+;                               intptr_t w, uint8_t *left)
+%macro LEFT_BGR32 0
+cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
+    shl           wq, 2
+    movd          m0, [leftq]
+    lea         dstq, [dstq + wq]
+    lea         srcq, [srcq + wq]
+    LSHIFT        m0, mmsize-4
+    neg           wq
+.loop:
+    movu          m1, [srcq+wq]
+    mova          m2, m1
+%if mmsize == 8
+    punpckhdq     m0, m0
+%endif
+    LSHIFT        m1, 4
+    paddb         m1, m2
+%if mmsize == 16
+    pshufd        m0, m0, q3333
+    mova          m2, m1
+    LSHIFT        m1, 8
+    paddb         m1, m2
+%endif
+    paddb         m0, m1
+    movu   [dstq+wq], m0
+    add           wq, mmsize
+    jl         .loop
+    movd          m0, [dstq-4]
+    movd     [leftq], m0
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+LEFT_BGR32
+%endif
+INIT_XMM sse2
+LEFT_BGR32
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 75537d7a4c..3ced3c0a1c 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,20 +25,29 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvdsp.h"
 
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
+
 void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
-                                    const uint8_t *diff, int w,
+                                    const uint8_t *diff, intptr_t w,
                                     int *left, int *left_top);
+void ff_add_hfyu_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+                                  const uint8_t *diff, intptr_t w,
+                                  int *left, int *left_top);
 
 int  ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
-                                 int w, int left);
+                                 intptr_t w, int left);
 int  ff_add_hfyu_left_pred_sse4(uint8_t *dst, const uint8_t *src,
-                                int w, int left);
+                                intptr_t w, int left);
 
-#if HAVE_INLINE_ASM
+void ff_add_hfyu_left_pred_bgr32_mmx(uint8_t *dst, const uint8_t *src,
+                                     intptr_t w, uint8_t *left);
+void ff_add_hfyu_left_pred_bgr32_sse2(uint8_t *dst, const uint8_t *src,
+                                      intptr_t w, uint8_t *left);
 
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
-                                      const uint8_t *diff, int w,
+                                      const uint8_t *diff, intptr_t w,
                                       int *left, int *left_top)
 {
     x86_reg w2 = -w;
@@ -72,56 +81,34 @@ static void add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
     *left     = l;
     *left_top = tl;
 }
-#endif /* HAVE_7REGS */
-
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
-    x86_reg i = 0;
-
-    __asm__ volatile (
-        "jmp          2f                \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %0), %%mm0         \n\t"
-        "movq   (%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, (%2, %0)      \n\t"
-        "movq  8(%1, %0), %%mm0         \n\t"
-        "movq  8(%2, %0), %%mm1         \n\t"
-        "paddb     %%mm0, %%mm1         \n\t"
-        "movq      %%mm1, 8(%2, %0)     \n\t"
-        "add         $16, %0            \n\t"
-        "2:                             \n\t"
-        "cmp          %3, %0            \n\t"
-        "js           1b                \n\t"
-        : "+r" (i)
-        : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
-    for (; i < w; i++)
-        dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */
+#endif
 
 av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_INLINE_ASM
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
     if (cpu_flags & AV_CPU_FLAG_CMOV)
         c->add_hfyu_median_pred = add_hfyu_median_pred_cmov;
-#endif /* HAVE_7REGS */
+#endif
 
-    if (INLINE_MMX(cpu_flags))
-        c->add_bytes = add_bytes_mmx;
-#endif /* HAVE_INLINE_ASM */
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
+        c->add_bytes = ff_add_bytes_mmx;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_mmx;
+    }
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
         /* slower than cmov version on AMD */
         if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
             c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
     }
 
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_bytes            = ff_add_bytes_sse2;
+        c->add_hfyu_median_pred = ff_add_hfyu_median_pred_sse2;
+        c->add_hfyu_left_pred_bgr32 = ff_add_hfyu_left_pred_bgr32_sse2;
+    }
+
     if (EXTERNAL_SSSE3(cpu_flags)) {
         c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
         if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/huffyuvencdsp_mmx.c
index 8ffaced37d..63d8e3cc73 100644
--- a/libavcodec/x86/huffyuvencdsp_mmx.c
+++ b/libavcodec/x86/huffyuvencdsp_mmx.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -31,10 +31,11 @@
 
 #if HAVE_INLINE_ASM
 
-static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
+static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
 {
     x86_reg i = 0;
 
+    if (w >= 16)
     __asm__ volatile (
         "1:                             \n\t"
         "movq  (%2, %0), %%mm0          \n\t"
diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
new file mode 100644
index 0000000000..0aa73459e2
--- /dev/null
+++ b/libavcodec/x86/idctdsp.asm
@@ -0,0 +1,183 @@
+;******************************************************************************
+;* SIMD-optimized IDCT-related routines
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2003-2013 Michael Niedermayer
+;* Copyright (c) 2013 Daniel Kang
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pb_80
+
+SECTION_TEXT
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                                  ptrdiff_t line_size)
+;--------------------------------------------------------------------------
+
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
+    mova     m1, [blockq+mmsize*0+%1]
+    mova     m2, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m3, [blockq+mmsize*4+%1]
+    mova     m4, [blockq+mmsize*6+%1]
+%endif
+    packsswb m1, [blockq+mmsize*1+%1]
+    packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packsswb m3, [blockq+mmsize*5+%1]
+    packsswb m4, [blockq+mmsize*7+%1]
+%endif
+    paddb    m1, m0
+    paddb    m2, m0
+%if mmsize == 8
+    paddb    m3, m0
+    paddb    m4, m0
+    movq     [pixelsq+lsizeq*0], m1
+    movq     [pixelsq+lsizeq*1], m2
+    movq     [pixelsq+lsizeq*2], m3
+    movq     [pixelsq+lsize3q ], m4
+%else
+    movq     [pixelsq+lsizeq*0], m1
+    movhps   [pixelsq+lsizeq*1], m1
+    movq     [pixelsq+lsizeq*2], m2
+    movhps   [pixelsq+lsize3q ], m2
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
+    mova     m0, [pb_80]
+    lea      lsize3q, [lsizeq*3]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
+    lea      pixelsq, [pixelsq+lsizeq*4]
+    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_SIGNED_PIXELS_CLAMPED 0
+INIT_XMM sse2
+PUT_SIGNED_PIXELS_CLAMPED 3
+
+;--------------------------------------------------------------------------
+; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro PUT_PIXELS_CLAMPED_HALF 1
+    mova     m0, [blockq+mmsize*0+%1]
+    mova     m1, [blockq+mmsize*2+%1]
+%if mmsize == 8
+    mova     m2, [blockq+mmsize*4+%1]
+    mova     m3, [blockq+mmsize*6+%1]
+%endif
+    packuswb m0, [blockq+mmsize*1+%1]
+    packuswb m1, [blockq+mmsize*3+%1]
+%if mmsize == 8
+    packuswb m2, [blockq+mmsize*5+%1]
+    packuswb m3, [blockq+mmsize*7+%1]
+    movq           [pixelsq], m0
+    movq    [lsizeq+pixelsq], m1
+    movq  [2*lsizeq+pixelsq], m2
+    movq   [lsize3q+pixelsq], m3
+%else
+    movq           [pixelsq], m0
+    movhps  [lsizeq+pixelsq], m0
+    movq  [2*lsizeq+pixelsq], m1
+    movhps [lsize3q+pixelsq], m1
+%endif
+%endmacro
+
+%macro PUT_PIXELS_CLAMPED 0
+cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
+    lea lsize3q, [lsizeq*3]
+    PUT_PIXELS_CLAMPED_HALF 0
+    lea pixelsq, [pixelsq+lsizeq*4]
+    PUT_PIXELS_CLAMPED_HALF 64
+    RET
+%endmacro
+
+INIT_MMX mmx
+PUT_PIXELS_CLAMPED
+INIT_XMM sse2
+PUT_PIXELS_CLAMPED
+
+;--------------------------------------------------------------------------
+; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
+;                            ptrdiff_t line_size);
+;--------------------------------------------------------------------------
+; %1 = block offset
+%macro ADD_PIXELS_CLAMPED 1
+    mova       m0, [blockq+mmsize*0+%1]
+    mova       m1, [blockq+mmsize*1+%1]
+%if mmsize == 8
+    mova       m5, [blockq+mmsize*2+%1]
+    mova       m6, [blockq+mmsize*3+%1]
+%endif
+    movq       m2, [pixelsq]
+    movq       m3, [pixelsq+lsizeq]
+%if mmsize == 8
+    mova       m7, m2
+    punpcklbw  m2, m4
+    punpckhbw  m7, m4
+    paddsw     m0, m2
+    paddsw     m1, m7
+    mova       m7, m3
+    punpcklbw  m3, m4
+    punpckhbw  m7, m4
+    paddsw     m5, m3
+    paddsw     m6, m7
+%else
+    punpcklbw  m2, m4
+    punpcklbw  m3, m4
+    paddsw     m0, m2
+    paddsw     m1, m3
+%endif
+    packuswb   m0, m1
+%if mmsize == 8
+    packuswb   m5, m6
+    movq       [pixelsq], m0
+    movq       [pixelsq+lsizeq], m5
+%else
+    movq       [pixelsq], m0
+    movhps     [pixelsq+lsizeq], m0
+%endif
+%endmacro
+
+%macro ADD_PIXELS_CLAMPED 0
+cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
+    pxor       m4, m4
+    ADD_PIXELS_CLAMPED 0
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 32
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 64
+    lea        pixelsq, [pixelsq+lsizeq*2]
+    ADD_PIXELS_CLAMPED 96
+    RET
+%endmacro
+
+INIT_MMX mmx
+ADD_PIXELS_CLAMPED
+INIT_XMM sse2
+ADD_PIXELS_CLAMPED
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
index 22df3dd758..daa4e798ed 100644
--- a/libavcodec/x86/idctdsp.h
+++ b/libavcodec/x86/idctdsp.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -20,12 +20,19 @@
 #define AVCODEC_X86_IDCTDSP_H
 
 #include <stdint.h>
+#include <stddef.h>
 
 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
+                               ptrdiff_t line_size);
+void ff_add_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
+                               ptrdiff_t line_size);
+void ff_put_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                ptrdiff_t line_size);
 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size);
+                                      ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+                                       ptrdiff_t line_size);
 
 #endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 853c6a3661..2c26a98850 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -64,12 +64,10 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
-        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
-        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
-
         if (!high_bit_depth &&
+            avctx->lowres == 0 &&
             (avctx->idct_algo == FF_IDCT_AUTO ||
+             avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
              avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct_put  = ff_simple_idct_put_mmx;
                 c->idct_add  = ff_simple_idct_add_mmx;
@@ -77,4 +75,14 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
         }
     }
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
+        c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_sse2;
+    }
 }
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
deleted file mode 100644
index 7285b1d357..0000000000
--- a/libavcodec/x86/idctdsp_mmx.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * SIMD-optimized IDCT-related routines
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "idctdsp.h"
-#include "inline_asm.h"
-
-#if HAVE_INLINE_ASM
-
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-        : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile (
-            "movq        (%2), %%mm0    \n\t"
-            "movq       8(%2), %%mm1    \n\t"
-            "movq      16(%2), %%mm2    \n\t"
-            "movq      24(%2), %%mm3    \n\t"
-            "movq          %0, %%mm4    \n\t"
-            "movq          %1, %%mm6    \n\t"
-            "movq       %%mm4, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm4    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm4, %%mm0    \n\t"
-            "paddsw     %%mm5, %%mm1    \n\t"
-            "movq       %%mm6, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm6    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm6, %%mm2    \n\t"
-            "paddsw     %%mm5, %%mm3    \n\t"
-            "packuswb   %%mm1, %%mm0    \n\t"
-            "packuswb   %%mm3, %%mm2    \n\t"
-            "movq       %%mm0, %0       \n\t"
-            "movq       %%mm2, %1       \n\t"
-            : "+m" (*pix), "+m" (*(pix + line_size))
-            : "r" (p)
-            : "memory");
-        pix += line_size * 2;
-        p   += 16;
-    } while (--i);
-}
-
-#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index 633fcd9d59..ce30b42103 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -2,20 +2,20 @@
 ;* 36 point SSE-optimized IMDCT transform
 ;* Copyright (c) 2011 Vitor Sessak
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -50,7 +50,7 @@ ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
                dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
                dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
                dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
-               dd 1.0,  0.70710678118654752439,  0.0,  0.0
+               dd 1.0, -0.70710678118654752439,  0.0,  0.0
 
 costabs:  times 4 dd  0.98480773
           times 4 dd  0.93969262
@@ -129,6 +129,19 @@ SECTION_TEXT
 %endif
 %endmacro
 
+%macro BUTTERF2 3
+%if cpuflag(sse3)
+    mulps    %1, %1, [ps_cosh_sse3 + %3]
+    PSHUFD   %2, %1, 0xe1
+    addsubps %1, %1, %2
+%else
+    mulps    %1, [ps_cosh + %3]
+    PSHUFD   %2, %1, 0xe1
+    xorps    %1, [ps_p1m1p1m1]
+    addps    %1, %2
+%endif
+%endmacro
+
 %macro STORE 4
     movhlps %2, %1
     movss   [%3       ], %1
@@ -279,11 +292,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     BUTTERF  m7, m2, 16
     BUTTERF  m3, m6, 32
     BUTTERF  m4, m1, 48
-
-    mulps   m5, m5, [ps_cosh + 64]
-    PSHUFD  m1, m5, 0xe1
-    xorps   m5, m5, [ps_p1m1p1m1]
-    addps   m5, m5, m1
+    BUTTERF2 m5, m1, 64
 
     ; permutates:
     ; m0    0  1  2  3     =>     2  6 10 14   m1
@@ -358,8 +367,10 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse2
 DEFINE_IMDCT
@@ -370,8 +381,10 @@ DEFINE_IMDCT
 INIT_XMM ssse3
 DEFINE_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_IMDCT
+%endif
 
 INIT_XMM sse
 
@@ -716,5 +729,7 @@ cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
 INIT_XMM sse
 DEFINE_FOUR_IMDCT
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 DEFINE_FOUR_IMDCT
+%endif
diff --git a/libavcodec/x86/inline_asm.h b/libavcodec/x86/inline_asm.h
index e4affabc87..3e65a76973 100644
--- a/libavcodec/x86/inline_asm.h
+++ b/libavcodec/x86/inline_asm.h
@@ -1,20 +1,20 @@
 /*
  * inline assembly helper macros
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -37,7 +37,7 @@
         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
 
 #ifndef PIC
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_pw_2))
 #else
 // for shared library it's better to use this way for accessing constants
 // pcmpeqd -> -1
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/lossless_audiodsp.asm
index d721ebda6b..64b769f7d4 100644
--- a/libavcodec/x86/apedsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -1,20 +1,20 @@
 ;******************************************************************************
 ;* Copyright (c) 2008 Loren Merritt
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -58,14 +58,7 @@ cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     mova    [v1q + orderq + mmsize], m3
     add     orderq, mmsize*2
     jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
 %endmacro
@@ -159,9 +152,6 @@ SCALARPRODUCT_LOOP 4
 SCALARPRODUCT_LOOP 2
 SCALARPRODUCT_LOOP 0
 .end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
+    HADDD   m6, m0
     movd   eax, m6
     RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/lossless_audiodsp_init.c
index f692c2b9b6..4879dff1de 100644
--- a/libavcodec/x86/apedsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -1,25 +1,25 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
 
 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                                const int16_t *v3,
@@ -31,7 +31,7 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
 
-av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
new file mode 100644
index 0000000000..e6c23e7985
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -0,0 +1,294 @@
+;******************************************************************************
+;* SIMD lossless video DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2014 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_ef: times 8 db 14,15
+pb_67: times 8 db  6, 7
+pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
+pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7
+
+SECTION_TEXT
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+    movd    m4, maskd
+    SPLATW  m4, m4
+    add     wd, wd
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop
+    push  tmpq
+%%.wordloop:
+    sub     wq, 2
+%ifidn %2, add
+    mov   tmpw, [srcq+wq]
+    add   tmpw, [dstq+wq]
+%else
+    mov   tmpw, [src1q+wq]
+    sub   tmpw, [src2q+wq]
+%endif
+    and   tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+    pop   tmpq
+%%.tomainloop:
+%ifidn %2, add
+    add     srcq, wq
+%else
+    add     src1q, wq
+    add     src2q, wq
+%endif
+    add     dstq, wq
+    neg     wq
+    jz      %%.end
+%%.loop:
+%ifidn %2, add
+    mov%1   m0, [srcq+wq]
+    mov%1   m1, [dstq+wq]
+    mov%1   m2, [srcq+wq+mmsize]
+    mov%1   m3, [dstq+wq+mmsize]
+%else
+    mov%1   m0, [src1q+wq]
+    mov%1   m1, [src2q+wq]
+    mov%1   m2, [src1q+wq+mmsize]
+    mov%1   m3, [src2q+wq+mmsize]
+%endif
+    p%2w    m0, m1
+    p%2w    m2, m3
+    pand    m0, m4
+    pand    m2, m4
+    mov%1   [dstq+wq]       , m0
+    mov%1   [dstq+wq+mmsize], m2
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
+
+INIT_MMX mmx
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+    INT16_LOOP a, add
+
+INIT_XMM sse2
+cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
+    test srcq, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, add
+.unaligned:
+    INT16_LOOP u, add
+
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+    INT16_LOOP a, sub
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+    test src1q, mmsize-1
+    jnz .unaligned
+    test src2q, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, sub
+.unaligned:
+    INT16_LOOP u, sub
+
+
+%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
+    add     wd, wd
+    add     srcq, wq
+    add     dstq, wq
+    neg     wq
+%%.loop:
+    mov%2   m1, [srcq+wq]
+    mova    m2, m1
+    pslld   m1, 16
+    paddw   m1, m2
+    mova    m2, m1
+
+    pshufb  m1, m3
+    paddw   m1, m2
+    pshufb  m0, m5
+%if mmsize == 16
+    mova    m2, m1
+    pshufb  m1, m4
+    paddw   m1, m2
+%endif
+    paddw   m0, m1
+    pand    m0, m7
+%ifidn %1, a
+    mova    [dstq+wq], m0
+%else
+    movq    [dstq+wq], m0
+    movhps  [dstq+wq+8], m0
+%endif
+    add     wq, mmsize
+    jl %%.loop
+    mov     eax, mmsize-1
+    sub     eax, wd
+    mov     wd, eax
+    shl     wd, 8
+    lea     eax, [wd+eax-1]
+    movd    m1, eax
+    pshufb  m0, m1
+    movd    eax, m0
+    RET
+%endmacro
+
+; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
+INIT_MMX ssse3
+cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
+.skip_prologue:
+    mova    m5, [pb_67]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    psllq   m0, 48
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+
+INIT_XMM sse4
+cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
+    mova    m5, [pb_ef]
+    mova    m4, [pb_zzzzzzzz67676767]
+    mova    m3, [pb_zzzz2323zzzzabab]
+    movd    m0, leftm
+    pslldq  m0, 14
+    movd    m7, maskm
+    SPLATW  m7 ,m7
+    test    srcq, 15
+    jnz .src_unaligned
+    test    dstq, 15
+    jnz .dst_unaligned
+    ADD_HFYU_LEFT_LOOP_INT16 a, a
+.dst_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, a
+.src_unaligned:
+    ADD_HFYU_LEFT_LOOP_INT16 u, u
+
+; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int mask, int w, int *left, int *left_top)
+INIT_MMX mmxext
+cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
+    add      wd, wd
+    movd    mm6, maskd
+    SPLATW  mm6, mm6
+    movq    mm0, [topq]
+    movq    mm2, mm0
+    movd    mm4, [left_topq]
+    psllq   mm2, 16
+    movq    mm1, mm0
+    por     mm4, mm2
+    movd    mm3, [leftq]
+    psubw   mm0, mm4 ; t-tl
+    add    dstq, wq
+    add    topq, wq
+    add   diffq, wq
+    neg      wq
+    jmp .skip
+.loop:
+    movq    mm4, [topq+wq]
+    movq    mm0, mm4
+    psllq   mm4, 16
+    por     mm4, mm1
+    movq    mm1, mm0 ; t
+    psubw   mm0, mm4 ; t-tl
+.skip:
+    movq    mm2, [diffq+wq]
+%assign i 0
+%rep 4
+    movq    mm4, mm0
+    paddw   mm4, mm3 ; t-tl+l
+    pand    mm4, mm6
+    movq    mm5, mm3
+    pmaxsw  mm3, mm1
+    pminsw  mm5, mm1
+    pminsw  mm3, mm4
+    pmaxsw  mm3, mm5 ; median
+    paddw   mm3, mm2 ; +residual
+    pand    mm3, mm6
+%if i==0
+    movq    mm7, mm3
+    psllq   mm7, 48
+%else
+    movq    mm4, mm3
+    psrlq   mm7, 16
+    psllq   mm4, 48
+    por     mm7, mm4
+%endif
+%if i<3
+    psrlq   mm0, 16
+    psrlq   mm1, 16
+    psrlq   mm2, 16
+%endif
+%assign i i+1
+%endrep
+    movq [dstq+wq], mm7
+    add      wq, 8
+    jl .loop
+    movzx   r2d, word [dstq-2]
+    mov [leftq], r2d
+    movzx   r2d, word [topq-2]
+    mov [left_topq], r2d
+    RET
+
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+    add      wd, wd
+    movd    mm7, maskd
+    SPLATW  mm7, mm7
+    movq    mm0, [src1q]
+    movq    mm2, [src2q]
+    psllq   mm0, 16
+    psllq   mm2, 16
+    movd    mm6, [left_topq]
+    por     mm0, mm6
+    movd    mm6, [leftq]
+    por     mm2, mm6
+    xor     maskq, maskq
+.loop:
+    movq    mm1, [src1q + maskq]
+    movq    mm3, [src2q + maskq]
+    movq    mm4, mm2
+    psubw   mm2, mm0
+    paddw   mm2, mm1
+    pand    mm2, mm7
+    movq    mm5, mm4
+    pmaxsw  mm4, mm1
+    pminsw  mm1, mm5
+    pminsw  mm4, mm2
+    pmaxsw  mm4, mm1
+    psubw   mm3, mm4
+    pand    mm3, mm7
+    movq    [dstq + maskq], mm3
+    add     maskq, 8
+    movq    mm0, [src1q + maskq - 2]
+    movq    mm2, [src2q + maskq - 2]
+    cmp     maskq, wq
+        jb .loop
+    movzx maskd, word [src1q + wq - 2]
+    mov [left_topq], maskd
+    movzx maskd, word [src2q + wq - 2]
+    mov [leftq], maskd
+    RET
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
new file mode 100644
index 0000000000..6589024a1a
--- /dev/null
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -0,0 +1,62 @@
+/*
+ * Lossless video DSP utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../lossless_videodsp.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
+int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
+int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
+void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
+
+
+void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->add_int16 = ff_add_int16_mmx;
+        c->diff_int16 = ff_diff_int16_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc->comp[0].depth_minus1<15) {
+        c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->add_int16 = ff_add_int16_sse2;
+        c->diff_int16 = ff_diff_int16_sse2;
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_ssse3;
+    }
+
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        c->add_hfyu_left_pred_int16 = ff_add_hfyu_left_pred_int16_sse4;
+    }
+}
diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c
index ea5d2eab56..3a9493f728 100644
--- a/libavcodec/x86/lpc.c
+++ b/libavcodec/x86/lpc.c
@@ -2,26 +2,25 @@
  * SIMD-optimized LPC functions
  * Copyright (c) 2007 Loren Merritt
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -73,6 +72,7 @@ static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
         "3:                                    \n\t"
         :"+&r"(i), "+&r"(j)
         :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
+         NAMED_CONSTRAINTS_ARRAY_ADD(pd_1,pd_2)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                     "%xmm5", "%xmm6", "%xmm7")
     );
@@ -117,6 +117,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm2, 16(%1)           \n\t"
                 :"+&r"(i)
                 :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
                 :"memory"
             );
         } else {
@@ -140,6 +141,7 @@ static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
                 "movsd     %%xmm1, %2               \n\t"
                 :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                 :"r"(data+len), "r"(data+len-j)
+                 NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
             );
         }
     }
@@ -152,7 +154,7 @@ av_cold void ff_lpc_init_x86(LPCContext *c)
 #if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
         c->lpc_compute_autocorr   = lpc_compute_autocorr_sse2;
     }
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
index a62094ee97..1cca05d658 100644
--- a/libavcodec/x86/mathops.h
+++ b/libavcodec/x86/mathops.h
@@ -2,20 +2,20 @@
  * simple math operations
  * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 
 #include "config.h"
 #include "libavutil/common.h"
+#include "libavutil/x86/asm.h"
 
 #if HAVE_INLINE_ASM
 
@@ -88,6 +89,7 @@ static inline av_const int mid_pred(int a, int b, int c)
     return i;
 }
 
+#if HAVE_6REGS
 #define COPY3_IF_LT(x, y, a, b, c, d)\
 __asm__ volatile(\
     "cmpl  %0, %3       \n\t"\
@@ -97,10 +99,11 @@ __asm__ volatile(\
     : "+&r" (x), "+&r" (a), "+r" (c)\
     : "r" (y), "r" (b), "r" (d)\
 );
+#endif /* HAVE_6REGS */
 #endif /* HAVE_I686 */
 
 #define MASK_ABS(mask, level)                   \
-    __asm__ ("cltd                   \n\t"      \
+    __asm__ ("cdq                    \n\t"      \
              "xorl %1, %0            \n\t"      \
              "subl %1, %0            \n\t"      \
              : "+a"(level), "=&d"(mask))
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 1a87f37b39..95b99c48fd 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -4,25 +4,30 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+cextern pb_1
+cextern pb_80
+
 SECTION .text
 
 %macro DIFF_PIXELS_1 4
@@ -274,19 +279,27 @@ INIT_XMM ssse3
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 HADAMARD8_DIFF 9
 
-INIT_XMM sse2
-; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-;                   int line_size, int h);
-cglobal sse16, 5, 5, 8
-    shr      r4d, 1
+; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;               int line_size, int h)
+
+%macro SUM_SQUARED_ERRORS 1
+cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
+%if %1 == mmsize
+    shr       hd, 1
+%endif
     pxor      m0, m0         ; mm0 = 0
     pxor      m7, m7         ; mm7 holds the sum
 
 .next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
-    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
-    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
-    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
-    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]
+    movu      m1, [pix1q]    ; m1 = pix1[0][0-15], [0-7] for mmx
+    movu      m2, [pix2q]    ; m2 = pix2[0][0-15], [0-7] for mmx
+%if %1 == mmsize
+    movu      m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
+    movu      m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
+%else  ; %1 / 2 == mmsize; mmx only
+    mova      m3, [pix1q+8]  ; m3 = pix1[0][8-15]
+    mova      m4, [pix2q+8]  ; m4 = pix2[0][8-15]
+%endif
 
     ; todo: mm1-mm2, mm3-mm4
     ; algo: subtract mm1 from mm2 with saturation and vice versa
@@ -315,22 +328,608 @@ cglobal sse16, 5, 5, 8
     pmaddwd   m1, m1
     pmaddwd   m3, m3
 
-    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
-    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size
-
     paddd     m1, m2
     paddd     m3, m4
     paddd     m7, m1
     paddd     m7, m3
 
-    dec       r4
+%if %1 == mmsize
+    lea    pix1q, [pix1q + 2*lsizeq]
+    lea    pix2q, [pix2q + 2*lsizeq]
+%else
+    add    pix1q, lsizeq
+    add    pix2q, lsizeq
+%endif
+    dec       hd
     jnz .next2lines
 
-    mova      m1, m7
-    psrldq    m7, 8          ; shift hi qword to lo
-    paddd     m7, m1
-    mova      m1, m7
-    psrldq    m7, 4          ; shift hi dword to lo
-    paddd     m7, m1
+    HADDD     m7, m1
     movd     eax, m7         ; return value
     RET
+%endmacro
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 8
+
+INIT_MMX mmx
+SUM_SQUARED_ERRORS 16
+
+INIT_XMM sse2
+SUM_SQUARED_ERRORS 16
+
+;-----------------------------------------------
+;int ff_sum_abs_dctelem(int16_t *block)
+;-----------------------------------------------
+; %1 = number of xmm registers used
+; %2 = number of inline loops
+
+%macro SUM_ABS_DCTELEM 2
+cglobal sum_abs_dctelem, 1, 1, %1, block
+    pxor    m0, m0
+    pxor    m1, m1
+%assign %%i 0
+%rep %2
+    mova      m2, [blockq+mmsize*(0+%%i)]
+    mova      m3, [blockq+mmsize*(1+%%i)]
+    mova      m4, [blockq+mmsize*(2+%%i)]
+    mova      m5, [blockq+mmsize*(3+%%i)]
+    ABS1_SUM  m2, m6, m0
+    ABS1_SUM  m3, m6, m1
+    ABS1_SUM  m4, m6, m0
+    ABS1_SUM  m5, m6, m1
+%assign %%i %%i+4
+%endrep
+    paddusw m0, m1
+    HSUM    m0, m1, eax
+    and     eax, 0xFFFF
+    RET
+%endmacro
+
+INIT_MMX mmx
+SUM_ABS_DCTELEM 0, 4
+INIT_MMX mmxext
+SUM_ABS_DCTELEM 0, 4
+INIT_XMM sse2
+SUM_ABS_DCTELEM 7, 2
+INIT_XMM ssse3
+SUM_ABS_DCTELEM 6, 2
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+    mova      m%2, [pix1q]
+%if %1 == 8
+    mova      m%3, m%2
+    psllq     m%2, 8
+    psrlq     m%3, 8
+    psrlq     m%2, 8
+%else
+    mova      m%3, [pix1q+1]
+%endif
+    mova      m%4, m%2
+    mova      m%5, m%3
+    punpcklbw m%2, m7
+    punpcklbw m%3, m7
+    punpckhbw m%4, m7
+    punpckhbw m%5, m7
+    psubw     m%2, m%3
+    psubw     m%4, m%5
+%endmacro
+
+; %1-2 = m#
+%macro HF_NOISE_PART2 4
+    psubw     m%1, m%3
+    psubw     m%2, m%4
+    pxor       m3, m3
+    pxor       m1, m1
+    pcmpgtw    m3, m%1
+    pcmpgtw    m1, m%2
+    pxor      m%1, m3
+    pxor      m%2, m1
+    psubw     m%1, m3
+    psubw     m%2, m1
+    paddw     m%2, m%1
+    paddw      m6, m%2
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+    movsxdifnidn lsizeq, lsized
+    sub        hd, 2
+    pxor       m7, m7
+    pxor       m6, m6
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    add     pix1q, lsizeq
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+.loop:
+    HF_NOISE_PART1 %1, 0, 1, 2, 3
+    HF_NOISE_PART2     4, 5, 0, 2
+    add     pix1q, lsizeq
+    HF_NOISE_PART1 %1, 4, 1, 5, 3
+    HF_NOISE_PART2     0, 2, 4, 5
+    add     pix1q, lsizeq
+    sub        hd, 2
+        jne .loop
+
+    mova       m0, m6
+    punpcklwd  m0, m7
+    punpckhwd  m6, m7
+    paddd      m6, m0
+    mova       m0, m6
+    psrlq      m6, 32
+    paddd      m0, m6
+    movd      eax, m0   ; eax = result of hf_noise8;
+    REP_RET                 ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
+
+;---------------------------------------------------------------------------------------
+;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
+;---------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD 1
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
+    movu      m2, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m2, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+strideq*2]
+    lea    pix2q, [pix2q+strideq*2]
+    movu      m0, [pix2q]
+    movu      m1, [pix2q+strideq]
+    psadbw    m0, [pix1q]
+    psadbw    m1, [pix1q+strideq]
+    paddw     m2, m0
+    paddw     m2, m1
+%if %1 != mmsize
+    movu      m0, [pix2q+8]
+    movu      m1, [pix2q+strideq+8]
+    psadbw    m0, [pix1q+8]
+    psadbw    m1, [pix1q+strideq+8]
+    paddw     m2, m0
+    paddw     m2, m1
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m0, m2
+    paddw     m2, m0
+%endif
+    movd     eax, m2
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_X2 1
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
+    movu      m0, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m0, m3
+    pavgb     m2, m4
+%else
+    pavgb     m0, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m0, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m2
+%if %1 != mmsize
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m1, [pix2q]
+    movu      m2, [pix2q+strideq]
+%if mmsize == 16
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+strideq+1]
+    pavgb     m1, m3
+    pavgb     m2, m4
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m2, [pix2q+strideq+1]
+%endif
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+%if %1 != mmsize
+    movu      m1, [pix2q+8]
+    movu      m2, [pix2q+strideq+8]
+    pavgb     m1, [pix2q+9]
+    pavgb     m2, [pix2q+strideq+9]
+    psadbw    m1, [pix1q+8]
+    psadbw    m2, [pix1q+strideq+8]
+    paddw     m0, m1
+    paddw     m0, m2
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
+
+;------------------------------------------------------------------------------------------
+;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
+;------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_Y2 1
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
+    movu      m1, [pix2q]
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+    pavgb     m1, m0
+    pavgb     m0, m3
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m4, [pix2q+8]
+    movu      m5, [pix2q+strideq+8]
+    movu      m6, [pix2q+2*strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    add    pix2q, strideq
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int stride, int h);
+;-------------------------------------------------------------------------------------------
+;%1 = 8/16
+%macro SAD_APPROX_XY2 1
+cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
+    mova      m4, [pb_1]
+    movu      m1, [pix2q]
+    movu      m0, [pix2q+strideq]
+    movu      m3, [pix2q+2*strideq]
+%if mmsize == 16
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    movu      m2, [pix2q+2*strideq+1]
+    pavgb     m1, m5
+    pavgb     m0, m6
+    pavgb     m3, m2
+%else
+    pavgb     m1, [pix2q+1]
+    pavgb     m0, [pix2q+strideq+1]
+    pavgb     m3, [pix2q+2*strideq+1]
+%endif
+    psubusb   m0, m4
+    pavgb     m1, m0
+    pavgb     m0, m3
+    psadbw    m1, [pix1q]
+    psadbw    m0, [pix1q+strideq]
+    paddw     m0, m1
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    movu      m7, [pix2q+2*strideq+8]
+    pavgb     m5, [pix2q+1+8]
+    pavgb     m6, [pix2q+strideq+1+8]
+    pavgb     m7, [pix2q+2*strideq+1+8]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    add    pix2q, strideq
+    sub       hd, 2
+
+align 16
+.loop:
+    lea    pix1q, [pix1q+2*strideq]
+    lea    pix2q, [pix2q+2*strideq]
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+%if mmsize == 16
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    pavgb     m2, m5
+    pavgb     m3, m6
+%else
+    pavgb     m2, [pix2q+1]
+    pavgb     m3, [pix2q+strideq+1]
+%endif
+    psubusb   m2, m4
+    pavgb     m1, m2
+    pavgb     m2, m3
+    psadbw    m1, [pix1q]
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3
+%if %1 != mmsize
+    movu      m6, [pix2q+8]
+    movu      m7, [pix2q+strideq+8]
+    pavgb     m6, [pix2q+8+1]
+    pavgb     m7, [pix2q+strideq+8+1]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    sub       hd, 2
+    jg .loop
+%if mmsize == 16
+    movhlps   m1, m0
+    paddw     m0, m1
+%endif
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_APPROX_XY2 8
+SAD_APPROX_XY2 16
+INIT_XMM sse2
+SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                  int line_size, int h);
+;--------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+    mova      m0, [pix1q]
+%if %1 == mmsize
+    mova      m2, [pix1q+lsizeq]
+    psadbw    m0, m2
+%else
+    mova      m2, [pix1q+lsizeq]
+    mova      m3, [pix1q+8]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m0, m2
+    psadbw    m3, m4
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+
+.loop
+    lea    pix1q, [pix1q + 2*lsizeq]
+%if %1 == mmsize
+    mova      m1, [pix1q]
+    psadbw    m2, m1
+    paddw     m0, m2
+    mova      m2, [pix1q+lsizeq]
+    psadbw    m1, m2
+    paddw     m0, m1
+%else
+    mova      m1, [pix1q]
+    mova      m3, [pix1q+8]
+    psadbw    m2, m1
+    psadbw    m4, m3
+    paddw     m0, m2
+    paddw     m0, m4
+    mova      m2, [pix1q+lsizeq]
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m1, m2
+    psadbw    m3, m4
+    paddw     m0, m1
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+    jg     .loop
+
+%if mmsize == 16
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
+
+;---------------------------------------------------------------------
+;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                   int line_size, int h);
+;---------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+    mova   m1, [pb_80]
+    mova   m0, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+    mova   m4, [pix1q+lsizeq]
+%if mmsize == 16
+    movu   m3, [pix2q]
+    movu   m2, [pix2q+lsizeq]
+    psubb  m0, m3
+    psubb  m4, m2
+%else
+    psubb  m0, [pix2q]
+    psubb  m4, [pix2q+lsizeq]
+%endif
+    pxor   m0, m1
+    pxor   m4, m1
+    psadbw m0, m4
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m0, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m0, m1
+    pxor   m3, m1
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m0, m4
+    psadbw m3, m5
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+
+.loop
+    lea pix1q, [pix1q + 2*lsizeq]
+    lea pix2q, [pix2q + 2*lsizeq]
+    mova   m2, [pix1q]
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+    movu   m3, [pix2q]
+    psubb  m2, m3
+%else
+    psubb  m2, [pix2q]
+%endif
+    pxor   m2, m1
+    psadbw m4, m2
+    paddw  m0, m4
+    mova   m4, [pix1q+lsizeq]
+    movu   m3, [pix2q+lsizeq]
+    psubb  m4, m3
+    pxor   m4, m1
+    psadbw m2, m4
+    paddw  m0, m2
+%else ; vsad16_mmxext
+    mova   m3, [pix1q+8]
+    psubb  m2, [pix2q]
+    psubb  m3, [pix2q+8]
+    pxor   m2, m1
+    pxor   m3, m1
+    psadbw m4, m2
+    psadbw m5, m3
+    paddw  m0, m4
+    paddw  m0, m5
+    mova   m4, [pix1q+lsizeq]
+    mova   m5, [pix1q+lsizeq+8]
+    psubb  m4, [pix2q+lsizeq]
+    psubb  m5, [pix2q+lsizeq+8]
+    pxor   m4, m1
+    pxor   m5, m1
+    psadbw m2, m4
+    psadbw m3, m5
+    paddw  m0, m2
+    paddw  m0, m3
+%endif
+    sub    hd, 2
+    jg  .loop
+
+%if mmsize == 16
+    pshufd m1, m0, 0xe
+    paddd  m0, m1
+%endif
+    movd  eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index e93b67b053..6dc59f5aa4 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -5,20 +5,20 @@
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -29,382 +29,67 @@
 #include "libavcodec/me_cmp.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_INLINE_ASM
-
-static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                    int line_size, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl         %4, %%ecx          \n"
-        "shr          $1, %%ecx          \n"
-        "pxor      %%mm0, %%mm0          \n" /* mm0 = 0 */
-        "pxor      %%mm7, %%mm7          \n" /* mm7 holds the sum */
-        "1:                              \n"
-        "movq       (%0), %%mm1          \n" /* mm1 = pix1[0][0 - 7] */
-        "movq       (%1), %%mm2          \n" /* mm2 = pix2[0][0 - 7] */
-        "movq   (%0, %3), %%mm3          \n" /* mm3 = pix1[1][0 - 7] */
-        "movq   (%1, %3), %%mm4          \n" /* mm4 = pix2[1][0 - 7] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq      %%mm1, %%mm5          \n"
-        "movq      %%mm3, %%mm6          \n"
-        "psubusb   %%mm2, %%mm1          \n"
-        "psubusb   %%mm4, %%mm3          \n"
-        "psubusb   %%mm5, %%mm2          \n"
-        "psubusb   %%mm6, %%mm4          \n"
-
-        "por       %%mm1, %%mm2          \n"
-        "por       %%mm3, %%mm4          \n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq      %%mm2, %%mm1          \n"
-        "movq      %%mm4, %%mm3          \n"
-
-        "punpckhbw %%mm0, %%mm2          \n"
-        "punpckhbw %%mm0, %%mm4          \n"
-        "punpcklbw %%mm0, %%mm1          \n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3          \n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd   %%mm2, %%mm2          \n"
-        "pmaddwd   %%mm4, %%mm4          \n"
-        "pmaddwd   %%mm1, %%mm1          \n"
-        "pmaddwd   %%mm3, %%mm3          \n"
-
-        "lea (%0, %3, 2), %0             \n" /* pix1 += 2 * line_size */
-        "lea (%1, %3, 2), %1             \n" /* pix2 += 2 * line_size */
-
-        "paddd     %%mm2, %%mm1          \n"
-        "paddd     %%mm4, %%mm3          \n"
-        "paddd     %%mm1, %%mm7          \n"
-        "paddd     %%mm3, %%mm7          \n"
-
-        "decl      %%ecx                 \n"
-        "jnz       1b                    \n"
-
-        "movq      %%mm7, %%mm1          \n"
-        "psrlq       $32, %%mm7          \n" /* shift hi dword to lo */
-        "paddd     %%mm7, %%mm1          \n"
-        "movd      %%mm1, %2             \n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                     int line_size, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm0, %%mm0\n"    /* mm0 = 0 */
-        "pxor %%mm7, %%mm7\n"    /* mm7 holds the sum */
-        "1:\n"
-        "movq (%0), %%mm1\n"     /* mm1 = pix1[0 -  7] */
-        "movq (%1), %%mm2\n"     /* mm2 = pix2[0 -  7] */
-        "movq 8(%0), %%mm3\n"    /* mm3 = pix1[8 - 15] */
-        "movq 8(%1), %%mm4\n"    /* mm4 = pix2[8 - 15] */
-
-        /* todo: mm1-mm2, mm3-mm4 */
-        /* algo: subtract mm1 from mm2 with saturation and vice versa */
-        /*       OR the results to get absolute difference */
-        "movq %%mm1, %%mm5\n"
-        "movq %%mm3, %%mm6\n"
-        "psubusb %%mm2, %%mm1\n"
-        "psubusb %%mm4, %%mm3\n"
-        "psubusb %%mm5, %%mm2\n"
-        "psubusb %%mm6, %%mm4\n"
-
-        "por %%mm1, %%mm2\n"
-        "por %%mm3, %%mm4\n"
-
-        /* now convert to 16-bit vectors so we can square them */
-        "movq %%mm2, %%mm1\n"
-        "movq %%mm4, %%mm3\n"
-
-        "punpckhbw %%mm0, %%mm2\n"
-        "punpckhbw %%mm0, %%mm4\n"
-        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
-        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
-
-        "pmaddwd %%mm2, %%mm2\n"
-        "pmaddwd %%mm4, %%mm4\n"
-        "pmaddwd %%mm1, %%mm1\n"
-        "pmaddwd %%mm3, %%mm3\n"
-
-        "add %3, %0\n"
-        "add %3, %1\n"
-
-        "paddd %%mm2, %%mm1\n"
-        "paddd %%mm4, %%mm3\n"
-        "paddd %%mm1, %%mm7\n"
-        "paddd %%mm3, %%mm7\n"
-
-        "decl %%ecx\n"
-        "jnz 1b\n"
-
-        "movq %%mm7, %%mm1\n"
-        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
-        "paddd %%mm7, %%mm1\n"
-        "movd %%mm1, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
-{
-    int tmp;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq %%mm0, %%mm1\n"
-        "psllq $8, %%mm0\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm0\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor  %%mm3, %%mm3\n"
-        "pxor  %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor  %%mm3, %%mm4\n"
-        "pxor  %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq      %%mm4, %%mm1\n"
-        "psllq $8, %%mm4\n"
-        "psrlq $8, %%mm1\n"
-        "psrlq $8, %%mm4\n"
-        "movq      %%mm4, %%mm5\n"
-        "movq      %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw     %%mm1, %%mm4\n"
-        "psubw     %%mm3, %%mm5\n"
-        "psubw     %%mm4, %%mm0\n"
-        "psubw     %%mm5, %%mm2\n"
-        "pxor      %%mm3, %%mm3\n"
-        "pxor      %%mm1, %%mm1\n"
-        "pcmpgtw   %%mm0, %%mm3\n\t"
-        "pcmpgtw   %%mm2, %%mm1\n\t"
-        "pxor      %%mm3, %%mm0\n"
-        "pxor      %%mm1, %%mm2\n"
-        "psubw     %%mm3, %%mm0\n"
-        "psubw     %%mm1, %%mm2\n"
-        "paddw     %%mm0, %%mm2\n"
-        "paddw     %%mm2, %%mm6\n"
-
-        "add  %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq      %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd     %%mm0, %%mm6\n"
-
-        "movq  %%mm6, %%mm0\n"
-        "psrlq $32,   %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd  %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "g" (h - 2)
-        : "%ecx");
-
-    return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
-{
-    int tmp;
-    uint8_t *pix = pix1;
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm7, %%mm7\n"
-        "pxor %%mm6, %%mm6\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "1:\n"
-
-        "movq (%0), %%mm0\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm0, %%mm2\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm0\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm2\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm0\n"
-        "psubw %%mm3, %%mm2\n"
-        "psubw %%mm0, %%mm4\n"
-        "psubw %%mm2, %%mm5\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm4, %%mm3\n\t"
-        "pcmpgtw %%mm5, %%mm1\n\t"
-        "pxor %%mm3, %%mm4\n"
-        "pxor %%mm1, %%mm5\n"
-        "psubw %%mm3, %%mm4\n"
-        "psubw %%mm1, %%mm5\n"
-        "paddw %%mm4, %%mm5\n"
-        "paddw %%mm5, %%mm6\n"
-
-        "add %2, %0\n"
-
-        "movq (%0), %%mm4\n"
-        "movq 1(%0), %%mm1\n"
-        "movq %%mm4, %%mm5\n"
-        "movq %%mm1, %%mm3\n"
-        "punpcklbw %%mm7, %%mm4\n"
-        "punpcklbw %%mm7, %%mm1\n"
-        "punpckhbw %%mm7, %%mm5\n"
-        "punpckhbw %%mm7, %%mm3\n"
-        "psubw %%mm1, %%mm4\n"
-        "psubw %%mm3, %%mm5\n"
-        "psubw %%mm4, %%mm0\n"
-        "psubw %%mm5, %%mm2\n"
-        "pxor %%mm3, %%mm3\n"
-        "pxor %%mm1, %%mm1\n"
-        "pcmpgtw %%mm0, %%mm3\n\t"
-        "pcmpgtw %%mm2, %%mm1\n\t"
-        "pxor %%mm3, %%mm0\n"
-        "pxor %%mm1, %%mm2\n"
-        "psubw %%mm3, %%mm0\n"
-        "psubw %%mm1, %%mm2\n"
-        "paddw %%mm0, %%mm2\n"
-        "paddw %%mm2, %%mm6\n"
-
-        "add %2, %0\n"
-        "subl $2, %%ecx\n"
-        " jnz 1b\n"
-
-        "movq %%mm6, %%mm0\n"
-        "punpcklwd %%mm7, %%mm0\n"
-        "punpckhwd %%mm7, %%mm6\n"
-        "paddd %%mm0, %%mm6\n"
+int ff_sum_abs_dctelem_mmx(int16_t *block);
+int ff_sum_abs_dctelem_mmxext(int16_t *block);
+int ff_sum_abs_dctelem_sse2(int16_t *block);
+int ff_sum_abs_dctelem_ssse3(int16_t *block);
+int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                int line_size, int h);
+int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                 int line_size, int h);
+int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  int line_size, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   int stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    int stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                  int stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      int stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       int stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     int stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                      int stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       int stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     int stride, int h);
+int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                              int stride, int h);
+int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                               int stride, int h);
+int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                             int stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                          int line_size, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                           int line_size, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         int line_size, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                    int line_size, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                     int line_size, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                   int line_size, int h);
 
-        "movq %%mm6, %%mm0\n"
-        "psrlq $32, %%mm6\n"
-        "paddd %%mm6, %%mm0\n"
-        "movd %%mm0, %1\n"
-        : "+r" (pix1), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "g" (h - 2)
-        : "%ecx");
+#define hadamard_func(cpu)                                              \
+    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
+                                  uint8_t *src2, int stride, int h);    \
+    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
+                                    uint8_t *src2, int stride, int h);
 
-    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
-}
+hadamard_func(mmx)
+hadamard_func(mmxext)
+hadamard_func(sse2)
+hadamard_func(ssse3)
 
+#if HAVE_YASM
 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                       int line_size, int h)
 {
@@ -413,9 +98,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
     if (c)
         score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
     else
-        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
-    score2 = hf_noise16_mmx(pix1, line_size, h) -
-             hf_noise16_mmx(pix2, line_size, h);
+        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
+    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
+           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -426,9 +111,9 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
 {
-    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
-    int score2 = hf_noise8_mmx(pix1, line_size, h) -
-                 hf_noise8_mmx(pix2, line_size, h);
+    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
+    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
+                 ff_hf_noise8_mmx(pix2, line_size, h);
 
     if (c)
         return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -436,13 +121,17 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
         return score1 + FFABS(score2) * 8;
 }
 
+#endif /* HAVE_YASM */
+
+#if HAVE_INLINE_ASM
+
 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                             int line_size, int h)
 {
     int tmp;
 
-    assert((((int) pix) & 7) == 0);
-    assert((line_size & 7) == 0);
+    av_assert2((((int) pix) & 7) == 0);
+    av_assert2((line_size & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)               \
     "movq (%0), %%mm2\n"                        \
@@ -500,57 +189,14 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 }
 #undef SUM
 
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                               int line_size, int h)
-{
-    int tmp;
-
-    assert((((int) pix) & 7) == 0);
-    assert((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq 8(%0), " #out1 "\n"                   \
-    "add %2, %0\n"                              \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n"
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       int line_size, int h)
 {
     int tmp;
 
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((line_size & 7) == 0);
+    av_assert2((((int) pix1) & 7) == 0);
+    av_assert2((((int) pix2) & 7) == 0);
+    av_assert2((line_size & 7) == 0);
 
 #define SUM(in0, in1, out0, out1)       \
     "movq (%0), %%mm2\n"                \
@@ -624,190 +270,15 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 }
 #undef SUM
 
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         int line_size, int h)
-{
-    int tmp;
-
-    assert((((int) pix1) & 7) == 0);
-    assert((((int) pix2) & 7) == 0);
-    assert((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq (%1), %%mm2\n"                        \
-    "movq 8(%0), " #out1 "\n"                   \
-    "movq 8(%1), %%mm3\n"                       \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb %%mm2, " #out0 "\n"                  \
-    "psubb %%mm3, " #out1 "\n"                  \
-    "pxor %%mm7, " #out0 "\n"                   \
-    "pxor %%mm7, " #out1 "\n"                   \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n    "
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
-#define MMABS_MMX(a,z)                          \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "pcmpgtw " #a ", " #z "             \n\t"   \
-    "pxor "    #z ", " #a "             \n\t"   \
-    "psubw "   #z ", " #a "             \n\t"
-
-#define MMABS_MMXEXT(a, z)                      \
-    "pxor "    #z ", " #z "             \n\t"   \
-    "psubw "   #a ", " #z "             \n\t"   \
-    "pmaxsw "  #z ", " #a "             \n\t"
-
-#define MMABS_SSSE3(a,z)                        \
-    "pabsw "   #a ", " #a "             \n\t"
-
-#define MMABS_SUM(a,z, sum)                     \
-    MMABS(a,z)                                  \
-    "paddusw " #a ", " #sum "           \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst)                     \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $32, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movq    " #a ", " #t "             \n\t"   \
-    "psrlq      $16, " #a "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_MMXEXT(a, t, dst)                  \
-    "pshufw   $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshufw   $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define HSUM_SSE2(a, t, dst)                    \
-    "movhlps " #a ", " #t "             \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x0E, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "pshuflw  $0x01, " #a ", " #t "     \n\t"   \
-    "paddusw " #t ", " #a "             \n\t"   \
-    "movd    " #a ", " #dst "           \n\t"   \
-
-#define DCT_SAD4(m, mm, o)                      \
-    "mov"#m" "#o" +  0(%1), " #mm "2    \n\t"   \
-    "mov"#m" "#o" + 16(%1), " #mm "3    \n\t"   \
-    "mov"#m" "#o" + 32(%1), " #mm "4    \n\t"   \
-    "mov"#m" "#o" + 48(%1), " #mm "5    \n\t"   \
-    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1)        \
-    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0)        \
-    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)        \
-
-#define DCT_SAD_MMX                             \
-    "pxor    %%mm0, %%mm0               \n\t"   \
-    "pxor    %%mm1, %%mm1               \n\t"   \
-    DCT_SAD4(q, %%mm, 0)                        \
-    DCT_SAD4(q, %%mm, 8)                        \
-    DCT_SAD4(q, %%mm, 64)                       \
-    DCT_SAD4(q, %%mm, 72)                       \
-    "paddusw %%mm1, %%mm0               \n\t"   \
-    HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2                            \
-    "pxor    %%xmm0, %%xmm0             \n\t"   \
-    "pxor    %%xmm1, %%xmm1             \n\t"   \
-    DCT_SAD4(dqa, %%xmm, 0)                     \
-    DCT_SAD4(dqa, %%xmm, 64)                    \
-    "paddusw %%xmm1, %%xmm0             \n\t"   \
-    HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu)                           \
-static int sum_abs_dctelem_ ## cpu(int16_t *block)  \
-{                                                   \
-    int sum;                                        \
-    __asm__ volatile (                              \
-        DCT_SAD                                     \
-        :"=r"(sum)                                  \
-        :"r"(block));                               \
-    return sum & 0xFFFF;                            \
-}
-
-#define DCT_SAD         DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z)     MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z)     MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD         DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z)     MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
-
-
 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
     0x0000000000000000ULL,
     0x0001000100010001ULL,
     0x0002000200020002ULL,
 };
 
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -(x86_reg)stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -840,133 +311,10 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
 }
 
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 int stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
-                      int stride, int h)
-{
-    int ret;
-    __asm__ volatile (
-        "pxor %%xmm2, %%xmm2            \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movdqu (%1), %%xmm0            \n\t"
-        "movdqu (%1, %4), %%xmm1        \n\t"
-        "psadbw (%2), %%xmm0            \n\t"
-        "psadbw (%2, %4), %%xmm1        \n\t"
-        "paddw %%xmm0, %%xmm2           \n\t"
-        "paddw %%xmm1, %%xmm2           \n\t"
-        "lea (%1,%4,2), %1              \n\t"
-        "lea (%2,%4,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        "movhlps %%xmm2, %%xmm0         \n\t"
-        "paddw   %%xmm0, %%xmm2         \n\t"
-        "movd    %%xmm2, %3             \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
-        : "r" ((x86_reg) stride));
-    return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   int stride, int h)
-{
-    __asm__ volatile (
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "pavgb 1(%1, %3), %%mm1         \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                   int stride, int h)
-{
-    __asm__ volatile (
-        "movq (%1), %%mm0               \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2, %3), %%mm1         \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
-                                 int stride, int h)
-{
-    __asm__ volatile (
-        "movq "MANGLE(bone)", %%mm5     \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "pavgb 1(%1), %%mm0             \n\t"
-        "add %3, %1                     \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm1               \n\t"
-        "movq (%1,%3), %%mm2            \n\t"
-        "pavgb 1(%1), %%mm1             \n\t"
-        "pavgb 1(%1,%3), %%mm2          \n\t"
-        "psubusb %%mm5, %%mm1           \n\t"
-        "pavgb %%mm1, %%mm0             \n\t"
-        "pavgb %%mm2, %%mm1             \n\t"
-        "psadbw (%2), %%mm0             \n\t"
-        "psadbw (%2,%3), %%mm1          \n\t"
-        "paddw %%mm0, %%mm6             \n\t"
-        "paddw %%mm1, %%mm6             \n\t"
-        "movq %%mm2, %%mm0              \n\t"
-        "lea (%1,%3,2), %1              \n\t"
-        "lea (%2,%3,2), %2              \n\t"
-        "sub $2, %0                     \n\t"
-        " jg 1b                         \n\t"
-        : "+r" (h), "+r" (blk1), "+r" (blk2)
-        : "r" ((x86_reg) stride));
-}
-
 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                               int stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -(x86_reg)stride * h;
     __asm__ volatile (
         ".p2align 4                     \n\t"
         "1:                             \n\t"
@@ -1004,7 +352,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
 
 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
-    x86_reg len = -(stride * h);
+    x86_reg len = -(x86_reg)stride * h;
     __asm__ volatile (
         "movq  (%1, %%"REG_a"), %%mm0   \n\t"
         "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
@@ -1028,7 +376,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         "punpckhbw %%mm7, %%mm5         \n\t"
         "paddw %%mm4, %%mm2             \n\t"
         "paddw %%mm5, %%mm3             \n\t"
-        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+        "movq %5, %%mm5                 \n\t"
         "paddw %%mm2, %%mm0             \n\t"
         "paddw %%mm3, %%mm1             \n\t"
         "paddw %%mm5, %%mm0             \n\t"
@@ -1052,7 +400,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
         " js 1b                         \n\t"
         : "+a" (len)
         : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
-          "r" ((x86_reg) stride));
+          "r" ((x86_reg) stride), "m" (round_tab[2]));
 }
 
 static inline int sum_mmx(void)
@@ -1070,15 +418,6 @@ static inline int sum_mmx(void)
     return ret & 0xFFFF;
 }
 
-static inline int sum_mmxext(void)
-{
-    int ret;
-    __asm__ volatile (
-        "movd %%mm6, %0                 \n\t"
-        : "=r" (ret));
-    return ret;
-}
-
 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
     sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
@@ -1093,7 +432,7 @@ static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                         uint8_t *blk1, int stride, int h)               \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1107,7 +446,7 @@ static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, int stride, int h)            \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1122,7 +461,7 @@ static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, int stride, int h)            \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1137,7 +476,7 @@ static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                             uint8_t *blk1, int stride, int h)           \
 {                                                                       \
-    assert(h == 8);                                                     \
+    av_assert2(h == 8);                                                     \
     __asm__ volatile (                                                  \
         "pxor %%mm7, %%mm7     \n\t"                                    \
         "pxor %%mm6, %%mm6     \n\t"                                    \
@@ -1207,32 +546,15 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
 }                                                                       \
 
 PIX_SAD(mmx)
-PIX_SAD(mmxext)
 
 #endif /* HAVE_INLINE_ASM */
 
-int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                  int line_size, int h);
-
-#define hadamard_func(cpu)                                              \
-    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
-                                  uint8_t *src2, int stride, int h);    \
-    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
-                                    uint8_t *src2, int stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmxext)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_INLINE_ASM
     if (INLINE_MMX(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
-
         c->pix_abs[0][0] = sad16_mmx;
         c->pix_abs[0][1] = sad16_x2_mmx;
         c->pix_abs[0][2] = sad16_y2_mmx;
@@ -1245,77 +567,81 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->sad[0] = sad16_mmx;
         c->sad[1] = sad8_mmx;
 
-        c->sse[0]  = sse16_mmx;
-        c->sse[1]  = sse8_mmx;
         c->vsad[4] = vsad_intra16_mmx;
 
-        c->nsse[0] = nsse16_mmx;
-        c->nsse[1] = nsse8_mmx;
-
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
             c->vsad[0] = vsad16_mmx;
         }
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
-
-        c->vsad[4] = vsad_intra16_mmxext;
-
-        c->pix_abs[0][0] = sad16_mmxext;
-        c->pix_abs[1][0] = sad8_mmxext;
-
-        c->sad[0] = sad16_mmxext;
-        c->sad[1] = sad8_mmxext;
-
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-            c->pix_abs[0][1] = sad16_x2_mmxext;
-            c->pix_abs[0][2] = sad16_y2_mmxext;
-            c->pix_abs[0][3] = sad16_xy2_mmxext;
-            c->pix_abs[1][1] = sad8_x2_mmxext;
-            c->pix_abs[1][2] = sad8_y2_mmxext;
-            c->pix_abs[1][3] = sad8_xy2_mmxext;
-
-            c->vsad[0] = vsad16_mmxext;
-        }
-    }
-
-    if (INLINE_SSE2(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
-    }
-
-    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
-        c->sad[0] = sad16_sse2;
-    }
-
-#if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
-        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
-    }
-#endif
 #endif /* HAVE_INLINE_ASM */
 
     if (EXTERNAL_MMX(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
+        c->sse[0]            = ff_sse16_mmx;
+        c->sse[1]            = ff_sse8_mmx;
+#if HAVE_YASM
+        c->nsse[0]           = nsse16_mmx;
+        c->nsse[1]           = nsse8_mmx;
+#endif
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
+
+        c->sad[0] = ff_sad16_mmxext;
+        c->sad[1] = ff_sad8_mmxext;
+
+        c->pix_abs[0][0] = ff_sad16_mmxext;
+        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+        c->pix_abs[1][0] = ff_sad8_mmxext;
+        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+        c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+        c->vsad[4] = ff_vsad_intra16_mmxext;
+        c->vsad[5] = ff_vsad_intra8_mmxext;
+
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
+            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+            c->vsad[0] = ff_vsad16_approx_mmxext;
+            c->vsad[1] = ff_vsad8_approx_mmxext;
+        }
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
 #endif
+        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+            c->sad[0]        = ff_sad16_sse2;
+            c->pix_abs[0][0] = ff_sad16_sse2;
+            c->pix_abs[0][1] = ff_sad16_x2_sse2;
+            c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+            c->vsad[4]       = ff_vsad_intra16_sse2;
+            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+                c->vsad[0]       = ff_vsad16_approx_sse2;
+            }
+        }
     }
 
-    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+    if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
+#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+#endif
     }
 }
diff --git a/libavcodec/x86/mlpdsp.asm b/libavcodec/x86/mlpdsp.asm
new file mode 100644
index 0000000000..ce656af145
--- /dev/null
+++ b/libavcodec/x86/mlpdsp.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* SIMD-optimized MLP DSP functions
+;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%if ARCH_X86_64
+
+%macro SHLX 2
+%if cpuflag(bmi2)
+   shlx %1, %1, %2q
+%else
+   shl  %1, %2b
+%endif
+%endmacro
+
+%macro REMATRIX 0
+    movdqa        m0, [samplesq]
+    movdqa        m1, [coeffsq ]
+    pshufd        m2, m0, q2301
+    pshufd        m3, m1, q2301
+    pmuldq        m0, m1
+    pmuldq        m3, m2
+    paddq         m0, m3
+%if notcpuflag(avx2)
+    movdqa        m1, [samplesq + 16]
+    movdqa        m2, [coeffsq  + 16]
+    pshufd        m3, m1, q2301
+    pshufd        m4, m2, q2301
+    pmuldq        m1, m2
+    pmuldq        m4, m3
+    paddq         m0, m1
+    paddq         m0, m4
+%else
+    vextracti128 xm1, m0, 1
+    paddq        xm0, xm1
+%endif
+%endmacro
+
+%macro LOOP_END 0
+    pshufd       xm1, xm0, q0032
+    paddq        xm0, xm1
+    movq      accumq, xm0
+    movzx     blsbsd, byte [blsbs_ptrq]             ; load *bypassed_lsbs
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq
+%endmacro
+
+%macro LOOP_SHIFT_END 0
+    pshufd       xm1, xm0, q0032
+    paddq        xm0, xm1
+    movq      accumq, xm0
+    and       indexd, auspd                         ; index &= access_unit_size_pow2;
+    movsx     noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
+    add       indexd, index2d                       ; index += index2
+    SHLX      noiseq, mns                           ; noise_buffer[index] <<= matrix_noise_shift
+    add       accumq, noiseq                        ; accum += noise_buffer[index]
+    movzx     noised, byte [blsbs_ptrq]             ; load *bypassed_lsbs (reuse tmp noise register)
+    sar       accumq, 14                            ; accum >>= 14
+    and       accumd, maskd                         ; accum &= mask
+    add       accumd, noised                        ; accum += *bypassed_lsbs
+    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
+    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
+    add     samplesq, 32                            ; samples += MAX_CHANNELS;
+    cmp   blsbs_ptrq, cntq
+%endmacro
+
+;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
+;                             const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
+;                             int index, unsigned int dest_ch, uint16_t blockpos,
+;                             unsigned int maxchan, int matrix_noise_shift,
+;                             int access_unit_size_pow2, int32_t mask)
+%macro MLP_REMATRIX_CHANNEL 0
+cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
+                                        index, dest_ch, blockpos, maxchan, mns, \
+                                        accum, mask, cnt
+    mov         mnsd, mnsm                          ; load matrix_noise_shift
+    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
+    mov     maxchand, maxchanm                      ; load maxchan
+    mov        maskd, maskm                         ; load mask
+%if WIN64
+    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
+%endif
+    shl     dest_chd, 2
+    lea         cntq, [blsbs_ptrq + blockposq*8]
+    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
+    jne .shift                                      ; jump if true
+    cmp     maxchand, 4                             ; is maxchan < 4?
+    jl .loop4                                       ; jump if true
+
+align 16
+.loop8:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_END
+    jne .loop8
+    RET
+
+align 16
+.loop4:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, q2301
+    pshufd       xm3, xm1, q2301
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_END
+    jne .loop4
+    RET
+
+.shift:
+%if WIN64
+    mov       indexd, indexm         ; load index (not needed on UNIX64)
+%endif
+    mov          r9d, r9m            ; load access_unit_size_pow2
+%if cpuflag(bmi2)
+    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
+                index, dest_ch, accum, index2, mns, \
+                ausp, mask, cnt, noise
+    add         mnsd, 7              ; matrix_noise_shift += 7
+%else ; sse4
+    mov           r6, rcx            ; move rcx elsewhere so we can use cl for matrix_noise_shift
+%if WIN64
+    ; r0 = rcx
+    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
+                index2, accum, ausp, mask, cnt, noise
+%else ; UNIX64
+    ; r3 = rcx
+    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
+                index2, accum, ausp, mask, cnt, noise
+%endif
+    lea         mnsd, [r8 + 7]       ; rcx = matrix_noise_shift + 7
+%endif ; cpuflag
+    sub        auspd, 1              ; access_unit_size_pow2 -= 1
+    cmp          r7d, 4              ; is maxchan < 4?
+    lea      index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
+    jl .loop4_shift                  ; jump if maxchan < 4
+
+align 16
+.loop8_shift:
+    ; Process 5 or more channels
+    REMATRIX
+    LOOP_SHIFT_END
+    jne .loop8_shift
+    RET
+
+align 16
+.loop4_shift:
+    ; Process up to 4 channels
+    movdqa       xm0, [samplesq]
+    movdqa       xm1, [coeffsq ]
+    pshufd       xm2, xm0, q2301
+    pshufd       xm3, xm1, q2301
+    pmuldq       xm0, xm1
+    pmuldq       xm3, xm2
+    paddq        xm0, xm3
+    LOOP_SHIFT_END
+    jne .loop4_shift
+    RET
+%endmacro
+
+INIT_XMM sse4
+MLP_REMATRIX_CHANNEL
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2, bmi2
+MLP_REMATRIX_CHANNEL
+%endif
+
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/mlpdsp.c b/libavcodec/x86/mlpdsp_init.c
index 72fc637764..dc0bc585c7 100644
--- a/libavcodec/x86/mlpdsp.c
+++ b/libavcodec/x86/mlpdsp_init.c
@@ -2,32 +2,47 @@
  * MLP DSP functions x86-optimized
  * Copyright (c) 2009 Ramiro Polla
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/internal.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mlpdsp.h"
 #include "libavcodec/mlp.h"
 
-#if HAVE_7REGS && HAVE_INLINE_ASM
+#define REMATRIX_CHANNEL_FUNC(opt) \
+void ff_mlp_rematrix_channel_##opt(int32_t *samples, \
+                                   const int32_t *coeffs, \
+                                   const uint8_t *bypassed_lsbs, \
+                                   const int8_t *noise_buffer, \
+                                   int index, \
+                                   unsigned int dest_ch, \
+                                   uint16_t blockpos, \
+                                   unsigned int maxchan, \
+                                   int matrix_noise_shift, \
+                                   int access_unit_size_pow2, \
+                                   int32_t mask);
+
+REMATRIX_CHANNEL_FUNC(sse4)
+REMATRIX_CHANNEL_FUNC(avx2_bmi2)
+
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
 
 extern char ff_mlp_firorder_8;
 extern char ff_mlp_firorder_7;
@@ -45,12 +60,12 @@ extern char ff_mlp_iirorder_2;
 extern char ff_mlp_iirorder_1;
 extern char ff_mlp_iirorder_0;
 
-static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
+static const void * const firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
                                    &ff_mlp_firorder_2, &ff_mlp_firorder_3,
                                    &ff_mlp_firorder_4, &ff_mlp_firorder_5,
                                    &ff_mlp_firorder_6, &ff_mlp_firorder_7,
                                    &ff_mlp_firorder_8 };
-static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
+static const void * const iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
                                    &ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
                                    &ff_mlp_iirorder_4 };
 
@@ -179,9 +194,13 @@ static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
 
 av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
 {
-#if HAVE_7REGS && HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
+#if HAVE_7REGS && HAVE_INLINE_ASM && HAVE_INLINE_ASM_NONLOCAL_LABELS
     if (INLINE_MMX(cpu_flags))
         c->mlp_filter_channel = mlp_filter_channel_x86;
 #endif
+    if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags))
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_sse4;
+    if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags) && cpu_flags & AV_CPU_FLAG_BMI2)
+        c->mlp_rematrix_channel = ff_mlp_rematrix_channel_avx2_bmi2;
 }
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 533b4a7c3f..27231674ae 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -2,20 +2,20 @@
  * SIMD-optimized MP3 decoding functions
  * Copyright (c) 2010 Vitor Sessak
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,11 +26,18 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegaudiodsp.h"
 
-void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
-void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+#define DECL(CPU)\
+static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
+void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
+
+#if ARCH_X86_32
+DECL(sse)
+#endif
+DECL(sse2)
+DECL(sse3)
+DECL(ssse3)
+DECL(avx)
+
 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                                float *tmpbuf);
 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
@@ -38,7 +45,7 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
 
 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
 
-#if HAVE_SSE2_INLINE
+#if HAVE_6REGS && HAVE_SSE_INLINE
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -182,7 +189,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     *out = sum;
 }
 
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 
 #if HAVE_YASM
 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
@@ -217,11 +224,17 @@ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
     }                                                                   \
 }
 
+#if HAVE_SSE
+#if ARCH_X86_32
 DECL_IMDCT_BLOCKS(sse,sse)
+#endif
 DECL_IMDCT_BLOCKS(sse2,sse)
 DECL_IMDCT_BLOCKS(sse3,sse)
 DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif
+#if HAVE_AVX_EXTERNAL
 DECL_IMDCT_BLOCKS(avx,avx)
+#endif
 #endif /* HAVE_YASM */
 
 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
@@ -242,16 +255,19 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
         }
     }
 
-#if HAVE_SSE2_INLINE
-    if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS && HAVE_SSE_INLINE
+    if (INLINE_SSE(cpu_flags)) {
         s->apply_window_float = apply_window_mp3;
     }
-#endif /* HAVE_SSE2_INLINE */
+#endif /* HAVE_SSE_INLINE */
 
 #if HAVE_YASM
+#if HAVE_SSE
+#if ARCH_X86_32
     if (EXTERNAL_SSE(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse;
     }
+#endif
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse2;
     }
@@ -261,8 +277,11 @@ av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_ssse3;
     }
+#endif
+#if HAVE_AVX_EXTERNAL
     if (EXTERNAL_AVX(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_avx;
     }
+#endif
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 1395156a36..b0028ce276 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -2,20 +2,20 @@
  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,7 +26,7 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideo.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
 
 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                   int16_t *block, int n, int qscale)
@@ -35,7 +35,7 @@ static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
 
     qmul = qscale << 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     if (!s->h263_aic) {
         if (n < 4)
@@ -111,7 +111,7 @@ static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
 
-    assert(s->block_last_index[n]>=0 || s->h263_aic);
+    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
@@ -171,7 +171,7 @@ static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -239,7 +239,7 @@ static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -306,7 +306,7 @@ static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
     const uint16_t *quant_matrix;
     int block0;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -371,7 +371,7 @@ static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
 
-    assert(s->block_last_index[n]>=0);
+    av_assert2(s->block_last_index[n]>=0);
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -442,11 +442,11 @@ __asm__ volatile(
         );
 }
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 
 av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_MMX_INLINE
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
@@ -458,5 +458,5 @@ av_cold void ff_mpv_common_init_x86(MpegEncContext *s)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_MMX_INLINE */
 }
diff --git a/libavcodec/x86/mpegvideodsp.c b/libavcodec/x86/mpegvideodsp.c
index 0e5dd0f153..941a8e2e4c 100644
--- a/libavcodec/x86/mpegvideodsp.c
+++ b/libavcodec/x86/mpegvideodsp.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -22,6 +22,7 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpegvideodsp.h"
+#include "libavcodec/videodsp.h"
 
 #if HAVE_INLINE_ASM
 
@@ -43,20 +44,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
     const uint64_t shift2  = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
     int x, y;
 
     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
     const int dxh = dxy * (h - 1);
     const int dyw = dyx * (w - 1);
+    int need_emu  =  (unsigned) ix >= width  - w ||
+                     (unsigned) iy >= height - h;
 
     if ( // non-constant fullpel offset (3% of blocks)
         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
         // uses more than 16 bits of subpel mv (only at huge resolution)
         (dxx | dxy | dyx | dyy) & 15 ||
-        (unsigned) ix >= width  - w ||
-        (unsigned) iy >= height - h) {
+        (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
         // FIXME could still use mmx for some of the rows
         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                  shift, r, width, height);
@@ -64,6 +69,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
     }
 
     src += ix + iy * stride;
+    if (need_emu) {
+        ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
+        src = edge_buf;
+    }
 
     __asm__ volatile (
         "movd         %0, %%mm6         \n\t"
@@ -150,4 +159,3 @@ av_cold void ff_mpegvideodsp_init_x86(MpegVideoDSPContext *c)
         c->gmc = gmc_mmx;
 #endif /* HAVE_INLINE_ASM */
 }
-
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 47349d17ec..b410511c6a 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -2,20 +2,20 @@
  * The simplest mpeg encoder (well, it was the simplest!)
  * Copyright (c) 2000,2001 Fabrice Bellard
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,8 @@
 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
 DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
 
+#if HAVE_6REGS
+
 #if HAVE_MMX_INLINE
 #define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2   0
@@ -81,6 +83,8 @@ DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64];
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_SSSE3_INLINE */
 
+#endif /* HAVE_6REGS */
+
 #if HAVE_INLINE_ASM
 static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
@@ -193,7 +197,7 @@ static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
 }
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
+av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
 {
     const int dct_algo = s->avctx->dct_algo;
     int i;
@@ -205,21 +209,25 @@ av_cold void ff_mpv_encode_init_x86(MpegEncContext *s)
 #if HAVE_MMX_INLINE
         int cpu_flags = av_get_cpu_flags();
         if (INLINE_MMX(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_mmx;
+#endif
             s->denoise_dct  = denoise_dct_mmx;
         }
 #endif
-#if HAVE_MMXEXT_INLINE
+#if HAVE_6REGS && HAVE_MMXEXT_INLINE
         if (INLINE_MMXEXT(cpu_flags))
             s->dct_quantize = dct_quantize_mmxext;
 #endif
 #if HAVE_SSE2_INLINE
         if (INLINE_SSE2(cpu_flags)) {
+#if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
+#endif
             s->denoise_dct  = denoise_dct_sse2;
         }
 #endif
-#if HAVE_SSSE3_INLINE
+#if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
 #endif
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
index 8d8d68762a..882d486205 100644
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ b/libavcodec/x86/mpegvideoenc_qns_template.c
@@ -5,26 +5,26 @@
  * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
  * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include <assert.h>
 #include <stdint.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/x86/asm.h"
 
@@ -36,7 +36,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
 {
     x86_reg i=0;
 
-    assert(FFABS(scale) < MAX_ABS);
+    av_assert2(FFABS(scale) < MAX_ABS);
     scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     SET_RND(mm6);
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 1274c13e05..1899ba23c6 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -107,7 +107,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     const uint16_t *qmat, *bias;
     LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
-    assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+    av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
 
     //s->fdct (block);
     RENAME_FDCT(ff_fdct)(block); // cannot be anything else ...
@@ -117,10 +117,15 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 
     if (s->mb_intra) {
         int dummy;
-        if (n < 4)
+        if (n < 4){
             q = s->y_dc_scale;
-        else
+            bias = s->q_intra_matrix16[qscale][1];
+            qmat = s->q_intra_matrix16[qscale][0];
+        }else{
             q = s->c_dc_scale;
+            bias = s->q_chroma_intra_matrix16[qscale][1];
+            qmat = s->q_chroma_intra_matrix16[qscale][0];
+        }
         /* note: block[0] is assumed to be positive */
         if (!s->h263_aic) {
         __asm__ volatile (
@@ -135,8 +140,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0]=0; //avoid fake overflow
 //        temp_block[0] = (block[0] + (q >> 1)) / q;
         last_non_zero_p1 = 1;
-        bias = s->q_intra_matrix16[qscale][1];
-        qmat = s->q_intra_matrix16[qscale][0];
     } else {
         last_non_zero_p1 = 0;
         bias = s->q_inter_matrix16[qscale][1];
@@ -172,7 +175,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"REG_a"              \n\t"
-            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -206,7 +209,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
             " js 1b                             \n\t"
             PMAX(MM"3", MM"0")
             "movd "MM"3, %%"REG_a"              \n\t"
-            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
+            "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
@@ -220,7 +223,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         "psubusw "MM"1, "MM"4               \n\t"
         "packuswb "MM"4, "MM"4              \n\t"
 #if COMPILE_TEMPLATE_SSE2
-        "packuswb "MM"4, "MM"4              \n\t"
+        "packsswb "MM"4, "MM"4              \n\t"
 #endif
         "movd "MM"4, %0                     \n\t" // *overflow
         : "=g" (*overflow)
@@ -274,6 +277,50 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
         block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
         block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    }else if(s->idsp.perm_type == FF_IDCT_PERM_LIBMPEG2){
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x04] = temp_block[0x01];
+        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+        block[0x05] = temp_block[0x03];
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x1C] = temp_block[0x19];
+        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+            block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
     }else{
         if(last_non_zero_p1 <= 1) goto end;
         block[0x01] = temp_block[0x01];
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee776d..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION .text
+SECTION_RODATA
 
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
     movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
+    mov          r2, %1
+%if mmsize == 16
+    lea          r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+    pxor         m5, m5
+%endif
+    pxor         m4, m4
 .loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
+%if cpuflag(xop)
+    vphaddubq    m0, [r0]
+    vphaddubq    m1, [r0+r1]
+    vphaddubq    m2, [r0+r1*2]
+    vphaddubq    m3, [r0+r3]
+%else
+    mova         m0, [r0]
+%if mmsize == 8
+    mova         m1, [r0+8]
+%if cpuflag(mmxext)
+    mova         m2, [r0+r1]
+    mova         m3, [r0+r1+8]
+%endif
+%else ; sse2
+    mova         m1, [r0+r1]
+    mova         m2, [r0+r1*2]
+    mova         m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+    psadbw       m0, m5
+    psadbw       m1, m5
+    psadbw       m2, m5
+    psadbw       m3, m5
+%else ; mmx
+    punpckhbw    m2, m0, m5
+    punpcklbw    m0, m5
+    punpckhbw    m3, m1, m5
+    punpcklbw    m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
     paddw        m1, m0
     paddw        m3, m2
     paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd        eax, m6
-    and         eax, 0xffff
+    paddw        m4, m3
+%if cpuflag(mmxext)
+    lea          r0, [r0+r1*%3]
+%else
+    add          r0, r1
+%endif
+    dec r2
+    jne .loop
+%if mmsize == 16
+    pshufd       m0, m4, q0032
+    paddd        m4, m0
+%elif notcpuflag(mmxext)
+    HADDW        m4, m5
+%endif
+    movd        eax, m4
     RET
+%endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16  8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16  4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16  4, 4, 4
+%endif
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov          r2, 16
+    mov          r2, %2
     pxor         m0, m0
-    pxor         m7, m7
+    pxor         m5, m5
 .loop:
     mova         m2, [r0+0]
+%if mmsize == 8
     mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
+%else
+    mova         m3, [r0+r1]
+%endif
+    punpckhbw    m1, m2, m0
     punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
+    punpckhbw    m4, m3, m0
+    punpcklbw    m3, m0
     pmaddwd      m1, m1
     pmaddwd      m2, m2
     pmaddwd      m3, m3
     pmaddwd      m4, m4
     paddd        m2, m1
     paddd        m4, m3
-    paddd        m7, m2
+    paddd        m5, m2
+    paddd        m5, m4
+%if mmsize == 8
     add          r0, r1
-    paddd        m7, m4
+%else
+    lea          r0, [r0+r1*2]
+%endif
     dec r2
     jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd        eax, m1
+    HADDD        m5, m1
+    movd        eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
 
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index 7732e7307f..2a4db61511 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -1,29 +1,34 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_mmxext(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
+int ff_pix_sum16_xop(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 
@@ -123,7 +128,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
-    } else {
+    } else if (w == 16) {
         __asm__ volatile (
             "1:                                 \n\t"
             "movd            (%0), %%mm0        \n\t"
@@ -141,6 +146,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
             "add               %1, %0           \n\t"
             "cmp               %3, %0           \n\t"
             "jb                1b               \n\t"
+            : "+r"(ptr)
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+            );
+    } else {
+        av_assert1(w == 4);
+        __asm__ volatile (
+            "1:                             \n\t"
+            "movd            (%0), %%mm0    \n\t"
+            "punpcklbw      %%mm0, %%mm0    \n\t"
+            "punpcklwd      %%mm0, %%mm0    \n\t"
+            "movd           %%mm0, -4(%0)   \n\t"
+            "movd      -4(%0, %2), %%mm1    \n\t"
+            "punpcklbw      %%mm1, %%mm1    \n\t"
+            "punpckhwd      %%mm1, %%mm1    \n\t"
+            "punpckhdq      %%mm1, %%mm1    \n\t"
+            "movd           %%mm1, (%0, %2) \n\t"
+            "add               %1, %0       \n\t"
+            "cmp               %3, %0       \n\t"
+            "jb                1b           \n\t"
             : "+r" (ptr)
             : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
               "r" (ptr + wrap * height));
@@ -195,11 +219,26 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->pix_sum   = ff_pix_sum16_mmx;
         c->pix_norm1 = ff_pix_norm1_mmx;
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_mmxext;
+    }
+#endif
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_sse2;
+        c->pix_norm1   = ff_pix_norm1_sse2;
+    }
+
+    if (EXTERNAL_XOP(cpu_flags)) {
+        c->pix_sum     = ff_pix_sum16_xop;
+    }
+
 #if HAVE_INLINE_ASM
 
     if (INLINE_MMX(cpu_flags)) {
diff --git a/libavcodec/x86/pixblockdsp.asm b/libavcodec/x86/pixblockdsp.asm
index c8fd1b24a1..7c5377b2bb 100644
--- a/libavcodec/x86/pixblockdsp.asm
+++ b/libavcodec/x86/pixblockdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2000, 2001 Fabrice Bellard
 ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
 
@@ -26,9 +26,8 @@
 SECTION .text
 
 INIT_MMX mmx
-; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
+; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size)
 cglobal get_pixels, 3,4
-    movsxdifnidn r2, r2d
     add          r0, 128
     mov          r3, -128
     pxor         m7, m7
@@ -51,8 +50,7 @@ cglobal get_pixels, 3,4
     REP_RET
 
 INIT_XMM sse2
-cglobal get_pixels, 3, 4
-    movsxdifnidn r2, r2d
+cglobal get_pixels, 3, 4, 5
     lea          r3, [r2*3]
     pxor         m4, m4
     movh         m0, [r1]
@@ -108,3 +106,28 @@ cglobal diff_pixels, 4,5
     add          r4, 16
     jne .loop
     REP_RET
+
+INIT_XMM sse2
+cglobal diff_pixels, 4, 5, 5
+    movsxdifnidn r3, r3d
+    pxor         m4, m4
+    add          r0,  128
+    mov          r4, -128
+.loop:
+    movh         m0, [r1]
+    movh         m2, [r2]
+    movh         m1, [r1+r3]
+    movh         m3, [r2+r3]
+    punpcklbw    m0, m4
+    punpcklbw    m1, m4
+    punpcklbw    m2, m4
+    punpcklbw    m3, m4
+    psubw        m0, m2
+    psubw        m1, m3
+    mova [r0+r4+0 ], m0
+    mova [r0+r4+16], m1
+    lea          r1, [r1+r3*2]
+    lea          r2, [r2+r3*2]
+    add          r4, 32
+    jne .loop
+    RET
diff --git a/libavcodec/x86/pixblockdsp_init.c b/libavcodec/x86/pixblockdsp_init.c
index 9582e0b5c2..4d06a44c6d 100644
--- a/libavcodec/x86/pixblockdsp_init.c
+++ b/libavcodec/x86/pixblockdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * SIMD-optimized pixel operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,10 +23,12 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/pixblockdsp.h"
 
-void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
-void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
+void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
+void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, ptrdiff_t line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
+void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
+                         int stride);
 
 av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
                                      AVCodecContext *avctx,
@@ -43,5 +45,6 @@ av_cold void ff_pixblockdsp_init_x86(PixblockDSPContext *c,
     if (EXTERNAL_SSE2(cpu_flags)) {
         if (!high_bit_depth)
             c->get_pixels = ff_get_pixels_sse2;
+        c->diff_pixels = ff_diff_pixels_sse2;
     }
 }
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index c05f3da017..8e23ccfbc6 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/x86/pngdsp_init.c
index 34a3da36d7..7dca62c675 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/x86/pngdsp_init.c
@@ -2,20 +2,20 @@
  * x86 PNG optimizations.
  * Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index a0e97b3951..255eb24897 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -1,23 +1,24 @@
 ;******************************************************************************
 ;* x86-SIMD-optimized IDCT for prores
-;* this is identical to "simple" IDCT except for the clip range
+;* this is identical to "simple" IDCT written by Michael Niedermayer
+;* except for the clip range
 ;*
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -47,10 +48,10 @@ w1_plus_w5: times 4 dw W1sh2, +W5sh2
 w5_min_w1:  times 4 dw W5sh2, -W1sh2
 w5_plus_w7: times 4 dw W5sh2, +W7sh2
 w7_min_w5:  times 4 dw W7sh2, -W5sh2
-row_round:  times 8 dw (1<<14)
+pw_88:      times 8 dw 0x2008
 
+cextern pw_1
 cextern pw_4
-cextern pw_8
 cextern pw_512
 cextern pw_1019
 
@@ -91,14 +92,12 @@ section .text align=16
     ; a2 -= W6 * row[2];
     ; a3 -= W2 * row[2];
 %ifidn %1, col
-    paddw       m10,[pw_8]
+    paddw       m10,[pw_88]
 %endif
-    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
 %ifidn %1, row
-    psubw       m10,[row_round]
+    paddw       m10,[pw_1]
 %endif
-    SIGNEXTEND  m8,  m9,  m14      ; { row[2] }[0-3] / [4-7]
-    SIGNEXTEND  m10, m11, m14      ; { row[0] }[0-3] / [4-7]
+    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
     pmaddwd     m2,  m0, [w4_plus_w6]
     pmaddwd     m3,  m1, [w4_plus_w6]
     pmaddwd     m4,  m0, [w4_min_w6]
@@ -107,75 +106,33 @@ section .text align=16
     pmaddwd     m7,  m1, [w4_min_w2]
     pmaddwd     m0, [w4_plus_w2]
     pmaddwd     m1, [w4_plus_w2]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
 
     ; a0: -1*row[0]-1*row[2]
     ; a1: -1*row[0]
     ; a2: -1*row[0]
     ; a3: -1*row[0]+1*row[2]
-    psubd       m2,  m10           ; a1[0-3]
-    psubd       m3,  m11           ; a1[4-7]
-    psubd       m4,  m10           ; a2[0-3]
-    psubd       m5,  m11           ; a2[4-7]
-    psubd       m0,  m10
-    psubd       m1,  m11
-    psubd       m6,  m10
-    psubd       m7,  m11
-    psubd       m0,  m8            ; a0[0-3]
-    psubd       m1,  m9            ; a0[4-7]
-    paddd       m6,  m8            ; a3[0-3]
-    paddd       m7,  m9            ; a3[4-7]
 
     ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
     ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
     ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
     ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
     SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m14, m10      ; { row[4] }[0-3] / [4-7]
     pmaddwd     m10, m8, [w4_plus_w6]
     pmaddwd     m11, m9, [w4_plus_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10,  m13
-    psubd       m11,  m14
     paddd       m0,  m10            ; a0[0-3]
     paddd       m1,  m11            ; a0[4-7]
     pmaddwd     m10, m8, [w4_min_w6]
     pmaddwd     m11, m9, [w4_min_w6]
-    pslld       m10, 2
-    pslld       m11, 2
-    psubd       m10, m13
-    psubd       m11, m14
     paddd       m6,  m10           ; a3[0-3]
     paddd       m7,  m11           ; a3[4-7]
     pmaddwd     m10, m8, [w4_min_w2]
     pmaddwd     m11, m9, [w4_min_w2]
     pmaddwd     m8, [w4_plus_w2]
     pmaddwd     m9, [w4_plus_w2]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m8,  2
-    pslld       m9,  2
-    psubd       m10, m13
-    psubd       m11, m14
-    psubd       m8,  m13
-    psubd       m9,  m14
     psubd       m4,  m10           ; a2[0-3] intermediate
     psubd       m5,  m11           ; a2[4-7] intermediate
     psubd       m2,  m8            ; a1[0-3] intermediate
     psubd       m3,  m9            ; a1[4-7] intermediate
-    SIGNEXTEND  m12, m13, m10      ; { row[6] }[0-3] / [4-7]
-    psubd       m4,  m12           ; a2[0-3]
-    psubd       m5,  m13           ; a2[4-7]
-    paddd       m2,  m12           ; a1[0-3]
-    paddd       m3,  m13           ; a1[4-7]
 
     ; load/store
     mova   [r2+  0], m0
@@ -206,8 +163,6 @@ section .text align=16
     ; b3 = MUL(W7, row[1]);
     ; MAC(b3, -W5, row[3]);
     SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
-    SIGNEXTEND  m10, m11, m12      ; { row[1] }[0-3] / [4-7]
-    SIGNEXTEND  m8,  m9,  m12      ; { row[3] }[0-3] / [4-7]
     pmaddwd     m2,  m0, [w3_min_w7]
     pmaddwd     m3,  m1, [w3_min_w7]
     pmaddwd     m4,  m0, [w5_min_w1]
@@ -216,35 +171,11 @@ section .text align=16
     pmaddwd     m7,  m1, [w7_min_w5]
     pmaddwd     m0, [w1_plus_w3]
     pmaddwd     m1, [w1_plus_w3]
-    pslld       m2,  2
-    pslld       m3,  2
-    pslld       m4,  2
-    pslld       m5,  2
-    pslld       m6,  2
-    pslld       m7,  2
-    pslld       m0,  2
-    pslld       m1,  2
 
     ; b0: +1*row[1]+2*row[3]
     ; b1: +2*row[1]-1*row[3]
     ; b2: -1*row[1]-1*row[3]
     ; b3: +1*row[1]+1*row[3]
-    psubd       m2,  m8
-    psubd       m3,  m9
-    paddd       m0,  m8
-    paddd       m1,  m9
-    paddd       m8,  m10           ; { row[1] + row[3] }[0-3]
-    paddd       m9,  m11           ; { row[1] + row[3] }[4-7]
-    paddd       m10, m10
-    paddd       m11, m11
-    paddd       m0,  m8            ; b0[0-3]
-    paddd       m1,  m9            ; b0[4-7]
-    paddd       m2,  m10           ; b1[0-3]
-    paddd       m3,  m11           ; b2[4-7]
-    psubd       m4,  m8            ; b2[0-3]
-    psubd       m5,  m9            ; b2[4-7]
-    paddd       m6,  m8            ; b3[0-3]
-    paddd       m7,  m9            ; b3[4-7]
 
     ; MAC(b0,  W5, row[5]);
     ; MAC(b0,  W7, row[7]);
@@ -255,38 +186,16 @@ section .text align=16
     ; MAC(b3,  W3, row[5]);
     ; MAC(b3, -W1, row[7]);
     SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
-    SIGNEXTEND  m13, m12, m11      ; { row[5] }[0-3] / [4-7]
-    SIGNEXTEND  m14, m11, m10      ; { row[7] }[0-3] / [4-7]
 
     ; b0: -1*row[5]+1*row[7]
     ; b1: -1*row[5]+1*row[7]
     ; b2: +1*row[5]+2*row[7]
     ; b3: +2*row[5]-1*row[7]
-    paddd       m4,  m13
-    paddd       m5,  m12
-    paddd       m6,  m13
-    paddd       m7,  m12
-    psubd       m13, m14           ; { row[5] - row[7] }[0-3]
-    psubd       m12, m11           ; { row[5] - row[7] }[4-7]
-    paddd       m14, m14
-    paddd       m11, m11
-    psubd       m0,  m13
-    psubd       m1,  m12
-    psubd       m2,  m13
-    psubd       m3,  m12
-    paddd       m4,  m14
-    paddd       m5,  m11
-    paddd       m6,  m13
-    paddd       m7,  m12
 
     pmaddwd     m10, m8, [w1_plus_w5]
     pmaddwd     m11, m9, [w1_plus_w5]
     pmaddwd     m12, m8, [w5_plus_w7]
     pmaddwd     m13, m9, [w5_plus_w7]
-    pslld       m10, 2
-    pslld       m11, 2
-    pslld       m12,  2
-    pslld       m13,  2
     psubd       m2,  m10           ; b1[0-3]
     psubd       m3,  m11           ; b1[4-7]
     paddd       m0,  m12            ; b0[0-3]
@@ -295,10 +204,6 @@ section .text align=16
     pmaddwd     m13, m9, [w7_plus_w3]
     pmaddwd     m8, [w3_min_w1]
     pmaddwd     m9, [w3_min_w1]
-    pslld       m12, 2
-    pslld       m13, 2
-    pslld       m8,  2
-    pslld       m9,  2
     paddd       m4,  m12           ; b2[0-3]
     paddd       m5,  m13           ; b2[4-7]
     paddd       m6,  m8            ; b3[0-3]
@@ -345,7 +250,7 @@ cglobal prores_idct_put_10, 4, 4, %1
     pmullw      m13,[r3+64]
     pmullw      m12,[r3+96]
 
-    IDCT_1D     row, 17
+    IDCT_1D     row, 15
 
     ; transpose for second part of IDCT
     TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
@@ -360,20 +265,11 @@ cglobal prores_idct_put_10, 4, 4, %1
 
     ; for (i = 0; i < 8; i++)
     ;     idctSparseColAdd(dest + i, line_size, block + i);
-    IDCT_1D     col, 20
+    IDCT_1D     col, 18
 
     ; clip/store
-    mova        m6, [pw_512]
     mova        m3, [pw_4]
     mova        m5, [pw_1019]
-    paddw       m8,  m6
-    paddw       m0,  m6
-    paddw       m1,  m6
-    paddw       m2,  m6
-    paddw       m4,  m6
-    paddw       m11, m6
-    paddw       m9,  m6
-    paddw       m10, m6
     pmaxsw      m8,  m3
     pmaxsw      m0,  m3
     pmaxsw      m1,  m3
@@ -422,7 +318,9 @@ INIT_XMM sse2
 idct_put_fn 16
 INIT_XMM sse4
 idct_put_fn 16
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 idct_put_fn 16
+%endif
 
 %endif
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index e82dac0448..d647788e6b 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -3,20 +3,20 @@
  *
  * Copyright (c) 2010-2011 Maxim Poliakovski
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,7 +32,7 @@ void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
 void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
                                 int16_t *block, const int16_t *qmat);
 
-av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp)
+av_cold void ff_proresdsp_init_x86(ProresDSPContext *dsp, AVCodecContext *avctx)
 {
 #if ARCH_X86_64
     int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/x86/qpel.asm b/libavcodec/x86/qpel.asm
index 27a1c63b8a..4e72d5084f 100644
--- a/libavcodec/x86/qpel.asm
+++ b/libavcodec/x86/qpel.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2003-2013 Michael Niedermayer
 ;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp.asm b/libavcodec/x86/qpeldsp.asm
index 8f65550e60..dc0f900c5b 100644
--- a/libavcodec/x86/qpeldsp.asm
+++ b/libavcodec/x86/qpeldsp.asm
@@ -1,22 +1,23 @@
 ;******************************************************************************
-;* quarterpel DSP functions
-;*
+;* mpeg4 qpel
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2013 Daniel Kang
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index cdefe50a3c..3268d907ab 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -1,20 +1,22 @@
 /*
  * quarterpel DSP functions
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -77,13 +79,13 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
 
 #if HAVE_YASM
 
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx
 
 #define QPEL_OP(OPNAME, RND, MMX)                                       \
 static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst,                  \
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
index a9fb13234b..c9fd71eeef 100644
--- a/libavcodec/x86/rnd_template.c
+++ b/libavcodec/x86/rnd_template.c
@@ -7,20 +7,20 @@
  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  * and improved by Zdenek Kabelac <kabi@users.sf.net>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b600..7732d65b2a 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 586e4e9a6d..99c56f9d09 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -2,20 +2,20 @@
  * RV30/40 MMX/SSE2 optimizations
  * Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 0a242b54e3..fdd81a0a37 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -4,20 +4,20 @@
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index e006c76584..2900e2d2d3 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -2,20 +2,20 @@
  * RV40 decoder motion compensation functions x86-optimised
  * Copyright (c) 2008 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -32,6 +32,13 @@
 #include "libavutil/x86/cpu.h"
 #include "hpeldsp.h"
 
+#define DEFINE_FN(op, size, insn) \
+static void op##_rv40_qpel##size##_mc33_##insn(uint8_t *dst, const uint8_t *src, \
+                                               ptrdiff_t stride) \
+{ \
+    ff_##op##_pixels##size##_xy2_##insn(dst, src, stride, size); \
+}
+
 #if HAVE_YASM
 void ff_put_rv40_chroma_mc8_mmx  (uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
@@ -127,8 +134,8 @@ QPEL_FUNCS_DECL(OP, 3, 2, OPT)
 /** @} */
 
 #define LOOPSIZE  8
-#define HCOFF(x)  (32 * (x - 1))
-#define VCOFF(x)  (32 * (x - 1))
+#define HCOFF(x)  (32 * ((x) - 1))
+#define VCOFF(x)  (32 * ((x) - 1))
 QPEL_MC_DECL(put_, _ssse3)
 QPEL_MC_DECL(avg_, _ssse3)
 
@@ -136,8 +143,8 @@ QPEL_MC_DECL(avg_, _ssse3)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  8
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 QPEL_MC_DECL(put_, _sse2)
 QPEL_MC_DECL(avg_, _sse2)
 
@@ -146,8 +153,8 @@ QPEL_MC_DECL(avg_, _sse2)
 #undef HCOFF
 #undef VCOFF
 #define LOOPSIZE  4
-#define HCOFF(x)  (64 * (x - 1))
-#define VCOFF(x)  (64 * (x - 1))
+#define HCOFF(x)  (64 * ((x) - 1))
+#define VCOFF(x)  (64 * ((x) - 1))
 
 QPEL_MC_DECL(put_, _mmx)
 
@@ -186,30 +193,24 @@ QPEL_FUNCS_SET (OP, 3, 1, OPT) \
 QPEL_FUNCS_SET (OP, 3, 2, OPT)
 /** @} */
 
+DEFINE_FN(put, 8, ssse3)
+
+DEFINE_FN(put, 16, sse2)
+DEFINE_FN(put, 16, ssse3)
+
+DEFINE_FN(avg, 8, mmxext)
+DEFINE_FN(avg, 8, ssse3)
+
+DEFINE_FN(avg, 16, sse2)
+DEFINE_FN(avg, 16, ssse3)
 #endif /* HAVE_YASM */
 
 #if HAVE_MMX_INLINE
-static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                    ptrdiff_t stride)
-{
-    ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
-}
-static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, const uint8_t *src,
-                                     ptrdiff_t stride)
-{
-    ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
-}
-#endif /* HAVE_MMX_INLINE */
+DEFINE_FN(put, 8, mmx)
+DEFINE_FN(avg, 8, mmx)
+DEFINE_FN(put, 16, mmx)
+DEFINE_FN(avg, 16, mmx)
+#endif
 
 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 {
@@ -240,6 +241,7 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_mmxext;
         c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
         c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
@@ -251,6 +253,8 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 #endif
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_sse2;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_sse2;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
@@ -259,6 +263,10 @@ av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
         QPEL_MC_SET(avg_, _sse2)
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
+        c->put_pixels_tab[0][15]        = put_rv40_qpel16_mc33_ssse3;
+        c->put_pixels_tab[1][15]        = put_rv40_qpel8_mc33_ssse3;
+        c->avg_pixels_tab[0][15]        = avg_rv40_qpel16_mc33_ssse3;
+        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_ssse3;
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index d7164b6496..6f2e4f48d9 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -2,20 +2,20 @@
 ;* AAC Spectral Band Replication decoding functions
 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -25,7 +25,13 @@ SECTION_RODATA
 ; mask equivalent for multiply by -1.0 1.0
 ps_mask         times 2 dd 1<<31, 0
 ps_mask2        times 2 dd 0, 1<<31
-ps_neg          times 4 dd 1<<31
+ps_noise0       times 2 dd  1.0,  0.0,
+ps_noise2       times 2 dd -1.0,  0.0
+ps_noise13      dd  0.0,  1.0, 0.0, -1.0
+                dd  0.0, -1.0, 0.0,  1.0
+                dd  0.0,  1.0, 0.0, -1.0
+cextern         sbr_noise_table
+cextern         ps_neg
 
 SECTION_TEXT
 
@@ -136,7 +142,6 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
     mova       m3, m1
     mova       m4, m2
-    mova       m7, [ps_mask]
 
     ; Set pointers
 %if ARCH_X86_64 == 0 || WIN64
@@ -156,30 +161,28 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
     shl      start, 3            ; offset from num loops
 
     mova        m0, [X_lowq + start]
-    movlhps     m1, m1           ; (a2 a3 a2 a3)
-    movlhps     m2, m2           ; (a0 a1 a0 a1)
-    shufps      m3, m3, q0101    ; (a3 a2 a3 a2)
-    shufps      m4, m4, q0101    ; (a1 a0 a1 a0)
-    xorps       m3, m7           ; (-a3 a2 -a3 a2)
-    xorps       m4, m7           ; (-a1 a0 -a1 a0)
+    shufps      m3, m3, q1111
+    shufps      m4, m4, q1111
+    xorps       m3, [ps_mask]
+    shufps      m1, m1, q0000
+    shufps      m2, m2, q0000
+    xorps       m4, [ps_mask]
 .loop2:
-    mova        m5, m0
+    movu        m7, [X_lowq + start + 8]        ; BbCc
     mova        m6, m0
-    shufps      m0, m0, q2200    ; {Xl[-2][0],",Xl[-1][0],"}
-    shufps      m5, m5, q3311    ; {Xl[-2][1],",Xl[-1][1],"}
-    mulps       m0, m2
-    mulps       m5, m4
-    mova        m7, m6
-    addps       m5, m0
-    mova        m0, [X_lowq + start + 2*2*4]
-    shufps      m6, m0, q0022    ; {Xl[-1][0],",Xl[0][0],"}
-    shufps      m7, m0, q1133    ; {Xl[-1][1],",Xl[1][1],"}
-    mulps       m6, m1
+    mova        m5, m7
+    shufps      m0, m0, q2301                   ; aAbB
+    shufps      m7, m7, q2301                   ; bBcC
+    mulps       m0, m4
     mulps       m7, m3
-    addps       m5, m6
+    mulps       m6, m2
+    mulps       m5, m1
+    addps       m7, m0
+    mova        m0, [X_lowq + start +16]        ; CcDd
     addps       m7, m0
-    addps       m5, m7
-    mova  [X_highq + start], m5
+    addps       m6, m5
+    addps       m7, m6
+    mova  [X_highq + start], m7
     add     start, 16
     jnz         .loop2
     RET
@@ -246,33 +249,47 @@ cglobal sbr_neg_odd_64, 1,2,4,z
     jne      .loop
     REP_RET
 
-INIT_XMM sse2
 ; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY  0
 cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
     mov               cq, 64*4-2*mmsize
     lea            vrevq, [vq + 64*4]
 .loop:
     mova              m0, [src0q+cq]
     mova              m1, [src1q]
-    mova              m2, [src0q+cq+mmsize]
-    mova              m3, [src1q+mmsize]
-    pshufd            m4, m0, q0123
-    pshufd            m5, m1, q0123
-    pshufd            m6, m2, q0123
-    pshufd            m7, m3, q0123
-    addps             m3, m4
+    mova              m4, [src0q+cq+mmsize]
+    mova              m5, [src1q+mmsize]
+%if cpuflag(sse2)
+    pshufd            m2, m0, q0123
+    pshufd            m3, m1, q0123
+    pshufd            m6, m4, q0123
+    pshufd            m7, m5, q0123
+%else
+    shufps            m2, m0, m0, q0123
+    shufps            m3, m1, m1, q0123
+    shufps            m6, m4, m4, q0123
+    shufps            m7, m5, m5, q0123
+%endif
+    addps             m5, m2
     subps             m0, m7
     addps             m1, m6
-    subps             m2, m5
+    subps             m4, m3
     mova         [vrevq], m1
-    mova  [vrevq+mmsize], m3
+    mova  [vrevq+mmsize], m5
     mova         [vq+cq], m0
-    mova  [vq+cq+mmsize], m2
+    mova  [vq+cq+mmsize], m4
     add            src1q, 2*mmsize
     add            vrevq, 2*mmsize
     sub               cq, 2*mmsize
     jge            .loop
     REP_RET
+%endmacro
+
+INIT_XMM sse
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+SBR_QMF_DEINT_BFLY
 
 INIT_XMM sse2
 cglobal sbr_qmf_pre_shuffle, 1,4,6,z
@@ -303,3 +320,128 @@ cglobal sbr_qmf_pre_shuffle, 1,4,6,z
     movq       m2, [zq]
     movq    [r2q], m2
     REP_RET
+
+%ifdef PIC
+%define NREGS 1
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%endif
+
+%macro LOAD_NST  1
+%ifdef PIC
+    lea  NOISE_TABLE, [%1]
+    mova          m0, [kxq + NOISE_TABLE]
+%else
+    mova          m0, [kxq + %1]
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise0]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    mova       m0, [ps_noise2]
+    jmp apply_noise_main
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+;                      const float *q_filt, int noise,
+;                      int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+    and       kxq, 1
+    shl       kxq, 4
+    LOAD_NST  ps_noise13+16
+
+apply_noise_main:
+%if ARCH_X86_64 == 0 || WIN64
+    mov       kxd, m_maxm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+    dec    noiseq
+    shl    count, 2
+%ifdef PIC
+    lea NOISE_TABLE, [sbr_noise_table]
+%endif
+    lea        Yq, [Yq + 2*count]
+    add      s_mq, count
+    add   q_filtq, count
+    shl    noiseq, 3
+    pxor       m5, m5
+    neg    count
+.loop:
+    mova       m1, [q_filtq + count]
+    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
+    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
+    add    noiseq, 2*mmsize
+    and    noiseq, 0x1ff<<3
+    punpckhdq  m2, m1, m1
+    punpckldq  m1, m1
+    mulps      m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+    mova       m3, [s_mq + count]
+    ; TODO: replace by a vpermd in AVX2
+    punpckhdq  m4, m3, m3
+    punpckldq  m3, m3
+    pcmpeqd    m6, m3, m5 ; m6 == 0
+    pcmpeqd    m7, m4, m5 ; m7 == 0
+    mulps      m3, m0 ; s_m[m] * phi_sign
+    mulps      m4, m0 ; s_m[m] * phi_sign
+    pand       m1, m6
+    pand       m2, m7
+    movu       m6, [Yq + 2*count]
+    movu       m7, [Yq + 2*count + mmsize]
+    addps      m3, m1
+    addps      m4, m2
+    addps      m6, m3
+    addps      m7, m4
+    movu    [Yq + 2*count], m6
+    movu    [Yq + 2*count + mmsize], m7
+    add    count, mmsize
+    jl      .loop
+    RET
+
+INIT_XMM sse
+cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
+%define COUNT  32*4
+%define OFFSET 32*4
+    mov        cq, -COUNT
+    lea     vrevq, [vq + OFFSET + COUNT]
+    add        vq, OFFSET-mmsize
+    add      srcq, 2*COUNT
+    mova       m3, [ps_neg]
+.loop:
+    mova       m0, [srcq + 2*cq + 0*mmsize]
+    mova       m1, [srcq + 2*cq + 1*mmsize]
+    shufps     m2, m0, m1, q2020
+    shufps     m1, m0, q1313
+    xorps      m2, m3
+    mova     [vq], m1
+    mova  [vrevq + cq], m2
+    sub        vq, mmsize
+    add        cq, mmsize
+    jl      .loop
+    REP_RET
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 9600852163..a2aca742cf 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -2,20 +2,20 @@
  * AAC Spectral Band Replication decoding functions
  * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,9 +34,25 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
                        float bw, int start, int end);
 void ff_sbr_neg_odd_64_sse(float *z);
 void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
 void ff_sbr_qmf_pre_shuffle_sse2(float *z);
 
+void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
+                                  const float *q_filt, int noise,
+                                  int kx, int m_max);
+
+void ff_sbr_qmf_deint_neg_sse(float *v, const float *src);
+
 av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -48,10 +64,16 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
         s->hf_g_filt  = ff_sbr_hf_g_filt_sse;
         s->hf_gen     = ff_sbr_hf_gen_sse;
         s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
+        s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse;
+        s->qmf_deint_neg    = ff_sbr_qmf_deint_neg_sse;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->qmf_deint_bfly   = ff_sbr_qmf_deint_bfly_sse2;
         s->qmf_pre_shuffle  = ff_sbr_qmf_pre_shuffle_sse2;
+        s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
+        s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
+        s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
+        s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
     }
 }
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index 71763dbf75..1d46212a13 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -3,24 +3,23 @@
  *
  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 
@@ -86,7 +85,7 @@ DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
 
 static inline void idct(int16_t *block)
 {
-        DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+        LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
         int16_t * const temp= (int16_t*)align_tmp;
 
         __asm__ volatile(
@@ -1148,6 +1147,7 @@ Temp
 
 "9: \n\t"
                 :: "r" (block), "r" (temp), "r" (coeffs)
+                   NAMED_CONSTRAINTS_ADD(wm1010,d40000)
                 : "%eax"
         );
 }
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 4fc29141b5..4a98732503 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/snowdsp.c b/libavcodec/x86/snowdsp.c
new file mode 100644
index 0000000000..2778489026
--- /dev/null
+++ b/libavcodec/x86/snowdsp.c
@@ -0,0 +1,908 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+#include "libavcodec/snow_dwt.h"
+
+#if HAVE_INLINE_ASM
+
+static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
+        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
+        // The savings in code and time are well worth having to store this value and
+        // calculate b[0] correctly afterwards.
+
+        i = 0;
+        __asm__ volatile(
+            "pcmpeqd   %%xmm7, %%xmm7         \n\t"
+            "pcmpeqd   %%xmm3, %%xmm3         \n\t"
+            "psllw         $1, %%xmm3         \n\t"
+            "paddw     %%xmm7, %%xmm3         \n\t"
+            "psllw        $13, %%xmm3         \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "paddw  %%xmm7, %%xmm2        \n\t"
+                "paddw  %%xmm7, %%xmm6        \n\t"
+                "pmulhw %%xmm3, %%xmm2        \n\t"
+                "pmulhw %%xmm3, %%xmm6        \n\t"
+                "paddw    (%0), %%xmm2        \n\t"
+                "paddw  16(%0), %%xmm6        \n\t"
+                "movdqa %%xmm2, (%0)          \n\t"
+                "movdqa %%xmm6, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+            dst[i] = dst[i] - (b[i] + b[i + 1]);
+        }
+        for(; i<w_r-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm1        \n\t"
+                "movdqu 16(%1), %%xmm5        \n\t"
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw  %%xmm1, %%xmm2        \n\t"
+                "paddw  %%xmm5, %%xmm6        \n\t"
+                "movdqa   (%0), %%xmm0        \n\t"
+                "movdqa 16(%0), %%xmm4        \n\t"
+                "psubw  %%xmm2, %%xmm0        \n\t"
+                "psubw  %%xmm6, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+        IDWTELEM b_0 = b[0];
+
+        i = 0;
+        __asm__ volatile(
+            "psllw         $15, %%xmm7        \n\t"
+            "pcmpeqw    %%xmm6, %%xmm6        \n\t"
+            "psrlw         $13, %%xmm6        \n\t"
+            "paddw      %%xmm7, %%xmm6        \n\t"
+        ::);
+        for(; i<w_l-15; i+=16){
+            __asm__ volatile(
+                "movdqu   (%1), %%xmm0        \n\t"
+                "movdqu 16(%1), %%xmm4        \n\t"
+                "movdqu  2(%1), %%xmm1        \n\t"
+                "movdqu 18(%1), %%xmm5        \n\t" //FIXME try aligned reads and shifts
+                "paddw  %%xmm6, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "paddw  %%xmm7, %%xmm1        \n\t"
+                "paddw  %%xmm7, %%xmm5        \n\t"
+                "pavgw  %%xmm1, %%xmm0        \n\t"
+                "pavgw  %%xmm5, %%xmm4        \n\t"
+                "psubw  %%xmm7, %%xmm0        \n\t"
+                "psubw  %%xmm7, %%xmm4        \n\t"
+                "psraw      $1, %%xmm0        \n\t"
+                "psraw      $1, %%xmm4        \n\t"
+                "movdqa   (%0), %%xmm1        \n\t"
+                "movdqa 16(%0), %%xmm5        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "psraw      $2, %%xmm0        \n\t"
+                "psraw      $2, %%xmm4        \n\t"
+                "paddw  %%xmm1, %%xmm0        \n\t"
+                "paddw  %%xmm5, %%xmm4        \n\t"
+                "movdqa %%xmm0, (%0)          \n\t"
+                "movdqa %%xmm4, 16(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                : "memory"
+            );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+
+        i = 0;
+        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+        }
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movdqu  2(%1), %%xmm2        \n\t"
+                "movdqu 18(%1), %%xmm6        \n\t"
+                "paddw    (%1), %%xmm2        \n\t"
+                "paddw  16(%1), %%xmm6        \n\t"
+                "movdqu   (%0), %%xmm0        \n\t"
+                "movdqu 16(%0), %%xmm4        \n\t"
+                "paddw  %%xmm2, %%xmm0        \n\t"
+                "paddw  %%xmm6, %%xmm4        \n\t"
+                "psraw      $1, %%xmm2        \n\t"
+                "psraw      $1, %%xmm6        \n\t"
+                "paddw  %%xmm0, %%xmm2        \n\t"
+                "paddw  %%xmm4, %%xmm6        \n\t"
+                "movdqa %%xmm2, (%2)          \n\t"
+                "movdqa %%xmm6, 16(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x3E) != 0x3E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=62; i>=0; i-=64){
+            __asm__ volatile(
+                "movdqa      (%1), %%xmm0       \n\t"
+                "movdqa    16(%1), %%xmm2       \n\t"
+                "movdqa    32(%1), %%xmm4       \n\t"
+                "movdqa    48(%1), %%xmm6       \n\t"
+                "movdqa      (%1), %%xmm1       \n\t"
+                "movdqa    16(%1), %%xmm3       \n\t"
+                "movdqa    32(%1), %%xmm5       \n\t"
+                "movdqa    48(%1), %%xmm7       \n\t"
+                "punpcklwd   (%2), %%xmm0       \n\t"
+                "punpcklwd 16(%2), %%xmm2       \n\t"
+                "punpcklwd 32(%2), %%xmm4       \n\t"
+                "punpcklwd 48(%2), %%xmm6       \n\t"
+                "movdqa    %%xmm0, (%0)         \n\t"
+                "movdqa    %%xmm2, 32(%0)       \n\t"
+                "movdqa    %%xmm4, 64(%0)       \n\t"
+                "movdqa    %%xmm6, 96(%0)       \n\t"
+                "punpckhwd   (%2), %%xmm1       \n\t"
+                "punpckhwd 16(%2), %%xmm3       \n\t"
+                "punpckhwd 32(%2), %%xmm5       \n\t"
+                "punpckhwd 48(%2), %%xmm7       \n\t"
+                "movdqa    %%xmm1, 16(%0)       \n\t"
+                "movdqa    %%xmm3, 48(%0)       \n\t"
+                "movdqa    %%xmm5, 80(%0)       \n\t"
+                "movdqa    %%xmm7, 112(%0)      \n\t"
+                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
+    const int w2= (width+1)>>1;
+    const int w_l= (width>>1);
+    const int w_r= w2 - 1;
+    int i;
+
+    { // Lift 0
+        IDWTELEM * const ref = b + w2 - 1;
+
+        i = 1;
+        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+        __asm__ volatile(
+            "pcmpeqw    %%mm7, %%mm7         \n\t"
+            "pcmpeqw    %%mm3, %%mm3         \n\t"
+            "psllw         $1, %%mm3         \n\t"
+            "paddw      %%mm7, %%mm3         \n\t"
+            "psllw        $13, %%mm3         \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "paddw   %%mm7, %%mm2        \n\t"
+                "paddw   %%mm7, %%mm6        \n\t"
+                "pmulhw  %%mm3, %%mm2        \n\t"
+                "pmulhw  %%mm3, %%mm6        \n\t"
+                "paddw    (%0), %%mm2        \n\t"
+                "paddw   8(%0), %%mm6        \n\t"
+                "movq    %%mm2, (%0)         \n\t"
+                "movq    %%mm6, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+    }
+
+    { // Lift 1
+        IDWTELEM * const dst = b+w2;
+
+        i = 0;
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm2        \n\t"
+                "movq    8(%1), %%mm6        \n\t"
+                "paddw   2(%1), %%mm2        \n\t"
+                "paddw  10(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "psubw   %%mm2, %%mm0        \n\t"
+                "psubw   %%mm6, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&dst[i]), "r"(&b[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+    }
+
+    { // Lift 2
+        IDWTELEM * const ref = b+w2 - 1;
+
+        i = 1;
+        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+        __asm__ volatile(
+            "psllw         $15, %%mm7        \n\t"
+            "pcmpeqw     %%mm6, %%mm6        \n\t"
+            "psrlw         $13, %%mm6        \n\t"
+            "paddw       %%mm7, %%mm6        \n\t"
+           ::);
+        for(; i<w_l-7; i+=8){
+            __asm__ volatile(
+                "movq     (%1), %%mm0        \n\t"
+                "movq    8(%1), %%mm4        \n\t"
+                "movq    2(%1), %%mm1        \n\t"
+                "movq   10(%1), %%mm5        \n\t"
+                "paddw   %%mm6, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "paddw   %%mm7, %%mm1        \n\t"
+                "paddw   %%mm7, %%mm5        \n\t"
+                "pavgw   %%mm1, %%mm0        \n\t"
+                "pavgw   %%mm5, %%mm4        \n\t"
+                "psubw   %%mm7, %%mm0        \n\t"
+                "psubw   %%mm7, %%mm4        \n\t"
+                "psraw      $1, %%mm0        \n\t"
+                "psraw      $1, %%mm4        \n\t"
+                "movq     (%0), %%mm1        \n\t"
+                "movq    8(%0), %%mm5        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "psraw      $2, %%mm0        \n\t"
+                "psraw      $2, %%mm4        \n\t"
+                "paddw   %%mm1, %%mm0        \n\t"
+                "paddw   %%mm5, %%mm4        \n\t"
+                "movq    %%mm0, (%0)         \n\t"
+                "movq    %%mm4, 8(%0)        \n\t"
+                :: "r"(&b[i]), "r"(&ref[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+    }
+
+    { // Lift 3
+        IDWTELEM * const src = b+w2;
+        i = 0;
+
+        for(; i<w_r-7; i+=8){
+            __asm__ volatile(
+                "movq    2(%1), %%mm2        \n\t"
+                "movq   10(%1), %%mm6        \n\t"
+                "paddw    (%1), %%mm2        \n\t"
+                "paddw   8(%1), %%mm6        \n\t"
+                "movq     (%0), %%mm0        \n\t"
+                "movq    8(%0), %%mm4        \n\t"
+                "paddw   %%mm2, %%mm0        \n\t"
+                "paddw   %%mm6, %%mm4        \n\t"
+                "psraw      $1, %%mm2        \n\t"
+                "psraw      $1, %%mm6        \n\t"
+                "paddw   %%mm0, %%mm2        \n\t"
+                "paddw   %%mm4, %%mm6        \n\t"
+                "movq    %%mm2, (%2)         \n\t"
+                "movq    %%mm6, 8(%2)        \n\t"
+                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+                 : "memory"
+               );
+        }
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+    }
+
+    {
+        snow_interleave_line_header(&i, width, b, temp);
+
+        for (; (i & 0x1E) != 0x1E; i-=2){
+            b[i+1] = temp[i>>1];
+            b[i] = b[i>>1];
+        }
+        for (i-=30; i>=0; i-=32){
+            __asm__ volatile(
+                "movq        (%1), %%mm0       \n\t"
+                "movq       8(%1), %%mm2       \n\t"
+                "movq      16(%1), %%mm4       \n\t"
+                "movq      24(%1), %%mm6       \n\t"
+                "movq        (%1), %%mm1       \n\t"
+                "movq       8(%1), %%mm3       \n\t"
+                "movq      16(%1), %%mm5       \n\t"
+                "movq      24(%1), %%mm7       \n\t"
+                "punpcklwd   (%2), %%mm0       \n\t"
+                "punpcklwd  8(%2), %%mm2       \n\t"
+                "punpcklwd 16(%2), %%mm4       \n\t"
+                "punpcklwd 24(%2), %%mm6       \n\t"
+                "movq       %%mm0, (%0)        \n\t"
+                "movq       %%mm2, 16(%0)      \n\t"
+                "movq       %%mm4, 32(%0)      \n\t"
+                "movq       %%mm6, 48(%0)      \n\t"
+                "punpckhwd   (%2), %%mm1       \n\t"
+                "punpckhwd  8(%2), %%mm3       \n\t"
+                "punpckhwd 16(%2), %%mm5       \n\t"
+                "punpckhwd 24(%2), %%mm7       \n\t"
+                "movq       %%mm1, 8(%0)       \n\t"
+                "movq       %%mm3, 24(%0)      \n\t"
+                "movq       %%mm5, 40(%0)      \n\t"
+                "movq       %%mm7, 56(%0)      \n\t"
+                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+                 : "memory"
+               );
+        }
+    }
+}
+
+#if HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"REG_d"), %%"t0"      \n\t"\
+        ""op" 16("r",%%"REG_d"), %%"t1"    \n\t"\
+        ""op" 32("r",%%"REG_d"), %%"t2"    \n\t"\
+        ""op" 48("r",%%"REG_d"), %%"t3"    \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "psubw %%"s0", %%"t0" \n\t"\
+        "psubw %%"s1", %%"t1" \n\t"\
+        "psubw %%"s2", %%"t2" \n\t"\
+        "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+        "movdqa %%"s0", ("w",%%"REG_d")      \n\t"\
+        "movdqa %%"s1", 16("w",%%"REG_d")    \n\t"\
+        "movdqa %%"s2", 32("w",%%"REG_d")    \n\t"\
+        "movdqa %%"s3", 48("w",%%"REG_d")    \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+        "psraw $"n", %%"t0" \n\t"\
+        "psraw $"n", %%"t1" \n\t"\
+        "psraw $"n", %%"t2" \n\t"\
+        "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "paddw %%"s0", %%"t0" \n\t"\
+        "paddw %%"s1", %%"t1" \n\t"\
+        "paddw %%"s2", %%"t2" \n\t"\
+        "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "pmulhw %%"s0", %%"t0" \n\t"\
+        "pmulhw %%"s1", %%"t1" \n\t"\
+        "pmulhw %%"s2", %%"t2" \n\t"\
+        "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movdqa %%"s0", %%"t0" \n\t"\
+        "movdqa %%"s1", %%"t1" \n\t"\
+        "movdqa %%"s2", %%"t2" \n\t"\
+        "movdqa %%"s3", %%"t3" \n\t"
+
+static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+
+    while(i & 0x1F)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i;
+
+         __asm__ volatile (
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+        "pcmpeqw    %%xmm0, %%xmm0                   \n\t"
+        "pcmpeqw    %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm2, %%xmm2                   \n\t"
+        "paddw      %%xmm0, %%xmm2                   \n\t"
+        "psllw         $13, %%xmm2                   \n\t"
+        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+        "pcmpeqw %%xmm7, %%xmm7                      \n\t"
+        "pcmpeqw %%xmm5, %%xmm5                      \n\t"
+        "psllw $15, %%xmm7                           \n\t"
+        "psrlw $13, %%xmm5                           \n\t"
+        "paddw %%xmm7, %%xmm5                        \n\t"
+        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+        "movq   (%2,%%"REG_d"), %%xmm1        \n\t"
+        "movq  8(%2,%%"REG_d"), %%xmm3        \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm0                        \n\t"
+        "pavgw %%xmm3, %%xmm2                        \n\t"
+        "movq 16(%2,%%"REG_d"), %%xmm1        \n\t"
+        "movq 24(%2,%%"REG_d"), %%xmm3        \n\t"
+        "paddw %%xmm7, %%xmm1                        \n\t"
+        "paddw %%xmm7, %%xmm3                        \n\t"
+        "pavgw %%xmm1, %%xmm4                        \n\t"
+        "pavgw %%xmm3, %%xmm6                        \n\t"
+        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+        "2:                                          \n\t"
+        "sub $64, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+        ""op" ("r",%%"REG_d"), %%"t0"   \n\t"\
+        ""op" 8("r",%%"REG_d"), %%"t1"  \n\t"\
+        ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
+        ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+        "movq %%"s0", ("w",%%"REG_d")   \n\t"\
+        "movq %%"s1", 8("w",%%"REG_d")  \n\t"\
+        "movq %%"s2", 16("w",%%"REG_d") \n\t"\
+        "movq %%"s3", 24("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+        "movq %%"s0", %%"t0" \n\t"\
+        "movq %%"s1", %%"t1" \n\t"\
+        "movq %%"s2", %%"t2" \n\t"\
+        "movq %%"s3", %%"t3" \n\t"
+
+
+static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+    x86_reg i = width;
+    while(i & 15)
+    {
+        i--;
+        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+    }
+    i+=i;
+    __asm__ volatile(
+        "jmp 2f                                      \n\t"
+        "1:                                          \n\t"
+
+        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+        "pcmpeqw    %%mm0, %%mm0                     \n\t"
+        "pcmpeqw    %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm2, %%mm2                     \n\t"
+        "paddw      %%mm0, %%mm2                     \n\t"
+        "psllw        $13, %%mm2                     \n\t"
+        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+        "pcmpeqw %%mm7, %%mm7                        \n\t"
+        "pcmpeqw %%mm5, %%mm5                        \n\t"
+        "psllw $15, %%mm7                            \n\t"
+        "psrlw $13, %%mm5                            \n\t"
+        "paddw %%mm7, %%mm5                          \n\t"
+        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+        "movq   (%2,%%"REG_d"), %%mm1         \n\t"
+        "movq  8(%2,%%"REG_d"), %%mm3         \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm0                          \n\t"
+        "pavgw %%mm3, %%mm2                          \n\t"
+        "movq 16(%2,%%"REG_d"), %%mm1         \n\t"
+        "movq 24(%2,%%"REG_d"), %%mm3         \n\t"
+        "paddw %%mm7, %%mm1                          \n\t"
+        "paddw %%mm7, %%mm3                          \n\t"
+        "pavgw %%mm1, %%mm4                          \n\t"
+        "pavgw %%mm3, %%mm6                          \n\t"
+        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+        "2:                                          \n\t"
+        "sub $32, %%"REG_d"                          \n\t"
+        "jge 1b                                      \n\t"
+        :"+d"(i)
+        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#if HAVE_6REGS
+#define snow_inner_add_yblock_sse2_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"REG_c"             \n\t"\
+             "mov  %6, %2                    \n\t"\
+             "mov  %4, %%"REG_S"             \n\t"\
+             "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
+             "pcmpeqd %%xmm3, %%xmm3         \n\t"\
+             "psllw $15, %%xmm3              \n\t"\
+             "psrlw $12, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"REG_D"              \n\t"\
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
+             "add %3, %%"REG_D"              \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+             "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+             "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+             "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+             "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+             "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
+             "punpcklbw %%xmm7, %%xmm0       \n\t"\
+             "punpcklbw %%xmm7, %%xmm4       \n\t"\
+             "pmullw %%xmm0, %%"out_reg1"    \n\t"\
+             "pmullw %%xmm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+             "paddusw %%xmm2, %%xmm1         \n\t"\
+             "paddusw %%xmm6, %%xmm5         \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+             "add $32, %%"REG_S"             \n\t"\
+             "add %%"REG_c", %0              \n\t"\
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+             "add %%"REG_c", (%%"REG_a")     \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+             XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+             "sal $1, %%"REG_c"              \n\t"\
+             "add"OPSIZE" $"PTR_SIZE"*2, %1  \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "sar $1, %%"REG_c"              \n\t"\
+             "sub $2, %2                     \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+             "add"OPSIZE" $"PTR_SIZE"*1, %1  \n\t"\
+             snow_inner_add_yblock_sse2_end_common1\
+             "dec %2                         \n\t"\
+             snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+             "mov %0, %%"REG_d"              \n\t"
+             "movdqa (%%"REG_D"), %%xmm0     \n\t"
+             "movdqa %%xmm1, %%xmm2          \n\t"
+
+             "punpckhwd %%xmm7, %%xmm1       \n\t"
+             "punpcklwd %%xmm7, %%xmm2       \n\t"
+             "paddd %%xmm2, %%xmm0           \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
+             "paddd %%xmm1, %%xmm2           \n\t"
+             "paddd %%xmm3, %%xmm0           \n\t"
+             "paddd %%xmm3, %%xmm2           \n\t"
+
+             "mov %1, %%"REG_D"              \n\t"
+             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+             "add %3, %%"REG_D"              \n\t"
+
+             "movdqa (%%"REG_D"), %%xmm4     \n\t"
+             "movdqa %%xmm5, %%xmm6          \n\t"
+             "punpckhwd %%xmm7, %%xmm5       \n\t"
+             "punpcklwd %%xmm7, %%xmm6       \n\t"
+             "paddd %%xmm6, %%xmm4           \n\t"
+             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
+             "paddd %%xmm5, %%xmm6           \n\t"
+             "paddd %%xmm3, %%xmm4           \n\t"
+             "paddd %%xmm3, %%xmm6           \n\t"
+
+             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm2, %%xmm0        \n\t"
+             "packuswb %%xmm7, %%xmm0        \n\t"
+             "movq %%xmm0, (%%"REG_d")       \n\t"
+
+             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
+             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
+             "packssdw %%xmm6, %%xmm4        \n\t"
+             "packuswb %%xmm7, %%xmm4        \n\t"
+             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+             "mov %0, %%"REG_d"              \n\t"
+             "psrlw $4, %%xmm1               \n\t"
+             "psrlw $4, %%xmm5               \n\t"
+             "paddw   (%%"REG_D"), %%xmm1    \n\t"
+             "paddw 16(%%"REG_D"), %%xmm5    \n\t"
+             "paddw %%xmm3, %%xmm1           \n\t"
+             "paddw %%xmm3, %%xmm5           \n\t"
+             "psraw $4, %%xmm1               \n\t" /* FRAC_BITS. */
+             "psraw $4, %%xmm5               \n\t" /* FRAC_BITS. */
+             "packuswb %%xmm5, %%xmm1        \n\t"
+
+             "movdqu %%xmm1, (%%"REG_d")       \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
+
+#define snow_inner_add_yblock_mmx_header \
+    IDWTELEM * * dst_array = sb->line + src_y;\
+    x86_reg tmp;\
+    __asm__ volatile(\
+             "mov  %7, %%"REG_c"             \n\t"\
+             "mov  %6, %2                    \n\t"\
+             "mov  %4, %%"REG_S"             \n\t"\
+             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
+             "pcmpeqd %%mm3, %%mm3           \n\t"\
+             "psllw $15, %%mm3               \n\t"\
+             "psrlw $12, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
+             "1:                             \n\t"\
+             "mov %1, %%"REG_D"              \n\t"\
+             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
+             "add %3, %%"REG_D"              \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
+             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
+             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
+             "punpcklbw %%mm7, %%mm0       \n\t"\
+             "punpcklbw %%mm7, %%mm4       \n\t"\
+             "pmullw %%mm0, %%"out_reg1"    \n\t"\
+             "pmullw %%mm4, %%"out_reg2"    \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+             "paddusw %%mm2, %%mm1         \n\t"\
+             "paddusw %%mm6, %%mm5         \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+             "mov %0, %%"REG_d"              \n\t"\
+             "psrlw $4, %%mm1                \n\t"\
+             "psrlw $4, %%mm5                \n\t"\
+             "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
+             "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
+             "paddw %%mm3, %%mm1             \n\t"\
+             "paddw %%mm3, %%mm5             \n\t"\
+             "psraw $4, %%mm1                \n\t"\
+             "psraw $4, %%mm5                \n\t"\
+             "packuswb %%mm5, %%mm1          \n\t"\
+             "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+             "add $"s_step", %%"REG_S"             \n\t"\
+             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+             "add %%"REG_c", (%%"REG_a")     \n\t"\
+             "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
+             "add %%"REG_c", %0              \n\t"\
+             "dec %2                         \n\t"\
+             "jnz 1b                         \n\t"\
+             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+             :\
+             "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
+             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+                      int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16) {
+        if (!(b_h & 1))
+            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+        else
+            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    } else
+         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+    if (b_w == 16)
+        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else if (b_w == 8 && obmc_stride == 16)
+        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+    else
+        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+#endif /* HAVE_6REGS */
+
+#endif /* HAVE_INLINE_ASM */
+
+void ff_dwt_init_x86(SnowDWTContext *c)
+{
+#if HAVE_INLINE_ASM
+    int mm_flags = av_get_cpu_flags();
+
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+        if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#if HAVE_7REGS
+            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+#if HAVE_6REGS
+            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+#endif
+        }
+        else{
+            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+            c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#if HAVE_7REGS
+            c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+            }
+#if HAVE_6REGS
+            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+#endif
+        }
+    }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/x86/svq1enc.asm b/libavcodec/x86/svq1enc.asm
new file mode 100644
index 0000000000..24ee70f108
--- /dev/null
+++ b/libavcodec/x86/svq1enc.asm
@@ -0,0 +1,61 @@
+;******************************************************************************
+;* SIMD-optimized SVQ1 encoder functions
+;* Copyright (c) 2007 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SSD_INT8_VS_INT16 0
+cglobal ssd_int8_vs_int16, 3, 3, 3, pix1, pix2, size
+    pxor m0, m0
+.loop
+    sub       sizeq, 8
+    movq      m1, [pix1q + sizeq]
+    mova      m2, [pix2q + sizeq*2]
+%if mmsize == 8
+    movq      m3, [pix2q + sizeq*2 + mmsize]
+    punpckhbw m4, m1
+    punpcklbw m1, m1
+    psraw     m4, 8
+    psraw     m1, 8
+    psubw     m3, m4
+    psubw     m2, m1
+    pmaddwd   m3, m3
+    pmaddwd   m2, m2
+    paddd     m0, m3
+    paddd     m0, m2
+%else
+    punpcklbw m1, m1
+    psraw     m1, 8
+    psubw     m2, m1
+    pmaddwd   m2, m2
+    paddd     m0, m2
+%endif
+    jg .loop
+    HADDD     m0, m1
+    movd     eax, m0
+    RET
+%endmacro
+
+INIT_MMX mmx
+SSD_INT8_VS_INT16
+INIT_XMM sse2
+SSD_INT8_VS_INT16
diff --git a/libavcodec/x86/svq1enc.c b/libavcodec/x86/svq1enc.c
deleted file mode 100644
index 02b0a84b8c..0000000000
--- a/libavcodec/x86/svq1enc.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/x86/asm.h"
-#include "libavutil/x86/cpu.h"
-#include "libavcodec/svq1enc.h"
-
-#if HAVE_INLINE_ASM
-
-static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
-                                 int size)
-{
-    int sum;
-    x86_reg i = size;
-
-    __asm__ volatile (
-        "pxor %%mm4, %%mm4 \n"
-        "1: \n"
-        "sub $8, %0 \n"
-        "movq (%2, %0), %%mm2 \n"
-        "movq (%3, %0, 2), %%mm0 \n"
-        "movq 8(%3, %0, 2), %%mm1 \n"
-        "punpckhbw %%mm2, %%mm3 \n"
-        "punpcklbw %%mm2, %%mm2 \n"
-        "psraw $8, %%mm3 \n"
-        "psraw $8, %%mm2 \n"
-        "psubw %%mm3, %%mm1 \n"
-        "psubw %%mm2, %%mm0 \n"
-        "pmaddwd %%mm1, %%mm1 \n"
-        "pmaddwd %%mm0, %%mm0 \n"
-        "paddd %%mm1, %%mm4 \n"
-        "paddd %%mm0, %%mm4 \n"
-        "jg 1b \n"
-        "movq %%mm4, %%mm3 \n"
-        "psrlq $32, %%mm3 \n"
-        "paddd %%mm3, %%mm4 \n"
-        "movd %%mm4, %1 \n"
-        : "+r" (i), "=r" (sum)
-        : "r" (pix1), "r" (pix2));
-
-    return sum;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
-{
-#if HAVE_INLINE_ASM
-    int cpu_flags = av_get_cpu_flags();
-
-    if (INLINE_MMX(cpu_flags)) {
-        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
-    }
-#endif /* HAVE_INLINE_ASM */
-}
diff --git a/libavcodec/x86/svq1enc_init.c b/libavcodec/x86/svq1enc_init.c
new file mode 100644
index 0000000000..40b4b0e183
--- /dev/null
+++ b/libavcodec/x86/svq1enc_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/svq1enc.h"
+
+int ff_ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+                             intptr_t size);
+int ff_ssd_int8_vs_int16_sse2(const int8_t *pix1, const int16_t *pix2,
+                              intptr_t size);
+
+av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_mmx;
+    }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->ssd_int8_vs_int16 = ff_ssd_int8_vs_int16_sse2;
+    }
+}
diff --git a/libavcodec/x86/ttadsp.asm b/libavcodec/x86/ttadsp.asm
new file mode 100644
index 0000000000..8f489498a3
--- /dev/null
+++ b/libavcodec/x86/ttadsp.asm
@@ -0,0 +1,119 @@
+;******************************************************************************
+;* TTA DSP SIMD optimizations
+;*
+;* Copyright (C) 2014 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pd_n0113: dd ~0, ~1, ~1, ~3
+pd_1224:  dd 1, 2, 2, 4
+
+SECTION .text
+
+%macro TTA_FILTER 2
+INIT_XMM %1
+cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round
+    mova       m2, [qmq       ]
+    mova       m3, [qmq + 0x10]
+    mova       m4, [dxq       ]
+    mova       m5, [dxq + 0x10]
+
+    movd       m6, [errorq]         ; if (filter->error < 0) {
+    SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
+    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
+    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
+    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
+    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
+    mova       [qmq       ], m2     ; }
+    mova       [qmq + 0x10], m3     ;
+
+    mova       m0, [dlq       ]
+    mova       m1, [dlq + 0x10]
+
+%if cpuflag(sse4)
+    pmulld     m2, m0
+    pmulld     m3, m1
+%else
+    pshufd     m6, m0, 0xb1
+    pshufd     m7, m2, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m2, m0
+    pshufd     m2, m2, 0xd8
+    punpckldq  m2, m6
+
+    pshufd     m6, m1, 0xb1
+    pshufd     m7, m3, 0xb1
+    pmuludq    m6, m7
+    pshufd     m6, m6, 0xd8
+    pmuludq    m3, m1
+    pshufd     m3, m3, 0xd8
+    punpckldq  m3, m6
+%endif
+    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
+    paddd      m2, m3               ; int sum = filter->round +
+                                    ;           filter->dl[0] * filter->qm[0] +
+    pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
+    paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
+                                    ;           filter->dl[3] * filter->qm[3] +
+    movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
+    paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
+    pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
+    paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];
+
+    palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
+                                    ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
+
+    palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
+                                    ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
+
+    psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
+    por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
+    pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
+                                    ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
+
+    mova       [dlq       ], m2
+    mova       [dxq       ], m5
+    mova       [dxq + 0x10], m4
+    movd       m0, [inq]            ; filter->error = *in;
+    movd       [errorq], m0         ;
+
+    movd       m2, shiftm           ; *in += (sum >> filter->shift);
+    psrad      m6, m2               ;
+    paddd      m0, m6               ;
+    movd       [inq], m0            ;
+
+    psrldq     m1, 4                ;
+    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
+    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
+    psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
+    psrldq     m1, m0, 4            ; filter->dl[7] = *in;
+    pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
+    paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
+    psrldq     m1, 4                ;
+    paddd      m0, m1               ;
+    mova       [dlq + 0x10], m0     ;
+    RET
+%endmacro
+
+TTA_FILTER ssse3, 8
+TTA_FILTER sse4,  7
diff --git a/libavcodec/x86/ttadsp_init.c b/libavcodec/x86/ttadsp_init.c
new file mode 100644
index 0000000000..47dc87f6af
--- /dev/null
+++ b/libavcodec/x86/ttadsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/ttadsp.h"
+#include "libavutil/x86/cpu.h"
+#include "config.h"
+
+void ff_ttafilter_process_dec_ssse3(int32_t *qm, int32_t *dx, int32_t *dl,
+                                    int32_t *error, int32_t *in, int32_t shift,
+                                    int32_t round);
+void ff_ttafilter_process_dec_sse4(int32_t *qm, int32_t *dx, int32_t *dl,
+                                   int32_t *error, int32_t *in, int32_t shift,
+                                   int32_t round);
+
+av_cold void ff_ttadsp_init_x86(TTADSPContext *c)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSSE3(cpu_flags))
+        c->ttafilter_process_dec = ff_ttafilter_process_dec_ssse3;
+    if (EXTERNAL_SSE4(cpu_flags))
+        c->ttafilter_process_dec = ff_ttafilter_process_dec_sse4;
+#endif
+}
diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c
new file mode 100644
index 0000000000..02c5eaa2c2
--- /dev/null
+++ b/libavcodec/x86/v210-init.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/v210dec.h"
+
+extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
+
+av_cold void v210_x86_init(V210DecContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+    if (s->aligned_input) {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
+    }
+    else {
+        if (cpu_flags & AV_CPU_FLAG_SSSE3)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
+
+        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
+            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
+    }
+#endif
+}
diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm
new file mode 100644
index 0000000000..400a1f3f9e
--- /dev/null
+++ b/libavcodec/x86/v210.asm
@@ -0,0 +1,90 @@
+;******************************************************************************
+;* V210 SIMD unpack
+;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+v210_mask: times 4 dd 0x3ff
+v210_mult: dw 64,4,64,4,64,4,64,4
+v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
+v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
+
+SECTION .text
+
+%macro v210_planar_unpack 1
+
+; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
+cglobal v210_planar_unpack_%1, 5, 5, 7
+    movsxdifnidn r4, r4d
+    lea    r1, [r1+2*r4]
+    add    r2, r4
+    add    r3, r4
+    neg    r4
+
+    mova   m3, [v210_mult]
+    mova   m4, [v210_mask]
+    mova   m5, [v210_luma_shuf]
+    mova   m6, [v210_chroma_shuf]
+.loop
+%ifidn %1, unaligned
+    movu   m0, [r0]
+%else
+    mova   m0, [r0]
+%endif
+
+    pmullw m1, m0, m3
+    psrld  m0, 10
+    psrlw  m1, 6  ; u0 v0 y1 y2 v1 u2 y4 y5
+    pand   m0, m4 ; y0 __ u1 __ y3 __ v2 __
+
+    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
+    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
+    movu   [r1+2*r4], m2
+
+    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
+    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
+    movq   [r2+r4], m1
+    movhps [r3+r4], m1
+
+    add r0, mmsize
+    add r4, 6
+    jl  .loop
+
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+v210_planar_unpack unaligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack unaligned
+%endif
+
+INIT_XMM ssse3
+v210_planar_unpack aligned
+
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+v210_planar_unpack aligned
+%endif
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index adf08d7d84..546688cf9d 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -2,20 +2,20 @@
 ;* VC1 deblocking optimizations
 ;* Copyright (c) 2009 David Conrad
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vc1dsp.h b/libavcodec/x86/vc1dsp.h
index 9b6c8ada26..fdd4de1813 100644
--- a/libavcodec/x86/vc1dsp.h
+++ b/libavcodec/x86/vc1dsp.h
@@ -1,20 +1,20 @@
 /*
  * VC-1 and WMV3 decoder - X86 DSP init functions
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index aff4b264e3..2bef5f5fb5 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -27,6 +27,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavcodec/vc1dsp.h"
 #include "fpel.h"
 #include "vc1dsp.h"
@@ -62,12 +63,17 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
     ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
     ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
 }
-
 static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int rnd)
 {
     ff_avg_pixels8_mmxext(dst, src, stride, 8);
 }
+static void avg_vc1_mspel_mc00_16_sse2(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+
 #endif /* HAVE_YASM */
 
 void ff_put_vc1_chroma_mc8_nornd_mmx  (uint8_t *dst, uint8_t *src,
@@ -86,10 +92,10 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMX(cpu_flags))
         ff_vc1dsp_init_mmx(dsp);
 
-    if (INLINE_MMXEXT(cpu_flags))
+    if (HAVE_6REGS && INLINE_MMXEXT(cpu_flags))
         ff_vc1dsp_init_mmxext(dsp);
 
 #define ASSIGN_LF(EXT) \
@@ -111,13 +117,14 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         ASSIGN_LF(mmxext);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
-        dsp->avg_vc1_mspel_pixels_tab[0]         = avg_vc1_mspel_mc00_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_mmxext;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
         dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
         dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_sse2;
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         ASSIGN_LF(ssse3);
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 046affbc26..77a8e359ff 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -25,7 +25,6 @@
  */
 
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -34,7 +33,7 @@
 #include "fpel.h"
 #include "vc1dsp.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_6REGS && HAVE_INLINE_ASM
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -111,6 +110,7 @@ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
         : "+r"(src), "+r"(dst)
         : "r"(stride), "r"(-2*stride),
           "m"(shift), "m"(rnd), "r"(9*stride-4)
+          NAMED_CONSTRAINTS_ADD(ff_pw_9)
         : "%"REG_c, "memory"
     );
 }
@@ -155,6 +155,7 @@ static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
         "jnz 1b                            \n\t"\
         : "+r"(h), "+r" (src),  "+r" (dst)\
         : "r"(stride), "m"(rnd)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
         : "memory"\
     );\
 }
@@ -213,6 +214,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
         : "+r"(src),  "+r"(dst)\
         : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
           "g"(stride-offset)\
+          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
         : "%"REG_c, "memory"\
     );\
 }
@@ -315,6 +317,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(src_stride), "r"(3*src_stride),                           \
           "m"(rnd), "m"(shift)                                          \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -352,6 +355,7 @@ OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(stride), "m"(rnd)                                         \
+          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128)    \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -387,6 +391,7 @@ OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
         "jnz 1b                    \n\t"                                \
         : "+r"(h), "+r" (src),  "+r" (dst)                              \
         : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
+          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)              \
         : "memory"                                                      \
     );                                                                  \
 }
@@ -457,6 +462,15 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
 \
     /* Horizontal mode with no vertical mode */\
     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
+} \
+static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
+                                  int stride, int hmode, int vmode, int rnd)\
+{ \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
+    dst += 8*stride; src += 8*stride; \
+    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
+    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
 }
 
 VC1_MSPEL_MC(put_)
@@ -477,6 +491,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,         \
                                                   int rnd)              \
 {                                                                       \
      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
+}\
+static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,         \
+                                                  const uint8_t *src,   \
+                                                  ptrdiff_t stride,     \
+                                                  int rnd)              \
+{                                                                       \
+     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
+}\
+static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,      \
+                                                     const uint8_t *src,\
+                                                     ptrdiff_t stride,  \
+                                                     int rnd)           \
+{                                                                       \
+     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
 }
 
 DECLARE_FUNCTION(0, 1)
@@ -700,59 +728,83 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
     );
 }
 
+#if HAVE_MMX_EXTERNAL
 static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int rnd)
 {
     ff_put_pixels8_mmx(dst, src, stride, 8);
 }
+static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride, int rnd)
+{
+    ff_put_pixels16_mmx(dst, src, stride, 16);
+}
+static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels8_mmx(dst, src, stride, 8);
+}
+static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_mmx(dst, src, stride, 16);
+}
+#endif
+
+#define FN_ASSIGN(OP, X, Y, INSN) \
+    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
+    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
 
 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
-    dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
-    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
-    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
-    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
-    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
-
-    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
-    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
-    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
-    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+#if HAVE_MMX_EXTERNAL
+    FN_ASSIGN(put_, 0, 0, _mmx);
+    FN_ASSIGN(avg_, 0, 0, _mmx);
+#endif
+    FN_ASSIGN(put_, 0, 1, _mmx);
+    FN_ASSIGN(put_, 0, 2, _mmx);
+    FN_ASSIGN(put_, 0, 3, _mmx);
+
+    FN_ASSIGN(put_, 1, 0, _mmx);
+    FN_ASSIGN(put_, 1, 1, _mmx);
+    FN_ASSIGN(put_, 1, 2, _mmx);
+    FN_ASSIGN(put_, 1, 3, _mmx);
+
+    FN_ASSIGN(put_, 2, 0, _mmx);
+    FN_ASSIGN(put_, 2, 1, _mmx);
+    FN_ASSIGN(put_, 2, 2, _mmx);
+    FN_ASSIGN(put_, 2, 3, _mmx);
+
+    FN_ASSIGN(put_, 3, 0, _mmx);
+    FN_ASSIGN(put_, 3, 1, _mmx);
+    FN_ASSIGN(put_, 3, 2, _mmx);
+    FN_ASSIGN(put_, 3, 3, _mmx);
 }
 
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
-    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
+    FN_ASSIGN(avg_, 0, 1, _mmxext);
+    FN_ASSIGN(avg_, 0, 2, _mmxext);
+    FN_ASSIGN(avg_, 0, 3, _mmxext);
 
-    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
+    FN_ASSIGN(avg_, 1, 0, _mmxext);
+    FN_ASSIGN(avg_, 1, 1, _mmxext);
+    FN_ASSIGN(avg_, 1, 2, _mmxext);
+    FN_ASSIGN(avg_, 1, 3, _mmxext);
 
-    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
+    FN_ASSIGN(avg_, 2, 0, _mmxext);
+    FN_ASSIGN(avg_, 2, 1, _mmxext);
+    FN_ASSIGN(avg_, 2, 2, _mmxext);
+    FN_ASSIGN(avg_, 2, 3, _mmxext);
 
-    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
-    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
+    FN_ASSIGN(avg_, 3, 0, _mmxext);
+    FN_ASSIGN(avg_, 3, 1, _mmxext);
+    FN_ASSIGN(avg_, 3, 2, _mmxext);
+    FN_ASSIGN(avg_, 3, 3, _mmxext);
 
     dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
     dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
     dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
 }
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 53b9e8292c..25d43640ab 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -2,20 +2,20 @@
 ;* Core video DSP functions
 ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -54,13 +54,13 @@ SECTION .text
 ; |    |    <- bottom is copied from last line in body of source
 ; '----' <- bh
 %if ARCH_X86_64
-cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                 start_y, end_y, bh, w
 %else ; x86-32
 cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
 %define src_strideq r3mp
-%define dst_strideq r2mp
-    mov            srcq, r1mp
+%define dst_strideq r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
@@ -97,7 +97,10 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
     neg        n_wordsq
     lea        start_xq, [start_xq+n_wordsq*2]
 .y_loop:                                        ; do {
-    ; FIXME also write a ssse3 version using pshufb
+%if cpuflag(avx2)
+    vpbroadcastb     m0, [dstq+start_xq]
+    mov              wq, n_wordsq               ;   initialize w
+%else
     movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
     imul             wd, 0x01010101             ;   w *= 0x01010101
     movd             m0, wd
@@ -107,6 +110,7 @@ cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
 %else ; mmx
     punpckldq        m0, m0                     ;   splat
 %endif ; mmx/sse
+%endif ; avx2
 .x_loop:                                        ;   do {
     movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
     add              wq, mmsize/2               ;     w -= $mmsize/2
@@ -127,6 +131,11 @@ hvar_fn
 INIT_XMM sse2
 hvar_fn
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+hvar_fn
+%endif
+
 ; macro to read/write a horizontal number of pixels (%2) to/from registers
 ; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
@@ -262,30 +271,30 @@ hvar_fn
 %rep 1+%2-%1
 %if %%n <= 3
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, val, bh
     mov             bhq, r6mp                   ; r6mp = bhmp
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
     mov            dstq, r0mp
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %else
 %if ARCH_X86_64
-cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                        start_y, end_y, bh
 %else ; x86-32
 cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
-    mov            srcq, r1mp
+    mov            srcq, r2mp
     mov        start_yq, r4mp
     mov          end_yq, r5mp
     mov             bhq, r6mp
-%define dst_strideq r2mp
+%define dst_strideq r1mp
 %define src_strideq r3mp
 %endif ; x86-64/32
 %endif
@@ -344,9 +353,8 @@ VERTICAL_EXTEND 16, 22
 ; obviously not the same on both sides.
 
 %macro READ_V_PIXEL 2
-%if %1 == 2
-    movzx          valw, byte %2
-    imul           valw, 0x0101
+%if cpuflag(avx2)
+    vpbroadcastb     m0, %2
 %else
     movzx          vald, byte %2
     imul           vald, 0x01010101
@@ -356,13 +364,16 @@ VERTICAL_EXTEND 16, 22
     pshufd           m0, m0, q0000
 %else
     punpckldq        m0, m0
-%endif
-%endif ; %1 >= 8
-%endif
+%endif ; mmsize == 16
+%endif ; %1 > 16
+%endif ; avx2
 %endmacro ; READ_V_PIXEL
 
 %macro WRITE_V_PIXEL 2
 %assign %%off 0
+
+%if %1 >= 8
+
 %rep %1/mmsize
     movu     [%2+%%off], m0
 %assign %%off %%off+mmsize
@@ -378,34 +389,44 @@ VERTICAL_EXTEND 16, 22
 %assign %%off %%off+8
 %endif
 %endif ; %1-%%off >= 8
-%endif
+%endif ; mmsize == 16
 
 %if %1-%%off >= 4
 %if %1 > 8 && %1-%%off > 4
     movq      [%2+%1-8], m0
 %assign %%off %1
-%elif %1 >= 8 && %1-%%off >= 4
-    movd     [%2+%%off], m0
-%assign %%off %%off+4
 %else
-    mov      [%2+%%off], vald
+    movd     [%2+%%off], m0
 %assign %%off %%off+4
 %endif
 %endif ; %1-%%off >= 4
 
-%if %1-%%off >= 2
-%if %1 >= 8
-    movd      [%2+%1-4], m0
+%else ; %1 < 8
+
+%rep %1/4
+    mov      [%2+%%off], vald
+%assign %%off %%off+4
+%endrep ; %1/4
+
+%endif ; %1 >=/< 8
+
+%if %1-%%off == 2
+%if cpuflag(avx2)
+    movd     [%2+%%off-2], m0
 %else
     mov      [%2+%%off], valw
-%endif
+%endif ; avx2
 %endif ; (%1-%%off)/2
 %endmacro ; WRITE_V_PIXEL
 
 %macro H_EXTEND 2
 %assign %%n %1
 %rep 1+(%2-%1)/2
+%if cpuflag(avx2)
+cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
+%else
 cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
+%endif
 .loop_y:                                        ; do {
     READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
     WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
@@ -426,6 +447,11 @@ H_EXTEND 16, 22
 INIT_XMM sse2
 H_EXTEND 16, 22
 
+%if HAVE_AVX2_EXTERNAL
+INIT_XMM avx2
+H_EXTEND 8, 22
+%endif
+
 %macro PREFETCH_FN 1
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop:
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index 8ee837096a..885cdf1d8c 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -1,25 +1,27 @@
 /*
+ * Copyright (C) 2002-2012 Michael Niedermayer
  * Copyright (C) 2012 Ronald S. Bultje
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
@@ -28,11 +30,11 @@
 #include "libavcodec/videodsp.h"
 
 #if HAVE_YASM
-typedef void emu_edge_vfix_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh);
-typedef void emu_edge_vvar_func(uint8_t *dst, const uint8_t *src,
-                                x86_reg dst_stride, x86_reg src_stride,
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
+                                const uint8_t *src, x86_reg src_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh,
                                 x86_reg w);
 
@@ -59,7 +61,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
 extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
 extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
 #if ARCH_X86_32
-static emu_edge_vfix_func *vfixtbl_mmx[22] = {
+static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
     &ff_emu_edge_vfix1_mmx,  &ff_emu_edge_vfix2_mmx,  &ff_emu_edge_vfix3_mmx,
     &ff_emu_edge_vfix4_mmx,  &ff_emu_edge_vfix5_mmx,  &ff_emu_edge_vfix6_mmx,
     &ff_emu_edge_vfix7_mmx,  &ff_emu_edge_vfix8_mmx,  &ff_emu_edge_vfix9_mmx,
@@ -78,7 +80,7 @@ extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
 extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func *vfixtbl_sse[22] = {
+static emu_edge_vfix_func * const vfixtbl_sse[22] = {
     ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
     ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
     ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
@@ -107,7 +109,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
 #if ARCH_X86_32
-static emu_edge_hfix_func *hfixtbl_mmx[11] = {
+static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
     ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
     ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
     ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
@@ -119,13 +121,30 @@ extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
-static emu_edge_hfix_func *hfixtbl_sse2[11] = {
+static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
     ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
     ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
     ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
     ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
 };
 extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
+#if HAVE_AVX2_EXTERNAL
+extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
+extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
+static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
+    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
+    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
+    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
+    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
+};
+extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
+#endif
 
 static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               ptrdiff_t dst_stride,
@@ -133,22 +152,24 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                               x86_reg block_w, x86_reg block_h,
                                               x86_reg src_x, x86_reg src_y,
                                               x86_reg w, x86_reg h,
-                                              emu_edge_vfix_func **vfix_tbl,
+                                              emu_edge_vfix_func * const *vfix_tbl,
                                               emu_edge_vvar_func *v_extend_var,
-                                              emu_edge_hfix_func **hfix_tbl,
+                                              emu_edge_hfix_func * const *hfix_tbl,
                                               emu_edge_hvar_func *h_extend_var)
 {
     x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
 
     if (!w || !h)
-         return;
+        return;
 
     if (src_y >= h) {
-        src  -= src_y * src_stride;
-        src_y = src_y_add = h - 1;
+        src -= src_y*src_stride;
+        src_y_add = h - 1;
+        src_y     = h - 1;
     } else if (src_y <= -block_h) {
-        src  -= src_y*src_stride;
-        src_y = src_y_add = 1 - block_h;
+        src -= src_y*src_stride;
+        src_y_add = 1 - block_h;
+        src_y     = 1 - block_h;
     }
     if (src_x >= w) {
         src   += w - 1 - src_x;
@@ -162,18 +183,17 @@ static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
     start_x = FFMAX(0, -src_x);
     end_y   = FFMIN(block_h, h-src_y);
     end_x   = FFMIN(block_w, w-src_x);
-    assert(start_x < end_x && block_w > 0);
-    assert(start_y < end_y && block_h > 0);
+    av_assert2(start_x < end_x && block_w > 0);
+    av_assert2(start_y < end_y && block_h > 0);
 
     // fill in the to-be-copied part plus all above/below
     src += (src_y_add + start_y) * src_stride + start_x;
     w = end_x - start_x;
     if (w <= 22) {
-        vfix_tbl[w - 1](dst + start_x, src,
-                        dst_stride, src_stride,
+        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                         start_y, end_y, block_h);
     } else {
-        v_extend_var(dst + start_x, src, dst_stride, src_stride,
+        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                      start_y, end_y, block_h, w);
     }
 
@@ -212,7 +232,7 @@ static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                      hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
 }
 
-static av_noinline void emulated_edge_mc_sse(uint8_t * buf,const uint8_t *src,
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
@@ -231,10 +251,24 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                               int src_x, int src_y, int w,
                                               int h)
 {
-    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, src_x,
-                     src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                      hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
 }
+
+#if HAVE_AVX2_EXTERNAL
+static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
+                                              ptrdiff_t buf_stride,
+                                              ptrdiff_t src_stride,
+                                              int block_w, int block_h,
+                                              int src_x, int src_y, int w,
+                                              int h)
+{
+    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
+                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
+}
+#endif /* HAVE_AVX2_EXTERNAL */
 #endif /* HAVE_YASM */
 
 void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
@@ -264,5 +298,10 @@ av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
     if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
         ctx->emulated_edge_mc = emulated_edge_mc_sse2;
     }
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
+        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
+    }
+#endif
 #endif /* HAVE_YASM */
 }
diff --git a/libavcodec/x86/vorbisdsp.asm b/libavcodec/x86/vorbisdsp.asm
index c54650eef5..b25d838868 100644
--- a/libavcodec/x86/vorbisdsp.asm
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -2,20 +2,20 @@
 ;* Vorbis x86 optimizations
 ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vorbisdsp_init.c b/libavcodec/x86/vorbisdsp_init.c
index bbd83195cc..bc1cc43a18 100644
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index fc8a047224..24496ae8cf 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -2,20 +2,20 @@
 ;* MMX/SSE2-optimized functions for the VP3 decoder
 ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index ed38a8e4df..cc3eba4e9a 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -1,18 +1,20 @@
 /*
- * This file is part of Libav.
+ * Copyright (c) 2009 David Conrad <lessen42@gmail.com>
  *
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -21,6 +23,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/vp3dsp.h"
 #include "config.h"
@@ -39,10 +42,70 @@ void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
 void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
                                  int *bounding_values);
 
+#if HAVE_MMX_INLINE
+
+#define MOVQ_BFE(regd)                                  \
+    __asm__ volatile (                                  \
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
+        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
+
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
+    "movq  "#rega", "#regr"             \n\t"                    \
+    "movq  "#regc", "#regp"             \n\t"                    \
+    "pand  "#regb", "#regr"             \n\t"                    \
+    "pand  "#regd", "#regp"             \n\t"                    \
+    "pxor  "#rega", "#regb"             \n\t"                    \
+    "pxor  "#regc", "#regd"             \n\t"                    \
+    "pand    %%mm6, "#regb"             \n\t"                    \
+    "pand    %%mm6, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regb"             \n\t"                    \
+    "psrlq      $1, "#regd"             \n\t"                    \
+    "paddb "#regb", "#regr"             \n\t"                    \
+    "paddb "#regd", "#regp"             \n\t"
+
+#if HAVE_6REGS
+static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
+{
+//    START_TIMER
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "1:                             \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   (%2), %%mm1             \n\t"
+        "movq   (%1,%4), %%mm2          \n\t"
+        "movq   (%2,%4), %%mm3          \n\t"
+        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%3)             \n\t"
+        "movq   %%mm5, (%3,%4)          \n\t"
+
+        "movq   (%1,%4,2), %%mm0        \n\t"
+        "movq   (%2,%4,2), %%mm1        \n\t"
+        "movq   (%1,%5), %%mm2          \n\t"
+        "movq   (%2,%5), %%mm3          \n\t"
+        "lea    (%1,%4,4), %1           \n\t"
+        "lea    (%2,%4,4), %2           \n\t"
+        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%3,%4,2)        \n\t"
+        "movq   %%mm5, (%3,%5)          \n\t"
+        "lea    (%3,%4,4), %3           \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
+        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
+        :"memory");
+//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
+}
+#endif /*HAVE_6REGS */
+#endif /* HAVE_MMX_INLINE */
+
 av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if HAVE_6REGS && HAVE_MMX_INLINE
+    c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
+#endif /* HAVE_6REGS && HAVE_MMX_INLINE */
+
 #if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->idct_put  = ff_vp3_idct_put_mmx;
diff --git a/libavcodec/x86/vp56_arith.h b/libavcodec/x86/vp56_arith.h
index 0a693684af..810cc8dcd8 100644
--- a/libavcodec/x86/vp56_arith.h
+++ b/libavcodec/x86/vp56_arith.h
@@ -4,49 +4,46 @@
  * Copyright (C) 2006  Aurelien Jacobs <aurel@gnuage.org>
  * Copyright (C) 2010  Eli Friedman
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #ifndef AVCODEC_X86_VP56_ARITH_H
 #define AVCODEC_X86_VP56_ARITH_H
 
-#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
+#if HAVE_INLINE_ASM && HAVE_FAST_CMOV && HAVE_6REGS
 #define vp56_rac_get_prob vp56_rac_get_prob
 static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
 {
     unsigned int code_word = vp56_rac_renorm(c);
-    unsigned int high = c->high;
-    unsigned int low = 1 + (((high - 1) * prob) >> 8);
+    unsigned int low = 1 + (((c->high - 1) * prob) >> 8);
     unsigned int low_shift = low << 16;
     int bit = 0;
+    c->code_word = code_word;
 
     __asm__(
         "subl  %4, %1      \n\t"
         "subl  %3, %2      \n\t"
-        "leal (%2, %3), %3 \n\t"
         "setae %b0         \n\t"
         "cmovb %4, %1      \n\t"
-        "cmovb %3, %2      \n\t"
-        : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
-        : "r"(low)
+        "cmovb %5, %2      \n\t"
+        : "+q"(bit), "+&r"(c->high), "+&r"(c->code_word)
+        : "r"(low_shift), "r"(low), "r"(code_word)
     );
 
-    c->high      = high;
-    c->code_word = code_word;
     return bit;
 }
 #endif
diff --git a/libavcodec/x86/vp6dsp.asm b/libavcodec/x86/vp6dsp.asm
index 80f8ca5f38..3d874ea62a 100644
--- a/libavcodec/x86/vp6dsp.asm
+++ b/libavcodec/x86/vp6dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
 ;* Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp6dsp_init.c b/libavcodec/x86/vp6dsp_init.c
index cd94f3e038..82baee7e97 100644
--- a/libavcodec/x86/vp6dsp_init.c
+++ b/libavcodec/x86/vp6dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
  * Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index adc9730dfa..538b3f4a9b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
@@ -143,13 +143,13 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
 
-pw_256:   times 8 dw 256
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
 
 cextern pw_3
 cextern pw_4
 cextern pw_64
+cextern pw_256
 
 SECTION .text
 
diff --git a/libavcodec/x86/vp8dsp_init.c b/libavcodec/x86/vp8dsp_init.c
index e5afd493bb..6668f91169 100644
--- a/libavcodec/x86/vp8dsp_init.c
+++ b/libavcodec/x86/vp8dsp_init.c
@@ -3,20 +3,20 @@
  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -347,7 +347,7 @@ av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
 
-    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         VP8_LUMA_MC_FUNC(0, 16, sse2);
         VP8_MC_FUNC(1, 8, sse2);
         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -417,7 +417,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
         c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
     }
 
-    if (EXTERNAL_SSE2(cpu_flags) && (cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
 
         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
@@ -430,7 +430,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
 
-        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+        c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
 
         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
@@ -455,7 +455,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
     }
 
     if (EXTERNAL_SSE4(cpu_flags)) {
-        c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;
+        c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;
 
         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
diff --git a/libavcodec/x86/vp8dsp_loopfilter.asm b/libavcodec/x86/vp8dsp_loopfilter.asm
index 5d792e8207..98bb6696a0 100644
--- a/libavcodec/x86/vp8dsp_loopfilter.asm
+++ b/libavcodec/x86/vp8dsp_loopfilter.asm
@@ -3,20 +3,20 @@
 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index ce58c08a3b..2790024194 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -1,187 +1,277 @@
 /*
  * VP9 SIMD optimizations
  *
- * Copyright (c) 2013 Ronald S. Bultje <rsbultje@gmail.com>
+ * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
 
 #if HAVE_YASM
 
-#define fpel_func(avg, sz, opt)                                         \
-void ff_ ## avg ## sz ## _ ## opt(uint8_t *dst, const uint8_t *src,     \
-                                  ptrdiff_t dst_stride,                 \
-                                  ptrdiff_t src_stride,                 \
-                                  int h, int mx, int my)
-
+#define fpel_func(avg, sz, opt) \
+void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                              const uint8_t *src, ptrdiff_t src_stride, \
+                              int h, int mx, int my)
 fpel_func(put,  4, mmx);
 fpel_func(put,  8, mmx);
 fpel_func(put, 16, sse);
 fpel_func(put, 32, sse);
 fpel_func(put, 64, sse);
-fpel_func(avg,  4, sse);
-fpel_func(avg,  8, sse);
+fpel_func(avg,  4, mmxext);
+fpel_func(avg,  8, mmxext);
 fpel_func(avg, 16, sse2);
 fpel_func(avg, 32, sse2);
 fpel_func(avg, 64, sse2);
+fpel_func(put, 32, avx);
+fpel_func(put, 64, avx);
+fpel_func(avg, 32, avx2);
+fpel_func(avg, 64, avx2);
 #undef fpel_func
 
-#define mc_func(avg, sz, dir, opt)                                          \
-void                                                                        \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const int8_t (*filter)[16])
-
-#define mc_funcs(sz)            \
-    mc_func(put, sz, h, ssse3); \
-    mc_func(avg, sz, h, ssse3); \
-    mc_func(put, sz, v, ssse3); \
-    mc_func(avg, sz, v, ssse3)
-
-mc_funcs(4);
-mc_funcs(8);
+#define mc_func(avg, sz, dir, opt) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                 const uint8_t *src, ptrdiff_t src_stride, \
+                                                 int h, const int8_t (*filter)[32])
+#define mc_funcs(sz, opt) \
+mc_func(put, sz, h, opt); \
+mc_func(avg, sz, h, opt); \
+mc_func(put, sz, v, opt); \
+mc_func(avg, sz, v, opt)
+
+mc_funcs(4, ssse3);
+mc_funcs(8, ssse3);
+#if ARCH_X86_64
+mc_funcs(16, ssse3);
+mc_funcs(32, avx2);
+#endif
 
 #undef mc_funcs
 #undef mc_func
 
-#define mc_rep_func(avg, sz, hsz, dir, opt)                                 \
-static av_always_inline void                                                \
-ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst,         \
-                                                      const uint8_t *src,   \
-                                                      ptrdiff_t dst_stride, \
-                                                      ptrdiff_t src_stride, \
-                                                      int h,                \
-                                                      const int8_t (*filter)[16]) \
-{                                                                           \
-    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst, src,        \
-                                                           dst_stride,      \
-                                                           src_stride,      \
-                                                           h,               \
-                                                           filter);         \
-    ff_ ## avg ## _8tap_1d_ ## dir ## _ ## hsz ## _ ## opt(dst + hsz,       \
-                                                           src + hsz,       \
-                                                           dst_stride,      \
-                                                           src_stride,      \
-                                                           h, filter);      \
+#define mc_rep_func(avg, sz, hsz, dir, opt) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                            const uint8_t *src, ptrdiff_t src_stride, \
+                                            int h, const int8_t (*filter)[32]) \
+{ \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst,       dst_stride, src, \
+                                                 src_stride, h, filter); \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
+                                                 src_stride, h, filter); \
 }
 
-#define mc_rep_funcs(sz, hsz)            \
-    mc_rep_func(put, sz, hsz, h, ssse3); \
-    mc_rep_func(avg, sz, hsz, h, ssse3); \
-    mc_rep_func(put, sz, hsz, v, ssse3); \
-    mc_rep_func(avg, sz, hsz, v, ssse3)
-
-mc_rep_funcs(16, 8);
-mc_rep_funcs(32, 16);
-mc_rep_funcs(64, 32);
+#define mc_rep_funcs(sz, hsz, opt) \
+mc_rep_func(put, sz, hsz, h, opt); \
+mc_rep_func(avg, sz, hsz, h, opt); \
+mc_rep_func(put, sz, hsz, v, opt); \
+mc_rep_func(avg, sz, hsz, v, opt)
+
+#if ARCH_X86_32
+mc_rep_funcs(16, 8, ssse3);
+#endif
+mc_rep_funcs(32, 16, ssse3);
+mc_rep_funcs(64, 32, ssse3);
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+mc_rep_funcs(64, 32, avx2);
+#endif
 
 #undef mc_rep_funcs
 #undef mc_rep_func
 
-extern const int8_t ff_filters_ssse3[3][15][4][16];
-
-#define filter_8tap_2d_fn(op, sz, f, fname)                             \
-static void                                                             \
-op ## _8tap_ ## fname ## _ ## sz ## hv_ssse3(uint8_t *dst,              \
-                                             const uint8_t *src,        \
-                                             ptrdiff_t dst_stride,      \
-                                             ptrdiff_t src_stride,      \
-                                             int h, int mx, int my)     \
-{                                                                       \
-    LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]);                         \
-    ff_put_8tap_1d_h_ ## sz ## _ssse3(temp, src - 3 * src_stride,       \
-                                      64, src_stride,                   \
-                                      h + 7,                            \
-                                      ff_filters_ssse3[f][mx - 1]);     \
-    ff_ ## op ## _8tap_1d_v_ ## sz ## _ssse3(dst, temp + 3 * 64,        \
-                                             dst_stride, 64,            \
-                                             h,                         \
-                                             ff_filters_ssse3[f][my - 1]); \
+extern const int8_t ff_filters_ssse3[3][15][4][32];
+
+#define filter_8tap_2d_fn(op, sz, f, fname, align, opt) \
+static void op##_8tap_##fname##_##sz##hv_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                               const uint8_t *src, ptrdiff_t src_stride, \
+                                               int h, int mx, int my) \
+{ \
+    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64]); \
+    ff_vp9_put_8tap_1d_h_##sz##_##opt(temp, 64, src - 3 * src_stride, src_stride, \
+                                      h + 7, ff_filters_ssse3[f][mx - 1]); \
+    ff_vp9_##op##_8tap_1d_v_##sz##_##opt(dst, dst_stride, temp + 3 * 64, 64, \
+                                         h, ff_filters_ssse3[f][my - 1]); \
 }
 
-#define filters_8tap_2d_fn(op, sz)                          \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp)     \
-    filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
-
-#define filters_8tap_2d_fn2(op) \
-    filters_8tap_2d_fn(op, 64)  \
-    filters_8tap_2d_fn(op, 32)  \
-    filters_8tap_2d_fn(op, 16)  \
-    filters_8tap_2d_fn(op, 8)   \
-    filters_8tap_2d_fn(op, 4)
-
-filters_8tap_2d_fn2(put)
-filters_8tap_2d_fn2(avg)
+#define filters_8tap_2d_fn(op, sz, align, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular, align, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   sharp, align, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  smooth, align, opt)
+
+#define filters_8tap_2d_fn2(op, align, opt) \
+filters_8tap_2d_fn(op, 64, align, opt) \
+filters_8tap_2d_fn(op, 32, align, opt) \
+filters_8tap_2d_fn(op, 16, align, opt) \
+filters_8tap_2d_fn(op, 8, align, opt) \
+filters_8tap_2d_fn(op, 4, align, opt)
+
+filters_8tap_2d_fn2(put, 16, ssse3)
+filters_8tap_2d_fn2(avg, 16, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_2d_fn(put, 64, 32, avx2)
+filters_8tap_2d_fn(put, 32, 32, avx2)
+filters_8tap_2d_fn(avg, 64, 32, avx2)
+filters_8tap_2d_fn(avg, 32, 32, avx2)
+#endif
 
 #undef filters_8tap_2d_fn2
 #undef filters_8tap_2d_fn
 #undef filter_8tap_2d_fn
 
-#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar)                  \
-static void                                                             \
-op ## _8tap_ ## fname ## _ ## sz ## dir ## _ssse3(uint8_t *dst,         \
-                                                  const uint8_t *src,   \
-                                                  ptrdiff_t dst_stride, \
-                                                  ptrdiff_t src_stride, \
-                                                  int h, int mx,        \
-                                                  int my)               \
-{                                                                       \
-    ff_ ## op ## _8tap_1d_ ## dir ## _ ## sz ## _ssse3(dst, src,        \
-                                                       dst_stride,      \
-                                                       src_stride, h,   \
-                                                       ff_filters_ssse3[f][dvar - 1]); \
+#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                  const uint8_t *src, ptrdiff_t src_stride, \
+                                                  int h, int mx, int my) \
+{ \
+    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##opt(dst, dst_stride, src, src_stride, \
+                                               h, ff_filters_ssse3[f][dvar - 1]); \
 }
 
-#define filters_8tap_1d_fn(op, sz, dir, dvar)                          \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar)     \
-    filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
-
-#define filters_8tap_1d_fn2(op, sz)             \
-    filters_8tap_1d_fn(op, sz, h, mx)           \
-    filters_8tap_1d_fn(op, sz, v, my)
-
-#define filters_8tap_1d_fn3(op) \
-    filters_8tap_1d_fn2(op, 64) \
-    filters_8tap_1d_fn2(op, 32) \
-    filters_8tap_1d_fn2(op, 16) \
-    filters_8tap_1d_fn2(op,  8) \
-    filters_8tap_1d_fn2(op,  4)
-
-filters_8tap_1d_fn3(put)
-filters_8tap_1d_fn3(avg)
+#define filters_8tap_1d_fn(op, sz, dir, dvar, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   sharp,   dir, dvar, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  smooth,  dir, dvar, opt)
+
+#define filters_8tap_1d_fn2(op, sz, opt) \
+filters_8tap_1d_fn(op, sz, h, mx, opt) \
+filters_8tap_1d_fn(op, sz, v, my, opt)
+
+#define filters_8tap_1d_fn3(op, opt) \
+filters_8tap_1d_fn2(op, 64, opt) \
+filters_8tap_1d_fn2(op, 32, opt) \
+filters_8tap_1d_fn2(op, 16, opt) \
+filters_8tap_1d_fn2(op, 8, opt) \
+filters_8tap_1d_fn2(op, 4, opt)
+
+filters_8tap_1d_fn3(put, ssse3)
+filters_8tap_1d_fn3(avg, ssse3)
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+filters_8tap_1d_fn2(put, 64, avx2)
+filters_8tap_1d_fn2(put, 32, avx2)
+filters_8tap_1d_fn2(avg, 64, avx2)
+filters_8tap_1d_fn2(avg, 32, avx2)
+#endif
 
 #undef filters_8tap_1d_fn
 #undef filters_8tap_1d_fn2
 #undef filters_8tap_1d_fn3
 #undef filter_8tap_1d_fn
 
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                            int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct,  idct,  size, opt); \
+itxfm_func(iadst, idct,  size, opt); \
+itxfm_func(idct,  iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_funcs(4, ssse3);
+itxfm_funcs(8, ssse3);
+itxfm_funcs(8, avx);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
+
+#undef itxfm_func
+#undef itxfm_funcs
+
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H)
+
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+lpf_funcs(44, 16, sse2);
+lpf_funcs(44, 16, ssse3);
+lpf_funcs(44, 16, avx);
+lpf_funcs(84, 16, sse2);
+lpf_funcs(84, 16, ssse3);
+lpf_funcs(84, 16, avx);
+lpf_funcs(48, 16, sse2);
+lpf_funcs(48, 16, ssse3);
+lpf_funcs(48, 16, avx);
+lpf_funcs(88, 16, sse2);
+lpf_funcs(88, 16, ssse3);
+lpf_funcs(88, 16, avx);
+
+#undef lpf_funcs
+
+#define ipred_func(size, type, opt) \
+void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                   const uint8_t *l, const uint8_t *a)
+
+#define ipred_funcs(type, opt) \
+ipred_func(4, type, opt); \
+ipred_func(8, type, opt); \
+ipred_func(16, type, opt); \
+ipred_func(32, type, opt)
+
+ipred_funcs(dc, ssse3);
+ipred_funcs(dc_left, ssse3);
+ipred_funcs(dc_top, ssse3);
+
+#undef ipred_funcs
+
+ipred_func(8, v, mmx);
+ipred_func(16, v, sse2);
+ipred_func(32, v, sse2);
+
+#define ipred_func_set(size, type, opt1, opt2) \
+ipred_func(size, type, opt1); \
+ipred_func(size, type, opt2)
+
+#define ipred_funcs(type, opt1, opt2) \
+ipred_func(4, type, opt1); \
+ipred_func_set(8, type, opt1, opt2); \
+ipred_func_set(16, type, opt1, opt2); \
+ipred_func_set(32, type, opt1, opt2)
+
+ipred_funcs(h, ssse3, avx);
+ipred_funcs(tm, ssse3, avx);
+ipred_funcs(dl, ssse3, avx);
+ipred_funcs(dr, ssse3, avx);
+ipred_funcs(hu, ssse3, avx);
+ipred_funcs(hd, ssse3, avx);
+ipred_funcs(vl, ssse3, avx);
+ipred_funcs(vr, ssse3, avx);
+
+ipred_func(32, dc, avx2);
+ipred_func(32, dc_left, avx2);
+ipred_func(32, dc_top, avx2);
+ipred_func(32, v, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
+#undef ipred_funcs
+#undef ipred_func_set
+#undef ipred_func
+
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
@@ -189,52 +279,166 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
 #if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
-#define init_fpel(idx1, idx2, sz, type, opt)                            \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] =                    \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_ ## type ## sz ## _ ## opt
-
+#define init_fpel(idx1, idx2, sz, type, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
 
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH][idx2][idxh][idxv]  = type ## _8tap_smooth_  ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type ## _8tap_regular_ ## sz ## dir ## _ ## opt; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP][idx2][idxh][idxv]   = type ## _8tap_sharp_   ## sz ## dir ## _ ## opt
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
 
-#define init_subpel2(idx, idxh, idxv, dir, type, opt)     \
+#define init_subpel2_32_64(idx, idxh, idxv, dir, type, opt) \
     init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
-    init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt)
+
+#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
+    init_subpel2_32_64(idx, idxh, idxv, dir, type, opt); \
     init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
     init_subpel1(3, idx, idxh, idxv,  8, dir, type, opt); \
     init_subpel1(4, idx, idxh, idxv,  4, dir, type, opt)
 
-#define init_subpel3(idx, type, opt)        \
+#define init_subpel3(idx, type, opt) \
     init_subpel2(idx, 1, 1, hv, type, opt); \
-    init_subpel2(idx, 0, 1,  v, type, opt); \
-    init_subpel2(idx, 1, 0,  h, type, opt)
+    init_subpel2(idx, 0, 1, v, type, opt); \
+    init_subpel2(idx, 1, 0, h, type, opt)
+
+#define init_lpf(opt) do { \
+    if (ARCH_X86_64) { \
+        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
+        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
+        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
+        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
+        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
+        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
+        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
+        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
+        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
+        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
+    } \
+} while (0)
+
+#define init_ipred(tx, sz, opt) do { \
+    dsp->intra_pred[tx][HOR_PRED]             = ff_vp9_ipred_h_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = ff_vp9_ipred_dl_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = ff_vp9_ipred_dr_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][HOR_DOWN_PRED]        = ff_vp9_ipred_hd_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][VERT_LEFT_PRED]       = ff_vp9_ipred_vl_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][HOR_UP_PRED]          = ff_vp9_ipred_hu_##sz##x##sz##_##opt; \
+    if (ARCH_X86_64 || tx != TX_32X32) { \
+        dsp->intra_pred[tx][VERT_RIGHT_PRED]      = ff_vp9_ipred_vr_##sz##x##sz##_##opt; \
+        dsp->intra_pred[tx][TM_VP8_PRED]          = ff_vp9_ipred_tm_##sz##x##sz##_##opt; \
+    } \
+} while (0)
+#define init_dc_ipred(tx, sz, opt) do { \
+    init_ipred(tx, sz, opt); \
+    dsp->intra_pred[tx][DC_PRED]              = ff_vp9_ipred_dc_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][LEFT_DC_PRED]         = ff_vp9_ipred_dc_left_##sz##x##sz##_##opt; \
+    dsp->intra_pred[tx][TOP_DC_PRED]          = ff_vp9_ipred_dc_top_##sz##x##sz##_##opt; \
+} while (0)
 
     if (EXTERNAL_MMX(cpu_flags)) {
         init_fpel(4, 0,  4, put, mmx);
         init_fpel(3, 0,  8, put, mmx);
+        dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+        dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+        dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+        dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
+        dsp->intra_pred[TX_8X8][VERT_PRED] = ff_vp9_ipred_v_8x8_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_fpel(4, 1,  4, avg, mmxext);
+        init_fpel(3, 1,  8, avg, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
         init_fpel(2, 0, 16, put, sse);
         init_fpel(1, 0, 32, put, sse);
         init_fpel(0, 0, 64, put, sse);
-        init_fpel(4, 1,  4, avg, sse);
-        init_fpel(3, 1,  8, avg, sse);
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         init_fpel(2, 1, 16, avg, sse2);
         init_fpel(1, 1, 32, avg, sse2);
         init_fpel(0, 1, 64, avg, sse2);
+        init_lpf(sse2);
+        dsp->intra_pred[TX_16X16][VERT_PRED] = ff_vp9_ipred_v_16x16_sse2;
+        dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_sse2;
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
         init_subpel3(0, put, ssse3);
         init_subpel3(1, avg, ssse3);
+        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_DCT]  = ff_vp9_idct_iadst_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][DCT_ADST]  = ff_vp9_iadst_idct_4x4_add_ssse3;
+        dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
+        if (ARCH_X86_64) {
+            dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
+            dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_ssse3;
+            dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_ssse3;
+            dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3;
+            dsp->itxfm_add[TX_16X16][DCT_DCT]   = ff_vp9_idct_idct_16x16_add_ssse3;
+            dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_ssse3;
+            dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_ssse3;
+            dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
+            dsp->itxfm_add[TX_32X32][ADST_ADST] =
+            dsp->itxfm_add[TX_32X32][ADST_DCT] =
+            dsp->itxfm_add[TX_32X32][DCT_ADST] =
+            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
+        }
+        init_lpf(ssse3);
+        init_dc_ipred(TX_4X4,    4, ssse3);
+        init_dc_ipred(TX_8X8,    8, ssse3);
+        init_dc_ipred(TX_16X16, 16, ssse3);
+        init_dc_ipred(TX_32X32, 32, ssse3);
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        if (ARCH_X86_64) {
+            dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
+            dsp->itxfm_add[TX_8X8][ADST_DCT]  = ff_vp9_idct_iadst_8x8_add_avx;
+            dsp->itxfm_add[TX_8X8][DCT_ADST]  = ff_vp9_iadst_idct_8x8_add_avx;
+            dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx;
+            dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+            dsp->itxfm_add[TX_16X16][ADST_DCT]  = ff_vp9_idct_iadst_16x16_add_avx;
+            dsp->itxfm_add[TX_16X16][DCT_ADST]  = ff_vp9_iadst_idct_16x16_add_avx;
+            dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
+            dsp->itxfm_add[TX_32X32][ADST_ADST] =
+            dsp->itxfm_add[TX_32X32][ADST_DCT] =
+            dsp->itxfm_add[TX_32X32][DCT_ADST] =
+            dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
+        }
+        init_fpel(1, 0, 32, put, avx);
+        init_fpel(0, 0, 64, put, avx);
+        init_lpf(avx);
+        init_ipred(TX_8X8,    8, avx);
+        init_ipred(TX_16X16, 16, avx);
+        init_ipred(TX_32X32, 32, avx);
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        init_fpel(1, 1, 32, avg, avx2);
+        init_fpel(0, 1, 64, avg, avx2);
+        if (ARCH_X86_64) {
+#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
+            init_subpel2_32_64(0, 1, 1, hv, put, avx2);
+            init_subpel2_32_64(0, 0, 1, v,  put, avx2);
+            init_subpel2_32_64(0, 1, 0, h,  put, avx2);
+            init_subpel2_32_64(1, 1, 1, hv, avg, avx2);
+            init_subpel2_32_64(1, 0, 1, v,  avg, avx2);
+            init_subpel2_32_64(1, 1, 0, h,  avg, avx2);
+#endif
+        }
+        dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2;
+        dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
+        dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2;
+        dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2;
+        dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2;
+        dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2;
     }
 
 #undef init_fpel
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
new file mode 100644
index 0000000000..c710793af9
--- /dev/null
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -0,0 +1,1554 @@
+;******************************************************************************
+;* VP9 Intra prediction SIMD optimizations
+;*
+;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* Parts based on:
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Fiona Glaser
+;* Copyright (c) 2010 Holger Lubitz
+;* Copyright (c) 2010 Loren Merritt
+;* Copyright (c) 2010 Ronald S. Bultje
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
+pw_4096: times 8 dw 4096
+
+pb_4x3_4x2_4x1_4x0: times 4 db 3
+                    times 4 db 2
+                    times 4 db 1
+                    times 4 db 0
+pb_8x1_8x0:   times 8 db 1
+              times 8 db 0
+pb_8x3_8x2:   times 8 db 3
+              times 8 db 2
+pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
+              times 8 db -1
+pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6
+              times 9 db 7
+pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
+              times 10 db 7
+pb_2to6_3x7:
+pb_2to6_11x7: db 2, 3, 4, 5, 6
+              times 11 db 7
+pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+pb_13456_3xm1: db 1, 3, 4, 5, 6
+               times 3 db -1
+pb_6012_4xm1: db 6, 0, 1, 2
+              times 4 db -1
+pb_6xm1_246_8toE: times 6 db -1
+                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
+pb_6xm1_BDF_0to6: times 6 db -1
+                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
+pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+pb_7to1_9x0:  db 7, 6, 5, 4
+pb_3to1_5x0:  db 3, 2, 1
+              times 9 db 0
+pb_Fto0:      db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+pb_2:  times 32 db 2
+pb_15: times 16 db 15
+
+cextern pb_1
+cextern pb_3
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_8192
+
+SECTION .text
+
+; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+    pmulhrsw                m0, [pw_4096]
+    pshufb                  m0, m1
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [lq]
+    movq                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    pmulhrsw                m0, [pw_2048]
+    pshufb                  m0, m2
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1
+    pmulhrsw                m0, [pw_1024]
+    pshufb                  m0, m2
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    mova                    m2, [aq]
+    mova                    m3, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m4, m4
+    psadbw                  m0, m4
+    psadbw                  m1, m4
+    psadbw                  m2, m4
+    psadbw                  m3, m4
+    paddw                   m0, m1
+    paddw                   m2, m3
+    paddw                   m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1
+    pmulhrsw                m0, [pw_512]
+    pshufb                  m0, m4
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, [pw_512]
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+
+; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
+
+%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [%2q]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+    pmulhrsw                m0, [pw_8192]
+    pshufb                  m0, m1
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    lea                   dstq, [dstq+strideq*2]
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*1], m0
+    RET
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pxor                    m1, m1
+    psadbw                  m0, m1
+    pmulhrsw                m0, [pw_4096]
+    pshufb                  m0, m1
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    movhlps                 m1, m0
+    paddw                   m0, m1
+    pmulhrsw                m0, [pw_2048]
+    pshufb                  m0, m2
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM ssse3
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    mova                    m1, [%2q+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    psadbw                  m1, m2
+    paddw                   m0, m1
+    movhlps                 m1, m0
+    paddw                   m0, m1
+    pmulhrsw                m0, [pw_1024]
+    pshufb                  m0, m2
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m0
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m0
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1
+    pmulhrsw               xm0, [pw_1024]
+    vpbroadcastb            m0, xm0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+%endmacro
+
+DC_1D_FUNCS top,  a
+DC_1D_FUNCS left, l
+
+; v
+
+INIT_MMX mmx
+cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+strideq*2], m0
+    movq      [dstq+stride3q ], m0
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM sse2
+cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m1, [aq+16]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 8
+.loop:
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m1
+    mova   [dstq+strideq*1+ 0], m0
+    mova   [dstq+strideq*1+16], m1
+    mova   [dstq+strideq*2+ 0], m0
+    mova   [dstq+strideq*2+16], m1
+    mova   [dstq+stride3q + 0], m0
+    mova   [dstq+stride3q +16], m1
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+
+; h
+
+INIT_XMM ssse3
+cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
+    movd                    m0, [lq]
+    pshufb                  m0, [pb_4x3_4x2_4x1_4x0]
+    lea               stride3q, [strideq*3]
+    movd      [dstq+strideq*0], m0
+    psrldq                  m0, 4
+    movd      [dstq+strideq*1], m0
+    psrldq                  m0, 4
+    movd      [dstq+strideq*2], m0
+    psrldq                  m0, 4
+    movd      [dstq+stride3q ], m0
+    RET
+
+%macro H_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
+    mova                    m2, [pb_8x1_8x0]
+    mova                    m3, [pb_8x3_8x2]
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 1
+.loop:
+    movd                    m0, [lq+cntq*4]
+    pshufb                  m1, m0, m3
+    pshufb                  m0, m2
+    movq      [dstq+strideq*0], m1
+    movhps    [dstq+strideq*1], m1
+    movq      [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 3
+.loop:
+    movd                    m3, [lq+cntq*4]
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7
+.loop:
+    movd                    m3, [lq+cntq*4]
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+    mova   [dstq+strideq*0+ 0], m0
+    mova   [dstq+strideq*0+16], m0
+    mova   [dstq+strideq*1+ 0], m1
+    mova   [dstq+strideq*1+16], m1
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+    mova   [dstq+strideq*2+ 0], m2
+    mova   [dstq+strideq*2+16], m2
+    mova   [dstq+stride3q + 0], m3
+    mova   [dstq+stride3q +16], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+%endmacro
+
+H_XMM_FUNCS ssse3
+H_XMM_FUNCS avx
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                    m5, [pb_1]
+    mova                    m6, [pb_2]
+    mova                    m7, [pb_3]
+    pxor                    m4, m4
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7
+.loop:
+    movd                   xm3, [lq+cntq*4]
+    vinserti128             m3, m3, xm3, 1
+    pshufb                  m0, m3, m7
+    pshufb                  m1, m3, m6
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufb                  m2, m3, m5
+    pshufb                  m3, m4
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+
+; tm
+
+INIT_MMX ssse3
+cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
+    pxor                    m1, m1
+    pinsrw                  m2, [aq-1], 0
+    movd                    m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m3, [pw_m256]
+    mova                    m4, [pw_m255]
+    pshufb                  m2, m3
+    punpcklbw               m0, m1
+    psubw                   m0, m2
+    mov                   cntq, 1
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0
+    pshufb                  m1, m2, m4
+    pshufb                  m2, m3
+    paddw                   m1, m0
+    paddw                   m2, m0
+    packuswb                m1, m1
+    packuswb                m2, m2
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+%macro TM_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
+    pxor                    m1, m1
+    pinsrw                  m2, [aq-1], 0
+    movh                    m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m3, [pw_m256]
+    mova                    m4, [pw_m255]
+    pshufb                  m2, m3
+    punpcklbw               m0, m1
+    psubw                   m0, m2
+    mov                   cntq, 3
+.loop:
+    pinsrw                  m2, [lq+cntq*2], 0
+    pshufb                  m1, m2, m4
+    pshufb                  m2, m3
+    paddw                   m1, m0
+    paddw                   m2, m0
+    packuswb                m1, m2
+    movh      [dstq+strideq*0], m1
+    movhps    [dstq+strideq*1], m1
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3
+    pinsrw                  m2, [aq-1], 0
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m4, [pw_m256]
+    mova                    m5, [pw_m255]
+    pshufb                  m2, m4
+    punpckhbw               m1, m0, m3
+    punpcklbw               m0, m3
+    psubw                   m1, m2
+    psubw                   m0, m2
+    mov                   cntq, 7
+.loop:
+    pinsrw                  m7, [lq+cntq*2], 0
+    pshufb                  m3, m7, m5
+    pshufb                  m7, m4
+    paddw                   m2, m3, m0
+    paddw                   m3, m1
+    paddw                   m6, m7, m0
+    paddw                   m7, m1
+    packuswb                m2, m3
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+
+%if ARCH_X86_64
+INIT_XMM %1
+cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
+    pxor                    m5, m5
+    pinsrw                  m4, [aq-1], 0
+    mova                    m0, [aq]
+    mova                    m2, [aq+16]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m8, [pw_m256]
+    mova                    m9, [pw_m255]
+    pshufb                  m4, m8
+    punpckhbw               m1, m0,  m5
+    punpckhbw               m3, m2,  m5
+    punpcklbw               m0, m5
+    punpcklbw               m2, m5
+    psubw                   m1, m4
+    psubw                   m0, m4
+    psubw                   m3, m4
+    psubw                   m2, m4
+    mov                   cntq, 15
+.loop:
+    pinsrw                 m13, [lq+cntq*2], 0
+    pshufb                  m7, m13, m9
+    pshufb                 m13, m8
+    paddw                   m4, m7,  m0
+    paddw                   m5, m7,  m1
+    paddw                   m6, m7,  m2
+    paddw                   m7, m3
+    paddw                  m10, m13, m0
+    paddw                  m11, m13, m1
+    paddw                  m12, m13, m2
+    paddw                  m13, m3
+    packuswb                m4, m5
+    packuswb                m6, m7
+    packuswb               m10, m11
+    packuswb               m12, m13
+    mova   [dstq+strideq*0+ 0], m4
+    mova   [dstq+strideq*0+16], m6
+    mova   [dstq+strideq*1+ 0], m10
+    mova   [dstq+strideq*1+16], m12
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+%endmacro
+
+TM_XMM_FUNCS ssse3
+TM_XMM_FUNCS avx
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3
+    pinsrw                 xm2, [aq-1], 0
+    vinserti128             m2, m2, xm2, 1
+    mova                    m0, [aq]
+    DEFINE_ARGS dst, stride, l, cnt
+    mova                    m4, [pw_m256]
+    mova                    m5, [pw_m255]
+    pshufb                  m2, m4
+    punpckhbw               m1, m0, m3
+    punpcklbw               m0, m3
+    psubw                   m1, m2
+    psubw                   m0, m2
+    mov                   cntq, 15
+.loop:
+    pinsrw                 xm7, [lq+cntq*2], 0
+    vinserti128             m7, m7, xm7, 1
+    pshufb                  m3, m7, m5
+    pshufb                  m7, m4
+    paddw                   m2, m3, m0
+    paddw                   m3, m1
+    paddw                   m6, m7, m0
+    paddw                   m7, m1
+    packuswb                m2, m3
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2
+    mova      [dstq+strideq*1], m6
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntq
+    jge .loop
+    RET
+%endif
+
+; dl
+
+%macro LOWPASS 4 ; left [dst], center, right, tmp
+    pxor                   m%4, m%1, m%3
+    pand                   m%4, [pb_1]
+    pavgb                  m%1, m%3
+    psubusb                m%1, m%4
+    pavgb                  m%1, m%2
+%endmacro
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m1, [aq]
+    pshufb                  m0, m1, [pb_0to5_2x7]
+    pshufb                  m2, m1, [pb_2to6_3x7]
+    psrlq                   m1, 8
+    LOWPASS                  0, 1, 2, 3
+
+    pshufw                  m1, m0, q3321
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*2], m1
+    psrlq                   m0, 8
+    psrlq                   m1, 8
+    add                   dstq, strideq
+    movd      [dstq+strideq*0], m0
+    movd      [dstq+strideq*2], m1
+    RET
+
+%macro DL_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
+    movq                    m0, [aq]
+    lea               stride5q, [strideq*5]
+    pshufb                  m1, m0, [pb_1to6_10x7]
+    psrldq                  m2, m1, 1
+    shufps                  m0, m1, q3210
+    LOWPASS                  0, 1, 2, 3
+
+    pshufd                  m1, m0, q3321
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*4], m1
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+stride5q ], m1
+    lea                   dstq, [dstq+strideq*2]
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*0], m0
+    movq      [dstq+strideq*4], m1
+    psrldq                  m0, 1
+    psrldq                  m1, 1
+    movq      [dstq+strideq*1], m0
+    movq      [dstq+stride5q ], m1
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m5, [pb_1toE_2xF]
+    mova                    m0, [aq]
+    pshufb                  m1, m0, m5
+    pshufb                  m2, m1, m5
+    pshufb                  m4, m0, [pb_15]
+    LOWPASS                  0, 1, 2, 3
+    DEFINE_ARGS dst, stride, cnt, stride9
+    lea               stride9q, [strideq*3]
+    mov                   cntd, 4
+    lea               stride9q, [stride9q*3]
+
+.loop:
+    movhlps                 m4, m0
+    mova      [dstq+strideq*0], m0
+    pshufb                  m0, m5
+    mova      [dstq+strideq*8], m4
+    movhlps                 m4, m0
+    mova      [dstq+strideq*1], m0
+    pshufb                  m0, m5
+    mova      [dstq+stride9q ], m4
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
+    mova                    m5, [pb_1toE_2xF]
+    mova                    m0, [aq]
+    mova                    m1, [aq+16]
+    palignr                 m2, m1, m0, 1
+    palignr                 m3, m1, m0, 2
+    LOWPASS                  0, 2, 3, 4
+    pshufb                  m2, m1, m5
+    pshufb                  m3, m2, m5
+    pshufb                  m6, m1, [pb_15]
+    LOWPASS                  1, 2, 3, 4
+    mova                    m7, m6
+    lea                 dst16q, [dstq  +strideq*8]
+    mov                   cntd, 8
+    lea                 dst16q, [dst16q+strideq*8]
+.loop:
+    movhlps                 m7, m1
+    mova [dstq  +strideq*0+ 0], m0
+    mova [dstq  +strideq*0+16], m1
+    movhps [dstq+strideq*8+ 0], m0
+    movq [dstq  +strideq*8+ 8], m1
+    mova [dstq  +strideq*8+16], m7
+    mova [dst16q+strideq*0+ 0], m1
+    mova [dst16q+strideq*0+16], m6
+    mova [dst16q+strideq*8+ 0], m7
+    mova [dst16q+strideq*8+16], m6
+%if cpuflag(avx)
+    vpalignr                m0, m1, m0, 1
+    pshufb                  m1, m5
+%else
+    palignr                 m2, m1, m0, 1
+    pshufb                  m1, m5
+    mova                    m0, m2
+%endif
+    add                   dstq, strideq
+    add                 dst16q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+DL_XMM_FUNCS ssse3
+DL_XMM_FUNCS avx
+
+; dr
+
+INIT_MMX ssse3
+cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq-1]
+    movd                    m1, [aq+3]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    palignr                 m1, m0, 1
+    psrlq                   m2, m1, 8
+    LOWPASS                  0, 1, 2, 3
+
+    movd      [dstq+stride3q ], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*2], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*1], m0
+    psrlq                   m0, 8
+    movd      [dstq+strideq*0], m0
+    RET
+
+%macro DR_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m1, [lq]
+    movhps                  m1, [aq-1]
+    movd                    m2, [aq+7]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pslldq                  m0, m1, 1
+    palignr                 m2, m1, 1
+    LOWPASS                  0, 1, 2, 3
+
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*4]
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m0
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m1, [lq]
+    movu                    m2, [aq-1]
+    movd                    m4, [aq+15]
+    DEFINE_ARGS dst, stride, stride9, cnt
+    lea               stride9q, [strideq *3]
+    mov                   cntd, 4
+    lea               stride9q, [stride9q*3]
+    palignr                 m4, m2, 1
+    palignr                 m3, m2, m1, 15
+    LOWPASS                  3,  2, 4, 5
+    pslldq                  m0, m1, 1
+    palignr                 m2, m1, 1
+    LOWPASS                  0,  1, 2, 4
+
+.loop:
+    mova    [dstq+strideq*0  ], m3
+    movhps  [dstq+strideq*8+0], m0
+    movq    [dstq+strideq*8+8], m3
+    palignr                 m3, m0, 15
+    pslldq                  m0, 1
+    mova    [dstq+strideq*1  ], m3
+    movhps  [dstq+stride9q +0], m0
+    movq    [dstq+stride9q +8], m3
+    palignr                 m3, m0, 15
+    pslldq                  m0, 1
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
+    mova                    m1, [lq]
+    mova                    m2, [lq+16]
+    movu                    m3, [aq-1]
+    movu                    m4, [aq+15]
+    movd                    m5, [aq+31]
+    DEFINE_ARGS dst, stride, stride8, cnt
+    lea               stride8q, [strideq*8]
+    palignr                 m5, m4, 1
+    palignr                 m6, m4, m3, 15
+    LOWPASS                  5,  4,  6,  7
+    palignr                 m4, m3, 1
+    palignr                 m6, m3, m2, 15
+    LOWPASS                  4,  3,  6,  7
+    palignr                 m3, m2, 1
+    palignr                 m6, m2, m1, 15
+    LOWPASS                  3,  2,  6,  7
+    palignr                 m2, m1, 1
+    pslldq                  m0, m1, 1
+    LOWPASS                  2,  1,  0,  6
+    mov                   cntd, 16
+
+    ; out=m2/m3/m4/m5
+.loop:
+    mova  [dstq+stride8q*0+ 0], m4
+    mova  [dstq+stride8q*0+16], m5
+    mova  [dstq+stride8q*2+ 0], m3
+    mova  [dstq+stride8q*2+16], m4
+    palignr                 m5, m4, 15
+    palignr                 m4, m3, 15
+    palignr                 m3, m2, 15
+    pslldq                  m2, 1
+    add                   dstq, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+DR_XMM_FUNCS ssse3
+DR_XMM_FUNCS avx
+
+; vl
+
+INIT_MMX ssse3
+cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m0, [aq]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    lea                   dstq, [dstq+strideq*2]
+    psrlq                   m1, 8
+    psrlq                   m2, 8
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    RET
+
+%macro VL_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m0, [aq]
+    pshufb                  m0, [pb_0to6_9x7]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m0, 2
+    LOWPASS                  2,  1,  0,  3
+    pavgb                   m1, m0
+
+    movq      [dstq+strideq*0], m1
+    movq      [dstq+strideq*1], m2
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1
+    movq      [dstq+stride3q ], m2
+    lea                   dstq, [dstq+strideq*4]
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*0], m1
+    movq      [dstq+strideq*1], m2
+    psrldq                  m1, 1
+    psrldq                  m2, 1
+    movq      [dstq+strideq*2], m1
+    movq      [dstq+stride3q ], m2
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m4, [pb_1toE_2xF]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    pshufb                  m1, m0, m4
+    pshufb                  m2, m1, m4
+    LOWPASS                  2,  1,  0, 3
+    pavgb                   m1, m0
+    mov                   cntd, 4
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*1], m2
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+    mova      [dstq+strideq*2], m1
+    mova      [dstq+stride3q ], m2
+    pshufb                  m1, m4
+    pshufb                  m2, m4
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m5, [aq+16]
+    mova                    m4, [pb_1toE_2xF]
+    DEFINE_ARGS dst, stride, dst16, cnt
+    palignr                 m2, m5, m0, 1
+    palignr                 m3, m5, m0, 2
+    lea                 dst16q, [dstq  +strideq*8]
+    LOWPASS                  3,  2,  0, 6
+    pavgb                   m2, m0
+    pshufb                  m0, m5, m4
+    pshufb                  m1, m0, m4
+    lea                 dst16q, [dst16q+strideq*8]
+    LOWPASS                  1,  0,  5, 6
+    pavgb                   m0, m5
+    pshufb                  m5, [pb_15]
+    mov                   cntd, 8
+
+.loop:
+%macro %%write 3
+    mova    [dstq+stride%1+ 0], %2
+    mova    [dstq+stride%1+16], %3
+    movhps  [dst16q+stride%1 ], %2
+    movu  [dst16q+stride%1+ 8], %3
+    movq  [dst16q+stride%1+24], m5
+%if cpuflag(avx)
+    palignr                 %2, %3, %2, 1
+    pshufb                  %3, m4
+%else
+    palignr                 m6, %3, %2, 1
+    pshufb                  %3, m4
+    mova                    %2, m6
+%endif
+%endmacro
+
+    %%write                q*0, m2, m0
+    %%write                q*1, m3, m1
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+VL_XMM_FUNCS ssse3
+VL_XMM_FUNCS avx
+
+; vr
+
+INIT_MMX ssse3
+cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
+    movq                    m1, [aq-1]
+    punpckldq               m2, [lq]
+    movd                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1
+    palignr                 m1, m2, 5
+    psrlq                   m2, m1, 8
+    psllq                   m3, m1, 8
+    LOWPASS                  2,  1, 3, 4
+
+    ; ABCD <- for the following predictor:
+    ; EFGH
+    ; IABC  | m0 contains ABCDxxxx
+    ; JEFG  | m2 contains xJIEFGHx
+
+    punpckldq               m0, m2
+    pshufb                  m2, [pb_13456_3xm1]
+    movd      [dstq+strideq*0], m0
+    pshufb                  m0, [pb_6012_4xm1]
+    movd      [dstq+stride3q ], m2
+    psrlq                   m2, 8
+    movd      [dstq+strideq*2], m0
+    movd      [dstq+strideq*1], m2
+    RET
+
+%macro VR_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
+    movu                    m1, [aq-1]
+    movhps                  m2, [lq]
+    movq                    m0, [aq]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    pavgb                   m0, m1
+    palignr                 m1, m2, 9
+    pslldq                  m2, m1, 1
+    pslldq                  m3, m1, 2
+    LOWPASS                  1,  2, 3, 4
+
+    ; ABCDEFGH <- for the following predictor:
+    ; IJKLMNOP
+    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
+    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
+    ; SQABCDEF
+    ; TRIJKLMN
+    ; USQABCDE
+    ; VTRIJKLM
+
+    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
+    movq      [dstq+strideq*0], m0
+    pshufb                  m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
+    movhps    [dstq+strideq*1], m1
+    pshufb                  m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
+    movhps    [dstq+strideq*2], m0
+    pslldq                  m0, 1
+    movhps    [dstq+stride3q ], m1
+    lea                   dstq, [dstq+strideq*4]
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*0], m0
+    pslldq                  m0, 1
+    movhps    [dstq+strideq*1], m1
+    pslldq                  m1, 1
+    movhps    [dstq+strideq*2], m0
+    movhps    [dstq+stride3q ], m1
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
+    mova                    m0, [aq]
+    movu                    m1, [aq-1]
+    mova                    m2, [lq]
+    DEFINE_ARGS dst, stride, stride3, cnt
+    lea               stride3q, [strideq*3]
+    palignr                 m3, m1, m2, 15
+    LOWPASS                  3,  1,  0,  4
+    pavgb                   m0, m1
+    palignr                 m1, m2,  1
+    pslldq                  m4, m2,  1
+    LOWPASS                  1,  2,  4,  5
+    pshufb                  m1, [pb_02468ACE_13579BDF]
+    mov                   cntd, 4
+
+.loop:
+    movlhps                 m2, m1
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m3
+    palignr                 m4, m0, m1, 15
+    palignr                 m5, m3, m2, 15
+    mova      [dstq+strideq*2], m4
+    mova      [dstq+stride3q ], m5
+    lea                   dstq, [dstq+strideq*4]
+    palignr                 m0, m1, 14
+    palignr                 m3, m2, 14
+    pslldq                  m1, 2
+    dec                   cntd
+    jg .loop
+    RET
+
+%if ARCH_X86_64
+INIT_XMM %1
+cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
+    mova                    m0, [aq]
+    mova                    m2, [aq+16]
+    movu                    m1, [aq-1]
+    palignr                 m3, m2, m0, 15
+    palignr                 m4, m2, m0, 14
+    LOWPASS                  4,  3,  2,  5
+    pavgb                   m3, m2
+    mova                    m2, [lq+16]
+    palignr                 m5, m1, m2, 15
+    LOWPASS                  5,  1,  0,  6
+    pavgb                   m0, m1
+    mova                    m6, [lq]
+    palignr                 m1, m2,  1
+    palignr                 m7, m2, m6, 15
+    LOWPASS                  1,  2,  7,  8
+    palignr                 m2, m6,  1
+    pslldq                  m7, m6,  1
+    LOWPASS                  2,  6,  7,  8
+    pshufb                  m1, [pb_02468ACE_13579BDF]
+    pshufb                  m2, [pb_02468ACE_13579BDF]
+    DEFINE_ARGS dst, stride, dst16, cnt
+    lea                 dst16q, [dstq  +strideq*8]
+    lea                 dst16q, [dst16q+strideq*8]
+    SBUTTERFLY             qdq,  2,  1,  6
+    mov                   cntd, 8
+
+.loop:
+    ; even lines (0, 2, 4, ...): m1 | m0, m3
+    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
+%macro %%write 4
+    mova    [dstq+stride%1+ 0], %3
+    mova    [dstq+stride%1+16], %4
+    movhps  [dst16q+stride%1 ], %2
+    movu  [dst16q+stride%1+ 8], %3
+    movq  [dst16q+stride%1+24], %4
+    palignr                 %4, %3, 15
+    palignr                 %3, %2, 15
+    pslldq                  %2,  1
+%endmacro
+
+    %%write                q*0, m1, m0, m3
+    %%write                q*1, m2, m5, m4
+    lea                   dstq, [dstq  +strideq*2]
+    lea                 dst16q, [dst16q+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+%endif
+%endmacro
+
+VR_XMM_FUNCS ssse3
+VR_XMM_FUNCS avx
+
+; hd
+
+INIT_MMX ssse3
+cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
+    movd                    m0, [lq]
+    punpckldq               m0, [aq-1]
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0,  3
+    pavgb                   m1, m0
+
+    ; DHIJ <- for the following predictor:
+    ; CGDH
+    ; BFCG  | m1 contains ABCDxxxx
+    ; AEBF  | m2 contains EFGHIJxx
+
+    punpcklbw               m1, m2
+    punpckhdq               m0, m1, m2
+
+    ; m1 contains AEBFCGDH
+    ; m0 contains CGDHIJxx
+
+    movd      [dstq+stride3q ], m1
+    movd      [dstq+strideq*1], m0
+    psrlq                   m1, 16
+    psrlq                   m0, 16
+    movd      [dstq+strideq*2], m1
+    movd      [dstq+strideq*0], m0
+    RET
+
+%macro HD_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
+    movq                    m0, [lq]
+    movhps                  m0, [aq-1]
+    DEFINE_ARGS dst, stride, stride3, dst4
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m1, 1
+    LOWPASS                  2,  1,  0,  3
+    pavgb                   m1, m0
+
+    ; HPQRSTUV <- for the following predictor
+    ; GOHPQRST
+    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
+    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
+    ; DLEMFNGO
+    ; CKDLEMFN
+    ; BJCKDLEM
+    ; AIBJCKDL
+
+    punpcklbw               m1, m2
+    movhlps                 m2, m2
+
+    ; m1 contains AIBJCKDLEMFNGOHP
+    ; m2 contains QRSTUVxxxxxxxxxx
+
+    movhps   [dstq +stride3q ], m1
+    movq     [dst4q+stride3q ], m1
+    palignr                 m3, m2, m1, 2
+    movhps   [dstq +strideq*2], m3
+    movq     [dst4q+strideq*2], m3
+    palignr                 m3, m2, m1, 4
+    movhps   [dstq +strideq*1], m3
+    movq     [dst4q+strideq*1], m3
+    palignr                 m2, m1, 6
+    movhps   [dstq +strideq*0], m2
+    movq     [dst4q+strideq*0], m2
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
+    mova                    m0, [lq]
+    movu                    m3, [aq-1]
+    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
+    lea               stride4q, [strideq*4]
+    lea                  dst4q, [dstq +stride4q]
+    lea                  dst8q, [dst4q+stride4q]
+    lea                 dst12q, [dst8q+stride4q]
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6
+    palignr                 m1, m3, m0,  1
+    palignr                 m2, m3, m0,  2
+    LOWPASS                  2,  1,  0,  6
+    pavgb                   m1, m0
+    SBUTTERFLY              bw,  1,  2,  6
+
+    ; I PROBABLY INVERTED L0 ad L16 here
+    ; m1, m2, m5
+.loop:
+    sub               stride4q, strideq
+    movhps [dstq +stride4q +0], m2
+    movq   [dstq +stride4q +8], m5
+    mova   [dst4q+stride4q   ], m2
+    movhps [dst8q+stride4q +0], m1
+    movq   [dst8q+stride4q +8], m2
+    mova  [dst12q+stride4q   ], m1
+%if cpuflag(avx)
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m5, m2, 2
+%else
+    palignr                 m3, m2, m1, 2
+    palignr                 m0, m5, m2, 2
+    mova                    m1, m3
+    mova                    m2, m0
+%endif
+    psrldq                  m5, 2
+    jg .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    movu                    m2, [aq-1]
+    movu                    m3, [aq+15]
+    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
+    lea               stride8q, [strideq*8]
+    lea                  dst8q, [dstq  +stride8q]
+    lea                 dst16q, [dst8q +stride8q]
+    lea                 dst24q, [dst16q+stride8q]
+    psrldq                  m4, m3,  1
+    psrldq                  m5, m3,  2
+    LOWPASS                  5,  4,  3,  6
+    palignr                 m4, m3, m2,  2
+    palignr                 m3, m2,  1
+    LOWPASS                  4,  3,  2,  6
+    palignr                 m3, m2, m1,  2
+    palignr                 m2, m1,  1
+    LOWPASS                  3,  2,  1,  6
+    pavgb                   m2, m1
+    palignr                 m6, m1, m0,  1
+    palignr                 m1, m0,  2
+    LOWPASS                  1,  6,  0,  7
+    pavgb                   m0, m6
+    SBUTTERFLY              bw,  2,  3,  6
+    SBUTTERFLY              bw,  0,  1,  6
+
+    ; m0, m1, m2, m3, m4, m5
+.loop:
+    sub               stride8q, strideq
+    mova  [dstq  +stride8q+ 0], m3
+    mova  [dstq  +stride8q+16], m4
+    mova  [dst8q +stride8q+ 0], m2
+    mova  [dst8q +stride8q+16], m3
+    mova  [dst16q+stride8q+ 0], m1
+    mova  [dst16q+stride8q+16], m2
+    mova  [dst24q+stride8q+ 0], m0
+    mova  [dst24q+stride8q+16], m1
+%if cpuflag(avx)
+    palignr                 m0, m1, m0, 2
+    palignr                 m1, m2, m1, 2
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m4, m3, 2
+    palignr                 m4, m5, m4, 2
+    psrldq                  m5, 2
+%else
+    psrldq                  m6, m5, 2
+    palignr                 m5, m4, 2
+    palignr                 m4, m3, 2
+    palignr                 m3, m2, 2
+    palignr                 m2, m1, 2
+    palignr                 m1, m0, 2
+    mova                    m0, m1
+    mova                    m1, m2
+    mova                    m2, m3
+    mova                    m3, m4
+    mova                    m4, m5
+    mova                    m5, m6
+%endif
+    jg .loop
+    RET
+%endmacro
+
+HD_XMM_FUNCS ssse3
+HD_XMM_FUNCS avx
+
+INIT_MMX ssse3
+cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
+    movd                    m0, [lq]
+    pshufb                  m0, [pb_3to1_5x0]
+    psrlq                   m1, m0, 8
+    psrlq                   m2, m1, 8
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride3
+    lea               stride3q, [strideq*3]
+    SBUTTERFLY              bw,  1, 2, 0
+    palignr                 m2, m1, 2
+    movd      [dstq+strideq*0], m1
+    movd      [dstq+strideq*1], m2
+    punpckhdq               m1, m1
+    punpckhdq               m2, m2
+    movd      [dstq+strideq*2], m1
+    movd      [dstq+stride3q ], m2
+    RET
+
+%macro HU_XMM_FUNCS 1
+INIT_XMM %1
+cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
+    movq                    m0, [lq]
+    pshufb                  m0, [pb_7to1_9x0]
+    psrldq                  m1, m0, 1
+    psrldq                  m2, m1, 1
+    LOWPASS                  2,  1, 0, 3
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride3, dst4
+    lea               stride3q, [strideq*3]
+    lea                  dst4q, [dstq+strideq*4]
+    SBUTTERFLY              bw,  1, 2, 0
+    movq     [dstq +strideq*0], m1
+    movhps   [dst4q+strideq*0], m1
+    palignr                 m0, m2, m1, 2
+    movq     [dstq +strideq*1], m0
+    movhps   [dst4q+strideq*1], m0
+    palignr                 m0, m2, m1, 4
+    movq     [dstq +strideq*2], m0
+    movhps   [dst4q+strideq*2], m0
+    palignr                 m2, m1, 6
+    movq     [dstq +stride3q ], m2
+    movhps   [dst4q+stride3q ], m2
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
+    mova                    m0, [lq]
+    pshufb                  m0, [pb_Fto0]
+    mova                    m3, [pb_2toE_3xF]
+    pshufb                  m1, m0, [pb_1toE_2xF]
+    pshufb                  m2, m0, m3
+    LOWPASS                  2,  1,  0,  4
+    pavgb                   m1, m0
+    DEFINE_ARGS dst, stride, stride9, cnt
+    lea                stride9q, [strideq *3]
+    mov                   cntd,  4
+    lea                stride9q, [stride9q*3]
+    SBUTTERFLY              bw,  1,  2,  0
+
+.loop:
+    mova      [dstq+strideq*0], m1
+    mova      [dstq+strideq*8], m2
+    palignr                 m0, m2, m1, 2
+    pshufb                  m2, m3
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+stride9q ], m2
+    palignr                 m1, m2, m0, 2
+    pshufb                  m2, m3
+    lea                   dstq, [dstq+strideq*2]
+    dec                   cntd
+    jg .loop
+    RET
+
+INIT_XMM %1
+cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
+    mova                    m0, [lq]
+    mova                    m1, [lq+16]
+    mova                    m2, [pb_Fto0]
+    mova                    m4, [pb_2toE_3xF]
+    pshufb                  m0, m2
+    pshufb                  m1, m2
+    palignr                 m2, m0, m1,  1
+    palignr                 m3, m0, m1,  2
+    LOWPASS                  3,  2,  1,  5
+    pavgb                   m2, m1
+    pshufb                  m1, m0, m4
+    pshufb                  m5, m0, [pb_1toE_2xF]
+    LOWPASS                  1,  5,  0,  6
+    pavgb                   m0, m5
+    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
+    mov                   cntd,  8
+    xor               stride0q, stride0q
+    lea                  dst8q, [dstq  +strideq*8]
+    lea                 dst16q, [dst8q +strideq*8]
+    lea                 dst24q, [dst16q+strideq*8]
+    SBUTTERFLY              bw,  0,  1,  5
+    SBUTTERFLY              bw,  2,  3,  5
+    pshufb                  m6, m1, [pb_15]
+
+.loop:
+    mova  [dstq  +stride0q+ 0], m2
+    mova  [dstq  +stride0q+16], m3
+    mova  [dst8q +stride0q+ 0], m3
+    mova  [dst8q +stride0q+16], m0
+    mova  [dst16q+stride0q+ 0], m0
+    mova  [dst16q+stride0q+16], m1
+    mova  [dst24q+stride0q+ 0], m1
+    mova  [dst24q+stride0q+16], m6
+%if cpuflag(avx)
+    palignr                 m2, m3, m2, 2
+    palignr                 m3, m0, m3, 2
+    palignr                 m0, m1, m0, 2
+    pshufb                  m1, m4
+%else
+    pshufb                  m5, m1, m4
+    palignr                 m1, m0, 2
+    palignr                 m0, m3, 2
+    palignr                 m3, m2, 2
+    mova                    m2, m3
+    mova                    m3, m0
+    mova                    m0, m1
+    mova                    m1, m5
+%endif
+    add               stride0q, strideq
+    dec                   cntd
+    jg .loop
+    RET
+%endmacro
+
+HU_XMM_FUNCS ssse3
+HU_XMM_FUNCS avx
+
+; FIXME 127, 128, 129 ?
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
new file mode 100644
index 0000000000..6cce0aa196
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -0,0 +1,1670 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2013 Clément Bœsch <u pkh me>
+;* Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_11585x2:  times 8 dw 23170
+pw_m11585x2: times 8 dw -23170
+
+%macro VP9_IDCT_COEFFS 2-3 0
+pw_%1x2:    times 8 dw  %1*2
+pw_m%1x2:   times 8 dw -%1*2
+pw_%2x2:    times 8 dw  %2*2
+pw_m%2x2:   times 8 dw -%2*2
+pw_m%1_%2:  times 4 dw -%1,  %2
+pw_%2_%1:   times 4 dw  %2,  %1
+pw_m%2_m%1: times 4 dw -%2, -%1
+%if %3 == 1
+pw_m%2_%1:  times 4 dw -%2,  %1
+pw_%1_%2:   times 4 dw  %1,  %2
+%endif
+%endmacro
+
+VP9_IDCT_COEFFS 15137,  6270, 1
+VP9_IDCT_COEFFS 16069,  3196, 1
+VP9_IDCT_COEFFS  9102, 13623, 1
+VP9_IDCT_COEFFS 16305,  1606
+VP9_IDCT_COEFFS 10394, 12665
+VP9_IDCT_COEFFS 14449,  7723
+VP9_IDCT_COEFFS  4756, 15679
+VP9_IDCT_COEFFS 16364,   804
+VP9_IDCT_COEFFS 11003, 12140
+VP9_IDCT_COEFFS 14811,  7005
+VP9_IDCT_COEFFS  5520, 15426
+VP9_IDCT_COEFFS 15893,  3981
+VP9_IDCT_COEFFS  8423, 14053
+VP9_IDCT_COEFFS 13160,  9760
+VP9_IDCT_COEFFS  2404, 16207
+
+pw_5283_13377: times 4 dw 5283, 13377
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2
+
+pd_8192: times 4 dd 8192
+
+cextern pw_512
+cextern pw_1024
+cextern pw_2048
+cextern pw_m1
+
+SECTION .text
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+    pmaddwd            m%1, m%2, %4
+    pmaddwd            m%2,  %5
+    paddd              m%1,  %3
+    paddd              m%2,  %3
+    psrad              m%1,  14
+    psrad              m%2,  14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+    VP9_MULSUB_2W_2X    %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3]
+    VP9_MULSUB_2W_2X    %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3]
+    packssdw           m%1, m%7
+    packssdw           m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+    punpckhwd          m%6, m%2, m%1
+    punpcklwd          m%2, m%1
+    VP9_MULSUB_2W_4X   %1, %2, %3, %4, %5, %6, %7
+%else
+    punpckhwd          m%8, m%4, m%3
+    punpcklwd          m%2, m%4, m%3
+    VP9_MULSUB_2W_4X   %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
+    punpckhwd          m%4, m%2, m%1
+    punpcklwd          m%2, m%1
+    pmaddwd            m%3, m%4, [pw_m%5_%6]
+    pmaddwd            m%4, [pw_%6_%5]
+    pmaddwd            m%1, m%2, [pw_m%5_%6]
+    pmaddwd            m%2, [pw_%6_%5]
+%endmacro
+
+%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
+    SUMSUB_BA            d, %1, %2, %5
+    SUMSUB_BA            d, %3, %4, %5
+    paddd              m%1, %6
+    paddd              m%2, %6
+    paddd              m%3, %6
+    paddd              m%4, %6
+    psrad              m%1, 14
+    psrad              m%2, 14
+    psrad              m%3, 14
+    psrad              m%4, 14
+    packssdw           m%1, m%3
+    packssdw           m%2, m%4
+%endmacro
+
+%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
+    movh               m%3, [%6]
+    movh               m%4, [%6+strideq]
+    punpcklbw          m%3, m%5
+    punpcklbw          m%4, m%5
+    paddw              m%3, m%1
+    paddw              m%4, m%2
+    packuswb           m%3, m%5
+    packuswb           m%4, m%5
+    movh              [%6], m%3
+    movh      [%6+strideq], m%4
+%endmacro
+
+%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
+%assign %%y 0
+%rep %3
+%assign %%x 0
+%rep %3*2/mmsize
+    mova      [%1+%%y+%%x], %4
+%assign %%x (%%x+mmsize)
+%endrep
+%assign %%y (%%y+%2)
+%endrep
+%endmacro
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IWHT4_1D 0
+    SWAP                 1, 2, 3
+    paddw               m0, m2
+    psubw               m3, m1
+    psubw               m4, m0, m3
+    psraw               m4, 1
+    psubw               m5, m4, m1
+    SWAP                 5, 1
+    psubw               m4, m2
+    SWAP                 4, 2
+    psubw               m0, m1
+    paddw               m3, m2
+    SWAP                 3, 2, 1
+%endmacro
+
+INIT_MMX mmx
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
+    mova                m0, [blockq+0*8]
+    mova                m1, [blockq+1*8]
+    mova                m2, [blockq+2*8]
+    mova                m3, [blockq+3*8]
+    psraw               m0, 2
+    psraw               m1, 2
+    psraw               m2, 2
+    psraw               m3, 2
+
+    VP9_IWHT4_1D
+    TRANSPOSE4x4W        0, 1, 2, 3, 4
+    VP9_IWHT4_1D
+
+    pxor                m4, m4
+    VP9_STORE_2X         0, 1, 5, 6, 4
+    lea               dstq, [dstq+strideq*2]
+    VP9_STORE_2X         2, 3, 5, 6, 4
+    ZERO_BLOCK      blockq, 8, 4, m4
+    RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+    SUMSUB_BA            w, 3, 2, 4                         ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w, 1, 0, 4                         ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0, 3, 2                            ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+    SUMSUB_BA            w, 2, 0, 4                         ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+    pmulhrsw            m2, m6                              ; m2=t0
+    pmulhrsw            m0, m6                              ; m0=t1
+    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5     ; m1=t2, m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+; 2x2 top left corner
+%macro VP9_IDCT4_2x2_1D 0
+    pmulhrsw            m0, m5                              ; m0=t1
+    mova                m2, m0                              ; m2=t0
+    mova                m3, m1
+    pmulhrsw            m1, m6                              ; m1=t2
+    pmulhrsw            m3, m7                              ; m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT4_WRITEOUT 0
+    mova                m5, [pw_2048]
+    pmulhrsw            m0, m5              ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    pmulhrsw            m1, m5
+    VP9_STORE_2X         0,  1,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+    VP9_STORE_2X         2,  3,  6,  7,  4
+%endmacro
+
+INIT_MMX ssse3
+cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
+
+    cmp eobd, 4 ; 2x2 or smaller
+    jg .idctfull
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct2x2
+
+    movd                m0, [blockq]
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5
+    pmulhrsw            m0, m5
+    pshufw              m0, m0, 0
+    pxor                m4, m4
+    movh          [blockq], m4
+    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+; faster path for when only top left 2x2 block is set
+.idct2x2:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+8]
+    mova                m5, [pw_11585x2]
+    mova                m6, [pw_6270x2]
+    mova                m7, [pw_15137x2]
+    VP9_IDCT4_2x2_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_2x2_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movh       [blockq+ 0], m4
+    movh       [blockq+ 8], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+
+.idctfull: ; generic full 4x4 idct/idct
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+    mova                m6, [pw_11585x2]
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_IDCT4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+
+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IADST4_1D 0
+    movq2dq           xmm0, m0
+    movq2dq           xmm1, m1
+    movq2dq           xmm2, m2
+    movq2dq           xmm3, m3
+    paddw               m3, m0
+    punpcklwd         xmm0, xmm1
+    punpcklwd         xmm2, xmm3
+    pmaddwd           xmm1, xmm0, [pw_5283_13377]
+    pmaddwd           xmm4, xmm0, [pw_9929_13377]
+    pmaddwd           xmm0, [pw_15212_m13377]
+    pmaddwd           xmm3, xmm2, [pw_15212_9929]
+    pmaddwd           xmm2, [pw_m5283_m15212]
+    psubw               m3, m2
+    paddd             xmm0, xmm2
+    paddd             xmm3, [pd_8192]
+    paddd             xmm2, [pd_8192]
+    paddd             xmm1, xmm3
+    paddd             xmm0, xmm3
+    paddd             xmm4, xmm2
+    psrad             xmm1, 14
+    psrad             xmm0, 14
+    psrad             xmm4, 14
+    pmulhrsw            m3, [pw_13377x2]        ; out2
+    packssdw          xmm0, xmm0
+    packssdw          xmm1, xmm1
+    packssdw          xmm4, xmm4
+    movdq2q             m0, xmm0                ; out3
+    movdq2q             m1, xmm1                ; out0
+    movdq2q             m2, xmm4                ; out1
+    SWAP                 0, 1, 2, 3
+%endmacro
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
+    mova                m0, [blockq+ 0]
+    mova                m1, [blockq+ 8]
+    mova                m2, [blockq+16]
+    mova                m3, [blockq+24]
+    mova                m6, [pw_11585x2]
+    mova                m7, [pd_8192]       ; rounding
+    VP9_%2_1D
+    TRANSPOSE4x4W  0, 1, 2, 3, 4
+    VP9_%4_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    mova       [blockq+ 0], m4
+    mova       [blockq+ 8], m4
+    mova       [blockq+16], m4
+    mova       [blockq+24], m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+
+%if ARCH_X86_64 ; TODO: 32-bit? (32-bit limited to 8 xmm reg, we use more)
+
+;-------------------------------------------------------------------------------------------
+; void vp9_idct_idct_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT8_1D_FINALIZE 0
+    SUMSUB_BA            w,  3, 10, 4                       ;  m3=t0+t7, m10=t0-t7
+    SUMSUB_BA            w,  1,  2, 4                       ;  m1=t1+t6,  m2=t1-t6
+    SUMSUB_BA            w, 11,  0, 4                       ; m11=t2+t5,  m0=t2-t5
+    SUMSUB_BA            w,  9,  8, 4                       ;  m9=t3+t4,  m8=t3-t4
+    SWAP                11, 10, 2
+    SWAP                 3,  9, 0
+%endmacro
+
+%macro VP9_IDCT8_1D 0
+    SUMSUB_BA            w, 8, 0, 4                         ; m8=IN(0)+IN(4) m0=IN(0)-IN(4)
+    pmulhrsw            m8, m12                             ; m8=t0a
+    pmulhrsw            m0, m12                             ; m0=t1a
+    VP9_UNPACK_MULSUB_2W_4X 2, 10, 15137,  6270, m7, 4, 5   ; m2=t2a, m10=t3a
+    VP9_UNPACK_MULSUB_2W_4X 1, 11, 16069,  3196, m7, 4, 5   ; m1=t4a, m11=t7a
+    VP9_UNPACK_MULSUB_2W_4X 9,  3,  9102, 13623, m7, 4, 5   ; m9=t5a,  m3=t6a
+    SUMSUB_BA            w, 10,  8, 4                       ; m10=t0a+t3a (t0),  m8=t0a-t3a (t3)
+    SUMSUB_BA            w,  2,  0, 4                       ;  m2=t1a+t2a (t1),  m0=t1a-t2a (t2)
+    SUMSUB_BA            w,  9,  1, 4                       ;  m9=t4a+t5a (t4),  m1=t4a-t5a (t5a)
+    SUMSUB_BA            w,  3, 11, 4                       ;  m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
+    SUMSUB_BA            w,  1, 11, 4                       ;  m1=t6a+t5a (t6), m11=t6a-t5a (t5)
+    pmulhrsw            m1, m12                             ; m1=t6
+    pmulhrsw           m11, m12                             ; m11=t5
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_4x4_1D 0
+    pmulhrsw            m0, m12                             ; m0=t1a/t0a
+    pmulhrsw           m10, m2, [pw_15137x2]                ; m10=t3a
+    pmulhrsw            m2, [pw_6270x2]                     ; m2=t2a
+    pmulhrsw           m11, m1, [pw_16069x2]                ; m11=t7a
+    pmulhrsw            m1, [pw_3196x2]                     ; m1=t4a
+    pmulhrsw            m9, m3, [pw_9102x2]                 ; m9=-t5a
+    pmulhrsw            m3, [pw_13623x2]                    ; m3=t6a
+    psubw               m8, m0, m10                         ; m8=t0a-t3a (t3)
+    paddw              m10, m0                              ; m10=t0a+t3a (t0)
+    SUMSUB_BA            w,  2,  0, 4                       ;  m2=t1a+t2a (t1),  m0=t1a-t2a (t2)
+    SUMSUB_BA            w,  9,  1, 4                       ;  m1=t4a+t5a (t4),  m9=t4a-t5a (t5a)
+    SWAP                 1,  9
+    SUMSUB_BA            w,  3, 11, 4                       ;  m3=t7a+t6a (t7), m11=t7a-t6a (t6a)
+    SUMSUB_BA            w,  1, 11, 4                       ;  m1=t6a+t5a (t6), m11=t6a-t5a (t5)
+    pmulhrsw            m1, m12                             ; m1=t6
+    pmulhrsw           m11, m12                             ; m11=t5
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+; TODO: a lot of t* copies can probably be removed and merged with
+; following SUMSUBs from VP9_IDCT8_1D_FINALIZE with AVX
+%macro VP9_IDCT8_2x2_1D 0
+    pmulhrsw            m0, m12                             ;  m0=t0
+    mova                m3, m1
+    pmulhrsw            m1, m6                              ;  m1=t4
+    pmulhrsw            m3, m7                              ;  m3=t7
+    mova                m2, m0                              ;  m2=t1
+    mova               m10, m0                              ; m10=t2
+    mova                m8, m0                              ;  m8=t3
+    mova               m11, m3                              ; t5 = t7a ...
+    mova                m9, m3                              ; t6 = t7a ...
+    psubw              m11, m1                              ; t5 = t7a - t4a
+    paddw               m9, m1                              ; t6 = t7a + t4a
+    pmulhrsw           m11, m12                             ; m11=t5
+    pmulhrsw            m9, m12                             ;  m9=t6
+    SWAP                 0, 10
+    SWAP                 9,  1
+    VP9_IDCT8_1D_FINALIZE
+%endmacro
+
+%macro VP9_IDCT8_WRITEOUT 0
+    mova                m5, [pw_1024]
+    pmulhrsw            m0, m5              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+    pmulhrsw            m1, m5
+    VP9_STORE_2X         0,  1,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+    VP9_STORE_2X         2,  3,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    pmulhrsw            m8, m5
+    pmulhrsw            m9, m5
+    VP9_STORE_2X         8,  9,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    pmulhrsw           m10, m5
+    pmulhrsw           m11, m5
+    VP9_STORE_2X        10, 11,  6,  7,  4
+%endmacro
+
+%macro VP9_IDCT_IDCT_8x8_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
+
+    mova               m12, [pw_11585x2]    ; often used
+
+    cmp eobd, 12 ; top left half or less
+    jg .idctfull
+
+    cmp eobd, 3  ; top left corner or less
+    jg .idcthalf
+
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idcttopleftcorner
+
+    movd                m0, [blockq]
+    pmulhrsw            m0, m12
+    pmulhrsw            m0, m12
+    SPLATW              m0, m0, 0
+    pxor                m4, m4
+    movd          [blockq], m4
+    mova                m5, [pw_1024]
+    pmulhrsw            m0, m5              ; (x*1024 + (1<<14))>>15 <=> (x+16)>>5
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4
+    RET
+
+; faster path for when only left corner is set (3 input: DC, right to DC, below
+; to DC). Note: also working with a 2x2 block
+.idcttopleftcorner:
+    movd                m0, [blockq+0]
+    movd                m1, [blockq+16]
+    mova                m6, [pw_3196x2]
+    mova                m7, [pw_16069x2]
+    VP9_IDCT8_2x2_1D
+    TRANSPOSE8x8W  0, 1, 2, 3, 8, 9, 10, 11, 4
+    VP9_IDCT8_2x2_1D
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    movd       [blockq+ 0], m4
+    movd       [blockq+16], m4
+    VP9_IDCT8_WRITEOUT
+    RET
+
+.idcthalf:
+    movh                m0, [blockq + 0]
+    movh                m1, [blockq +16]
+    movh                m2, [blockq +32]
+    movh                m3, [blockq +48]
+    VP9_IDCT8_4x4_1D
+    TRANSPOSE8x8W  0, 1, 2, 3, 8, 9, 10, 11, 4
+    VP9_IDCT8_4x4_1D
+    pxor                m4, m4
+    movh       [blockq+ 0], m4
+    movh       [blockq+16], m4
+    movh       [blockq+32], m4
+    movh       [blockq+48], m4
+    VP9_IDCT8_WRITEOUT
+    RET
+
+.idctfull: ; generic full 8x8 idct/idct
+    mova                m0, [blockq+  0]    ; IN(0)
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+    mova                m3, [blockq+ 48]    ; IN(3)
+    mova                m8, [blockq+ 64]    ; IN(4)
+    mova                m9, [blockq+ 80]    ; IN(5)
+    mova               m10, [blockq+ 96]    ; IN(6)
+    mova               m11, [blockq+112]    ; IN(7)
+    mova                m7, [pd_8192]       ; rounding
+    VP9_IDCT8_1D
+    TRANSPOSE8x8W  0, 1, 2, 3, 8, 9, 10, 11, 4
+    VP9_IDCT8_1D
+
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    ZERO_BLOCK      blockq, 16, 8, m4
+    VP9_IDCT8_WRITEOUT
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_8x8_ADD_XMM ssse3
+VP9_IDCT_IDCT_8x8_ADD_XMM avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_8x8_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST8_1D 0 ; input/output=m0/1/2/3/8/9/10/11
+    VP9_UNPACK_MULSUB_2D_4X 11,  0,  4,  5, 16305,  1606    ; m11/4=t1[d], m0/5=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  8,  6, 13, 10394, 12665    ; m3/6=t5[d], m8/13=t4[d]
+    VP9_RND_SH_SUMSUB_BA     8,  0, 13,  5, 14, m7          ; m8=t0[w], m0=t4[w]
+    VP9_RND_SH_SUMSUB_BA     3, 11,  6,  4, 14, m7          ; m3=t1[w], m11=t5[w]
+
+    VP9_UNPACK_MULSUB_2D_4X  9,  2,  4,  5, 14449,  7723    ; m9/4=t3[d], m2/5=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  1, 10,  6, 13,  4756, 15679    ; m1/6=t7[d], m10/13=t6[d]
+    VP9_RND_SH_SUMSUB_BA    10,  2, 13,  5, 14, m7          ; m10=t2[w], m2=t6[w]
+    VP9_RND_SH_SUMSUB_BA     1,  9,  6,  4, 14, m7          ; m1=t3[w], m9=t7[w]
+
+    ; m8=t0, m3=t1, m10=t2, m1=t3, m0=t4, m11=t5, m2=t6, m9=t7
+
+    VP9_UNPACK_MULSUB_2D_4X  0, 11,  4,  5, 15137,  6270    ; m0/4=t5[d], m11/5=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X  9,  2,  6, 13,  6270, 15137    ; m9/6=t6[d], m2/13=t7[d]
+    VP9_RND_SH_SUMSUB_BA     9, 11,  6,  5, 14, m7
+    psignw                  m9, [pw_m1]                     ; m9=out1[w], m11=t6[w]
+    VP9_RND_SH_SUMSUB_BA     2,  0, 13,  4, 14, m7          ; m2=out6[w], m0=t7[w]
+
+    SUMSUB_BA                w, 10,  8, 14                  ; m10=out0[w], m8=t2[w]
+    SUMSUB_BA                w,  1,  3, 14
+    psignw                  m1, [pw_m1]                     ; m1=out7[w], m3=t3[w]
+
+    ; m10=out0, m9=out1, m8=t2, m3=t3, m11=t6, m0=t7, m2=out6, m1=out7
+
+    SUMSUB_BA                w,  3,  8,  4
+    SUMSUB_BA                w,  0, 11,  5
+    pmulhrsw                m3, m12
+    pmulhrsw               m11, m12
+    pmulhrsw                m8, m12                         ; out4
+    pmulhrsw                m0, m12                         ; out2
+    psignw                  m3, [pw_m1]                     ; out3
+    psignw                 m11, [pw_m1]                     ; out5
+
+    ; m10=out0, m9=out1, m0=out2, m3=out3, m8=out4, m11=out5, m2=out6, m1=out7
+
+    SWAP                     0, 10, 2
+    SWAP                    11,  1, 9
+%endmacro
+
+%macro IADST8_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_8x8_add, 3, 3, 15, dst, stride, block, eob
+    mova                m0, [blockq+  0]    ; IN(0)
+    mova                m1, [blockq+ 16]    ; IN(1)
+    mova                m2, [blockq+ 32]    ; IN(2)
+    mova                m3, [blockq+ 48]    ; IN(3)
+    mova                m8, [blockq+ 64]    ; IN(4)
+    mova                m9, [blockq+ 80]    ; IN(5)
+    mova               m10, [blockq+ 96]    ; IN(6)
+    mova               m11, [blockq+112]    ; IN(7)
+
+    mova               m12, [pw_11585x2]    ; often used
+    mova                m7, [pd_8192]       ; rounding
+    VP9_%2_1D
+    TRANSPOSE8x8W  0, 1, 2, 3, 8, 9, 10, 11, 4
+    VP9_%4_1D
+
+    pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X
+    ZERO_BLOCK      blockq, 16, 8, m4
+    VP9_IDCT8_WRITEOUT
+    RET
+%endmacro
+
+IADST8_FN idct,  IDCT8,  iadst, IADST8, ssse3
+IADST8_FN idct,  IDCT8,  iadst, IADST8, avx
+IADST8_FN iadst, IADST8, idct,  IDCT8,  ssse3
+IADST8_FN iadst, IADST8, idct,  IDCT8,  avx
+IADST8_FN iadst, IADST8, iadst, IADST8, ssse3
+IADST8_FN iadst, IADST8, iadst, IADST8, avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+; at the end of this macro, m7 is stored in stack_scratch
+; everything else (t0-6 and t8-15) is stored in m0-6 and m8-15
+; the following sumsubs have not been done yet:
+;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
+%if %2 <= 4
+    mova                m3, [%1+ 1*%3]      ; IN(1)
+    mova               m12, [%1+ 2*%3]      ; IN(2)
+    mova                m0, [%1+ 3*%3]      ; IN(3)
+
+    pmulhrsw           m15, m12, [pw_16069x2]       ; t6-7
+    pmulhrsw           m12, [pw_3196x2]             ; t4-5
+    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
+    pmulhrsw            m3, [pw_1606x2]             ; t8-9
+    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
+    pmulhrsw            m0, [pw_15679x2]            ; t12-13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    paddw              m14, m15, m12
+    psubw              m13, m15, m12
+    pmulhrsw           m13, [pw_11585x2]            ; t5
+    pmulhrsw           m14, [pw_11585x2]            ; t6
+
+    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 10, 11 ; t9,  t14
+    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
+    mova                m5, [%1+ 1*%3]     ; IN(1)
+    mova               m14, [%1+ 2*%3]     ; IN(2)
+    mova                m6, [%1+ 3*%3]     ; IN(3)
+    mova                m9, [%1+ 4*%3]     ; IN(4)
+    mova                m7, [%1+ 5*%3]     ; IN(5)
+    mova               m15, [%1+ 6*%3]     ; IN(6)
+    mova                m4, [%1+ 7*%3]     ; IN(7)
+%if %2 <= 8
+    pmulhrsw            m8, m9,  [pw_15137x2]       ; t3
+    pmulhrsw            m9, [pw_6270x2]             ; t2
+    pmulhrsw           m13, m14, [pw_16069x2]       ; t7
+    pmulhrsw           m14, [pw_3196x2]             ; t4
+    pmulhrsw           m12, m15, [pw_m9102x2]       ; t5
+    pmulhrsw           m15, [pw_13623x2]            ; t6
+    pmulhrsw            m2, m5,  [pw_16305x2]       ; t15
+    pmulhrsw            m5, [pw_1606x2]             ; t8
+    pmulhrsw            m3, m4,  [pw_m10394x2]      ; t9
+    pmulhrsw            m4, [pw_12665x2]            ; t14
+    pmulhrsw            m0, m7,  [pw_14449x2]       ; t13
+    pmulhrsw            m7, [pw_7723x2]             ; t10
+    pmulhrsw            m1, m6,  [pw_m4756x2]       ; t11
+    pmulhrsw            m6, [pw_15679x2]            ; t12
+%else
+    mova                m3, [%1+ 9*%3]     ; IN(9)
+    mova               m12, [%1+10*%3]     ; IN(10)
+    mova                m0, [%1+11*%3]     ; IN(11)
+    mova                m8, [%1+12*%3]     ; IN(12)
+    mova                m1, [%1+13*%3]     ; IN(13)
+    mova               m13, [%1+14*%3]     ; IN(14)
+    mova                m2, [%1+15*%3]     ; IN(15)
+
+    ; m10=in0, m5=in1, m14=in2, m6=in3, m9=in4, m7=in5, m15=in6, m4=in7
+    ; m11=in8, m3=in9, m12=in10 m0=in11, m8=in12, m1=in13, m13=in14, m2=in15
+
+    VP9_UNPACK_MULSUB_2W_4X   9,   8, 15137,  6270, [pd_8192], 10, 11 ; t2,  t3
+    VP9_UNPACK_MULSUB_2W_4X  14,  13, 16069,  3196, [pd_8192], 10, 11 ; t4,  t7
+    VP9_UNPACK_MULSUB_2W_4X  12,  15,  9102, 13623, [pd_8192], 10, 11 ; t5,  t6
+    VP9_UNPACK_MULSUB_2W_4X   5,   2, 16305,  1606, [pd_8192], 10, 11 ; t8,  t15
+    VP9_UNPACK_MULSUB_2W_4X   3,   4, 10394, 12665, [pd_8192], 10, 11 ; t9,  t14
+    VP9_UNPACK_MULSUB_2W_4X   7,   0, 14449,  7723, [pd_8192], 10, 11 ; t10, t13
+    VP9_UNPACK_MULSUB_2W_4X   1,   6,  4756, 15679, [pd_8192], 10, 11 ; t11, t12
+%endif
+
+    ; m11=t0, m10=t1, m9=t2, m8=t3, m14=t4, m12=t5, m15=t6, m13=t7
+    ; m5=t8, m3=t9, m7=t10, m1=t11, m6=t12, m0=t13, m4=t14, m2=t15
+
+    SUMSUB_BA            w, 12, 14, 10      ; t4,  t5
+    SUMSUB_BA            w, 15, 13, 10      ; t7,  t6
+    SUMSUB_BA            w,  3,  5, 10      ; t8,  t9
+    SUMSUB_BA            w,  7,  1, 10      ; t11, t10
+    SUMSUB_BA            w,  0,  6, 10      ; t12, t13
+    SUMSUB_BA            w,  4,  2, 10      ; t15, t14
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    SUMSUB_BA            w, 14, 13, 10
+    pmulhrsw           m13, [pw_11585x2]                              ; t5
+    pmulhrsw           m14, [pw_11585x2]                              ; t6
+    VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 10, 11 ; t9,  t14
+    VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
+%endif
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
+    ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
+
+    SUMSUB_BA            w,  7,  3, 10      ; t8,  t11
+    SUMSUB_BA            w,  6,  2, 10      ; t9,  t10
+    SUMSUB_BA            w,  0,  4, 10      ; t15, t12
+    SUMSUB_BA            w,  1,  5, 10      ; t14. t13
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+
+    SUMSUB_BA            w,  2,  5, 10
+    SUMSUB_BA            w,  3,  4, 10
+    pmulhrsw            m5, [pw_11585x2]    ; t10
+    pmulhrsw            m4, [pw_11585x2]    ; t11
+    pmulhrsw            m3, [pw_11585x2]    ; t12
+    pmulhrsw            m2, [pw_11585x2]    ; t13
+
+    ; backup first register
+    mova              [%4], m7
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
+
+    ; from load/start
+%if %2 <= 4
+    mova               m11, [%1+ 0*%3]      ; IN(0)
+    pmulhrsw           m11, [pw_11585x2]    ; t0-t3
+
+    psubw               m8, m11, m15
+    paddw              m15, m11
+    psubw               m9, m11, m14
+    paddw              m14, m11
+    psubw              m10, m11, m13
+    paddw              m13, m11
+%else
+    mova               m10, [%1+ 0*%3]      ; IN(0)
+%if %2 <= 8
+    pmulhrsw           m10, [pw_11585x2]    ; t0 and t1
+    psubw              m11, m10, m8
+    paddw               m8, m10
+%else
+    mova               m11, [%1+ 8*%3]      ; IN(8)
+
+    ; from 3 stages back
+    SUMSUB_BA            w, 11, 10, 7
+    pmulhrsw           m11, [pw_11585x2]    ; t0
+    pmulhrsw           m10, [pw_11585x2]    ; t1
+
+    ; from 2 stages back
+    SUMSUB_BA            w,  8, 11, 7       ; t0,  t3
+%endif
+    SUMSUB_BA            w,  9, 10, 7       ; t1,  t2
+
+    ; from 1 stage back
+    SUMSUB_BA            w, 15,  8, 7       ; t0,  t7
+    SUMSUB_BA            w, 14,  9, 7       ; t1,  t6
+    SUMSUB_BA            w, 13, 10, 7       ; t2,  t5
+%endif
+    SUMSUB_BA            w, 12, 11, 7       ; t3,  t4
+
+    SUMSUB_BA            w,  0, 15, 7       ; t0, t15
+    SUMSUB_BA            w,  1, 14, 7       ; t1, t14
+    SUMSUB_BA            w,  2, 13, 7       ; t2, t13
+    SUMSUB_BA            w,  3, 12, 7       ; t3, t12
+    SUMSUB_BA            w,  4, 11, 7       ; t4, t11
+    SUMSUB_BA            w,  5, 10, 7       ; t5, t10
+%endmacro
+
+%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
+    VP9_IDCT16_1D_START %1, %3, 32, tmpq+32
+
+%if %2 == 1
+    ; backup a different register
+    mova         [tmpq+16], m15
+    mova                m7, [tmpq+32]
+
+    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+
+    TRANSPOSE8x8W        0, 1, 2, 3, 4, 5, 6, 7, 15
+    mova        [tmpq+  0], m0
+    mova        [tmpq+ 32], m1
+    mova        [tmpq+ 64], m2
+    mova        [tmpq+ 96], m3
+    mova        [tmpq+128], m4
+    mova        [tmpq+160], m5
+    mova        [tmpq+192], m6
+    mova        [tmpq+224], m7
+
+    mova               m15, [tmpq+16]
+    TRANSPOSE8x8W        8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova        [tmpq+ 16], m8
+    mova        [tmpq+ 48], m9
+    mova        [tmpq+ 80], m10
+    mova        [tmpq+112], m11
+    mova        [tmpq+144], m12
+    mova        [tmpq+176], m13
+    mova        [tmpq+208], m14
+    mova        [tmpq+240], m15
+%else ; %2 == 2
+    ; backup more registers
+    mova         [tmpq+64], m8
+    mova         [tmpq+96], m9
+
+    pxor                m7, m7
+    pmulhrsw            m0, [pw_512]
+    pmulhrsw            m1, [pw_512]
+    VP9_STORE_2X         0,  1,  8,  9,  7
+    lea               dstq, [dstq+strideq*2]
+    pmulhrsw            m2, [pw_512]
+    pmulhrsw            m3, [pw_512]
+    VP9_STORE_2X         2,  3,  8,  9,  7
+    lea               dstq, [dstq+strideq*2]
+    pmulhrsw            m4, [pw_512]
+    pmulhrsw            m5, [pw_512]
+    VP9_STORE_2X         4,  5,  8,  9,  7
+    lea               dstq, [dstq+strideq*2]
+
+    ; restore from cache
+    SWAP                 0, 7               ; move zero from m7 to m0
+    mova                m7, [tmpq+32]
+    mova                m8, [tmpq+64]
+    mova                m9, [tmpq+96]
+
+    SUMSUB_BA            w,  6,  9, 1       ; t6, t9
+    SUMSUB_BA            w,  7,  8, 1       ; t7, t8
+
+    pmulhrsw            m6, [pw_512]
+    pmulhrsw            m7, [pw_512]
+    VP9_STORE_2X         6,  7,  1,  2,  0
+    lea               dstq, [dstq+strideq*2]
+    pmulhrsw            m8, [pw_512]
+    pmulhrsw            m9, [pw_512]
+    VP9_STORE_2X         8,  9,  1,  2,  0
+    lea               dstq, [dstq+strideq*2]
+    pmulhrsw           m10, [pw_512]
+    pmulhrsw           m11, [pw_512]
+    VP9_STORE_2X        10, 11,  1,  2,  0
+    lea               dstq, [dstq+strideq*2]
+    pmulhrsw           m12, [pw_512]
+    pmulhrsw           m13, [pw_512]
+    VP9_STORE_2X        12, 13,  1,  2,  0
+    lea               dstq, [dstq+strideq*2]
+    pmulhrsw           m14, [pw_512]
+    pmulhrsw           m15, [pw_512]
+    VP9_STORE_2X        14, 15,  1,  2,  0
+%endif ; %2 == 1/2
+%endmacro
+
+%macro VP9_STORE_2XFULL 6-7 strideq; dc, tmp1, tmp2, tmp3, tmp4, zero, stride
+    mova               m%3, [dstq]
+    mova               m%5, [dstq+%7]
+    punpcklbw          m%2, m%3, m%6
+    punpckhbw          m%3, m%6
+    punpcklbw          m%4, m%5, m%6
+    punpckhbw          m%5, m%6
+    paddw              m%2, m%1
+    paddw              m%3, m%1
+    paddw              m%4, m%1
+    paddw              m%5, m%1
+    packuswb           m%2, m%3
+    packuswb           m%4, m%5
+    mova            [dstq], m%2
+    mova         [dstq+%7], m%4
+%endmacro
+
+%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
+    ; 2x2=eob=3, 4x4=eob=10
+    cmp eobd, 38
+    jg .idctfull
+    cmp eobd, 1 ; faster path for when only DC is set
+    jne .idct8x8
+
+    ; dc-only
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+    SPLATW              m0, m0, q0000
+    pmulhrsw            m0, [pw_512]
+    pxor                m5, m5
+    movd          [blockq], m5
+%rep 7
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    lea               dstq, [dstq+2*strideq]
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5
+    RET
+
+    DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
+.idct8x8:
+    mov               tmpq, rsp
+    VP9_IDCT16_1D   blockq, 1, 8
+
+    mov               cntd, 2
+    mov           dst_bakq, dstq
+.loop2_8x8:
+    VP9_IDCT16_1D     tmpq, 2, 8
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 8, m0
+    RET
+
+.idctfull:
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT16_1D   blockq, 1
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_IDCT16_1D     tmpq, 2
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
+VP9_IDCT_IDCT_16x16_ADD_XMM avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IADST16_1D 2 ; src, pass
+%assign %%str 16*%2
+    mova                m0, [%1+ 0*32]  ; in0
+    mova                m1, [%1+15*32]  ; in15
+    mova                m8, [%1+ 7*32]  ; in7
+    mova                m9, [%1+ 8*32]  ; in8
+
+    VP9_UNPACK_MULSUB_2D_4X  1,  0,  2,  3, 16364,   804    ; m1/2=t1[d], m0/3=t0[d]
+    VP9_UNPACK_MULSUB_2D_4X  8,  9, 11, 10, 11003, 12140    ; m8/11=t9[d], m9/10=t8[d]
+    VP9_RND_SH_SUMSUB_BA     9,  0, 10,  3,  4, [pd_8192]   ; m9=t0[w], m0=t8[w]
+    VP9_RND_SH_SUMSUB_BA     8,  1, 11,  2,  4, [pd_8192]   ; m8=t1[w], m1=t9[w]
+
+    mova               m11, [%1+ 2*32]  ; in2
+    mova               m10, [%1+13*32]  ; in13
+    mova                m3, [%1+ 5*32]  ; in5
+    mova                m2, [%1+10*32]  ; in10
+
+    VP9_UNPACK_MULSUB_2D_4X 10, 11,  6,  7, 15893,  3981    ; m10/6=t3[d], m11/7=t2[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  4,  5,  8423, 14053    ; m3/4=t11[d], m2/5=t10[d]
+    VP9_RND_SH_SUMSUB_BA     2, 11,  5,  7, 12, [pd_8192]   ; m2=t2[w], m11=t10[w]
+    VP9_RND_SH_SUMSUB_BA     3, 10,  4,  6, 12, [pd_8192]   ; m3=t3[w], m10=t11[w]
+
+    mova   [tmpq+ 0*%%str], m9          ; make some scratch space (t0:m9->r0)
+    mova                m4, [%1+ 4*32]  ; in4
+    mova                m5, [%1+11*32]  ; in11
+    mova               m12, [%1+ 3*32]  ; in3
+    mova               m13, [%1+12*32]  ; in12
+
+    VP9_UNPACK_MULSUB_2D_4X  5,  4,  7,  6, 14811,  7005    ; m5/7=t5[d], m4/6=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X 12, 13, 14, 15,  5520, 15426    ; m12/14=t13[d], m13/15=t12[d]
+    VP9_RND_SH_SUMSUB_BA    13,  4, 15,  6,  9, [pd_8192]   ; m13=t4[w], m4=t12[w]
+    VP9_RND_SH_SUMSUB_BA    12,  5, 14,  7,  9, [pd_8192]   ; m12=t5[w], m5=t13[w]
+
+    mova   [tmpq+ 2*%%str], m8          ; t1:m9->r2
+    mova   [tmpq+ 3*%%str], m2          ; t2:m2->r3
+    mova   [tmpq+ 4*%%str], m3          ; t3:m3->r4
+    mova   [tmpq+ 5*%%str], m13         ; t4:m13->r5
+    mova                m2, [%1+ 6*32]  ; in6
+    mova                m3, [%1+ 9*32]  ; in9
+    mova                m8, [%1+ 1*32]  ; in1
+    mova                m9, [%1+14*32]  ; in14
+
+    VP9_UNPACK_MULSUB_2D_4X  3,  2,  7,  6, 13160,  9760    ; m3/7=t7[d], m2/6=t6[d]
+    VP9_UNPACK_MULSUB_2D_4X  8,  9, 13, 14,  2404, 16207    ; m8/13=t15[d], m9/14=t14[d]
+    VP9_RND_SH_SUMSUB_BA     9,  2, 14,  6, 15, [pd_8192]   ; m9=t6[w], m2=t14[w]
+    VP9_RND_SH_SUMSUB_BA     8,  3, 13,  7, 15, [pd_8192]   ; m8=t7[w], m3=t15[w]
+
+    ; r0=t0, r2=t1, r3=t2, r4=t3, r5=t4, m12=t5, m9=t6, m8=t7
+    ; m0=t8, m1=t9, m11=t10, m10=t11, m4=t12, m5=t13, m2=t14, m3=t15
+
+    ; handle t8-15 first
+    VP9_UNPACK_MULSUB_2D_4X  0,  1,  6,  7, 16069,  3196    ; m1/7=t8[d], m0/6=t9[d]
+    VP9_UNPACK_MULSUB_2D_4X  5,  4, 13, 14,  3196, 16069    ; m5/13=t12[d], m4/14=t13[d]
+    VP9_RND_SH_SUMSUB_BA     5,  1, 13,  7, 15, [pd_8192]   ; m5=t8[w], m1=t12[w]
+    VP9_RND_SH_SUMSUB_BA     4,  0, 14,  6, 15, [pd_8192]   ; m4=t9[w], m0=t13[w]
+
+    VP9_UNPACK_MULSUB_2D_4X 11, 10,  6,  7,  9102, 13623    ; m11/6=t11[d], m10/7=t10[d]
+    VP9_UNPACK_MULSUB_2D_4X  3,  2, 13, 14, 13623,  9102    ; m3/13=t14[d], m2/14=t15[d]
+    VP9_RND_SH_SUMSUB_BA     3, 10, 13,  7, 15, [pd_8192]   ; m3=t10[w], m10=t14[w]
+    VP9_RND_SH_SUMSUB_BA     2, 11, 14,  6, 15, [pd_8192]   ; m2=t11[w], m11=t15[w]
+
+    ; m5=t8, m4=t9, m3=t10, m2=t11, m1=t12, m0=t13, m10=t14, m11=t15
+
+    VP9_UNPACK_MULSUB_2D_4X  1,  0,  6,  7, 15137,  6270    ; m1/6=t13[d], m0/7=t12[d]
+    VP9_UNPACK_MULSUB_2D_4X 11, 10, 13, 14,  6270, 15137    ; m11/13=t14[d], m10/14=t15[d]
+    VP9_RND_SH_SUMSUB_BA    11,  0, 13,  7, 15, [pd_8192]   ; m11=out2[w], m0=t14[w]
+    VP9_RND_SH_SUMSUB_BA    10,  1, 14,  6, 15, [pd_8192]
+    psignw                 m10, [pw_m1]                     ; m10=out13[w], m1=t15[w]
+
+    SUMSUB_BA                w,  3,  5, 15
+    psignw                  m3, [pw_m1]                     ; m3=out1[w], m5=t10[w]
+    SUMSUB_BA                w,  2,  4, 15                  ; m2=out14[w], m4=t11[w]
+
+    SUMSUB_BA                w,  5,  4, 15
+    pmulhrsw                m5, [pw_11585x2]                ; m5=out6[w]
+    pmulhrsw                m4, [pw_11585x2]                ; m4=out9[w]
+    SUMSUB_BA                w,  1,  0, 15
+    pmulhrsw                m1, [pw_m11585x2]               ; m1=out5[w]
+    pmulhrsw                m0, [pw_11585x2]                ; m0=out10[w]
+
+    ; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
+
+    mova                    m6, [tmpq+ 0*%%str]
+    mova                    m7, [tmpq+ 2*%%str]
+    mova                   m13, [tmpq+ 3*%%str]
+    mova                   m14, [tmpq+ 4*%%str]
+    mova                   m15, [tmpq+ 5*%%str]
+    mova       [tmpq+ 8*%%str], m5
+    mova       [tmpq+ 9*%%str], m4
+    mova       [tmpq+10*%%str], m0
+    mova       [tmpq+11*%%str], m10
+    mova       [tmpq+12*%%str], m2
+
+    ; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
+    ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
+
+    SUMSUB_BA                w, 15,  6,  0                  ; m15=t0[w], m6=t4[w]
+    SUMSUB_BA                w, 12,  7,  0                  ; m12=t1[w], m7=t5[w]
+    SUMSUB_BA                w,  9, 13,  0                  ; m9=t2[w], m13=t6[w]
+    SUMSUB_BA                w,  8, 14,  0                  ; m8=t3[w], m14=t7[w]
+
+    VP9_UNPACK_MULSUB_2D_4X  6,  7,  0,  2, 15137,  6270    ; m6/0=t5[d], m7/2=t4[d]
+    VP9_UNPACK_MULSUB_2D_4X 14, 13,  4,  5,  6270, 15137    ; m14/4=t6[d], m13/5=t7[d]
+    VP9_RND_SH_SUMSUB_BA    14,  7,  4,  2, 10, [pd_8192]
+    psignw                 m14, [pw_m1]                     ; m14=out3[w], m7=t6[w]
+    VP9_RND_SH_SUMSUB_BA    13,  6,  5,  0, 10, [pd_8192]   ; m13=out12[w], m6=t7[w]
+    SUMSUB_BA                w,  9, 15, 10                  ; m9=out0[w], m15=t2[w]
+    SUMSUB_BA                w,  8, 12, 10
+    psignw                  m8, [pw_m1]                     ; m8=out15[w], m12=t3[w]
+
+    SUMSUB_BA                w, 12, 15, 10
+    pmulhrsw               m12, [pw_m11585x2]               ; m12=out7[w]
+    pmulhrsw               m15, [pw_11585x2]                ; m15=out8[w]
+    SUMSUB_BA                w,  7,  6, 10
+    pmulhrsw                m7, [pw_11585x2]                ; m7=out4[w]
+    pmulhrsw                m6, [pw_11585x2]                ; m6=out11[w]
+
+    ; m9=out0, m14=out3, m7=out4, m12=out7, m15=out8, m6=out11, m13=out12, m8=out15
+    ; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
+
+%if %2 == 1
+    mova                    m0, [tmpq+ 8*%%str]
+    TRANSPOSE8x8W            9, 3, 11, 14, 7, 1, 0, 12, 2
+    mova          [tmpq+ 0*16], m9
+    mova          [tmpq+ 2*16], m3
+    mova          [tmpq+ 4*16], m11
+    mova          [tmpq+ 6*16], m14
+    mova                    m9, [tmpq+ 9*%%str]
+    mova                    m3, [tmpq+10*%%str]
+    mova                   m11, [tmpq+11*%%str]
+    mova                   m14, [tmpq+12*%%str]
+    mova          [tmpq+ 8*16], m7
+    mova          [tmpq+10*16], m1
+    mova          [tmpq+12*16], m0
+    mova          [tmpq+14*16], m12
+
+    TRANSPOSE8x8W           15, 9, 3, 6, 13, 11, 14, 8, 2
+    mova          [tmpq+ 1*16], m15
+    mova          [tmpq+ 3*16], m9
+    mova          [tmpq+ 5*16], m3
+    mova          [tmpq+ 7*16], m6
+    mova          [tmpq+ 9*16], m13
+    mova          [tmpq+11*16], m11
+    mova          [tmpq+13*16], m14
+    mova          [tmpq+15*16], m8
+%else
+    mova                    m5, [tmpq+ 8*%%str]
+    pxor                    m0, m0
+
+    pmulhrsw                m9, [pw_512]
+    pmulhrsw                m3, [pw_512]
+    VP9_STORE_2X             9,  3, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+    pmulhrsw               m11, [pw_512]
+    pmulhrsw               m14, [pw_512]
+    VP9_STORE_2X            11, 14, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+    pmulhrsw                m7, [pw_512]
+    pmulhrsw                m1, [pw_512]
+    VP9_STORE_2X             7,  1, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+    pmulhrsw                m5, [pw_512]
+    pmulhrsw               m12, [pw_512]
+    VP9_STORE_2X             5, 12, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+
+    mova                    m9, [tmpq+ 9*%%str]
+    mova                    m3, [tmpq+10*%%str]
+    mova                   m11, [tmpq+11*%%str]
+    mova                   m14, [tmpq+12*%%str]
+
+    pmulhrsw               m15, [pw_512]
+    pmulhrsw                m9, [pw_512]
+    VP9_STORE_2X            15,  9, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+    pmulhrsw                m3, [pw_512]
+    pmulhrsw                m6, [pw_512]
+    VP9_STORE_2X             3,  6, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+    pmulhrsw               m13, [pw_512]
+    pmulhrsw               m11, [pw_512]
+    VP9_STORE_2X            13, 11, 2, 4, 0
+    lea                   dstq, [dstq+strideq*2]
+    pmulhrsw               m14, [pw_512]
+    pmulhrsw                m8, [pw_512]
+    VP9_STORE_2X            14,  8, 2, 4, 0
+%endif
+%endmacro
+
+%macro IADST16_FN 5
+INIT_XMM %5
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_%2_1D       blockq, 1
+    add             blockq, 16
+    add               tmpq, 256
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 32
+
+    mov               cntd, 2
+    mov               tmpq, rsp
+    mov           dst_bakq, dstq
+.loop2_full:
+    VP9_%4_1D         tmpq, 2
+    lea               dstq, [dst_bakq+8]
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m0 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 32, 16, m0
+    RET
+%endmacro
+
+IADST16_FN idct,  IDCT16,  iadst, IADST16, ssse3
+IADST16_FN idct,  IDCT16,  iadst, IADST16, avx
+IADST16_FN iadst, IADST16, idct,  IDCT16,  ssse3
+IADST16_FN iadst, IADST16, idct,  IDCT16,  avx
+IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
+IADST16_FN iadst, IADST16, iadst, IADST16, avx
+
+;---------------------------------------------------------------------------------------------
+; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;---------------------------------------------------------------------------------------------
+
+%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
+%assign %%str 16*%2*%2
+    ; first do t0-15, this can be done identical to idct16x16
+    VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str
+
+    ; backup a different register
+    mova    [tmpq+30*%%str], m15    ; t15
+    mova                m7, [tmpq+ 4*%%str]
+
+    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
+    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
+
+    ; store everything on stack to make space available for t16-31
+    ; we store interleaved with the output of the second half (t16-31)
+    ; so we don't need to allocate extra stack space
+    mova    [tmpq+ 0*%%str], m0     ; t0
+    mova    [tmpq+ 4*%%str], m1     ; t1
+    mova    [tmpq+ 8*%%str], m2     ; t2
+    mova    [tmpq+12*%%str], m3     ; t3
+    mova    [tmpq+16*%%str], m4     ; t4
+    mova    [tmpq+20*%%str], m5     ; t5
+    mova    [tmpq+24*%%str], m6     ; t6
+    mova    [tmpq+28*%%str], m7     ; t7
+    mova    [tmpq+ 2*%%str], m8     ; t8
+    mova    [tmpq+ 6*%%str], m9     ; t9
+    mova    [tmpq+10*%%str], m10    ; t10
+    mova    [tmpq+14*%%str], m11    ; t11
+    mova    [tmpq+18*%%str], m12    ; t12
+    mova    [tmpq+22*%%str], m13    ; t13
+    mova    [tmpq+26*%%str], m14    ; t14
+
+    ; then, secondly, do t16-31
+%if %3 <= 8
+    mova                 m4, [%1+ 1*64]
+    mova                 m3, [%1+ 3*64]
+    mova                 m0, [%1+ 5*64]
+    mova                 m7, [%1+ 7*64]
+
+    pmulhrsw            m11,  m4, [pw_16364x2] ;t31
+    pmulhrsw             m4, [pw_804x2] ;t16
+    pmulhrsw             m8,  m7, [pw_m5520x2] ;t19
+    pmulhrsw             m7, [pw_15426x2] ;t28
+    pmulhrsw            m15,  m0, [pw_15893x2] ;t27
+    pmulhrsw             m0, [pw_3981x2] ;t20
+    pmulhrsw            m12,  m3, [pw_m2404x2] ;t23
+    pmulhrsw             m3, [pw_16207x2] ;t24
+
+    ; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23,
+    ; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31
+
+    VP9_UNPACK_MULSUB_2W_4X   5, 10, 11,  4, 16069,  3196, [pd_8192], 6,  9 ; t17, t30
+    VP9_UNPACK_MULSUB_2W_4X   9,  6,  7,  8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29
+    ; from 1 stage forward
+    SUMSUB_BA                 w,  8,  4,  1
+    ; temporary storage
+    mova    [tmpq+17*%%str], m8             ; t16
+    mova    [tmpq+21*%%str], m4             ; t19
+    VP9_UNPACK_MULSUB_2W_4X   1, 14, 15,  0,  9102, 13623, [pd_8192], 4,  8 ; t21, t26
+    VP9_UNPACK_MULSUB_2W_4X  13,  2,  3, 12, 13623, m9102, [pd_8192], 4,  8 ; t22, t25
+
+    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+%else
+    mova                m10, [%1+ 1*64]
+    mova                m13, [%1+ 3*64]
+    mova                m14, [%1+ 5*64]
+    mova                 m9, [%1+ 7*64]
+    mova                 m8, [%1+ 9*64]
+    mova                m15, [%1+11*64]
+    mova                m12, [%1+13*64]
+    mova                m11, [%1+15*64]
+%if %3 <= 16
+    pmulhrsw             m5, m10, [pw_16364x2]
+    pmulhrsw            m10, [pw_804x2]
+    pmulhrsw             m4, m11, [pw_m11003x2]
+    pmulhrsw            m11, [pw_12140x2]
+    pmulhrsw             m7,  m8, [pw_14811x2]
+    pmulhrsw             m8, [pw_7005x2]
+    pmulhrsw             m6,  m9, [pw_m5520x2]
+    pmulhrsw             m9, [pw_15426x2]
+    pmulhrsw             m1, m14, [pw_15893x2]
+    pmulhrsw            m14, [pw_3981x2]
+    pmulhrsw             m0, m15, [pw_m8423x2]
+    pmulhrsw            m15, [pw_14053x2]
+%else
+    mova                 m4, [%1+17*64]
+    mova                 m0, [%1+21*64]
+    mova                 m7, [%1+23*64]
+    mova                 m6, [%1+25*64]
+    mova                 m1, [%1+27*64]
+    mova                 m5, [%1+31*64]
+
+    ; m10=in1, m4=in17, m8=in9, m6=in25, m14=in5, m0=in21, m12=in13, m2=in29,
+    ; m13=in3, m3=in19, m15=in11, m1=in27, m9=in7, m7=in23, m11=in15, m5=in31
+
+    VP9_UNPACK_MULSUB_2W_4X  10,  5, 16364,   804, [pd_8192], 2, 3 ; t16, t31
+    VP9_UNPACK_MULSUB_2W_4X   4, 11, 11003, 12140, [pd_8192], 2, 3 ; t17, t30
+    VP9_UNPACK_MULSUB_2W_4X   8,  7, 14811,  7005, [pd_8192], 2, 3 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   6,  9,  5520, 15426, [pd_8192], 2, 3 ; t19, t28
+    VP9_UNPACK_MULSUB_2W_4X  14,  1, 15893,  3981, [pd_8192], 2, 3 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   0, 15,  8423, 14053, [pd_8192], 2, 3 ; t21, t26
+%endif
+
+    ; from 1 stage forward
+    SUMSUB_BA             w,  4, 10,  2
+    SUMSUB_BA             w,  8,  6,  2
+    ; from 2 stages forward
+    SUMSUB_BA             w,  8,  4,  2
+    ; temporary storage
+    mova    [tmpq+17*%%str], m8             ; t16
+    mova    [tmpq+21*%%str], m4             ; t19
+%if %3 <= 16
+    pmulhrsw             m3, m12, [pw_13160x2]
+    pmulhrsw            m12, [pw_9760x2]
+    pmulhrsw             m2, m13, [pw_m2404x2]
+    pmulhrsw            m13, [pw_16207x2]
+%else
+    mova                 m2, [%1+29*64]
+    mova                 m3, [%1+19*64]
+    VP9_UNPACK_MULSUB_2W_4X  12,  3, 13160,  9760, [pd_8192], 4, 8 ; t22, t25
+    VP9_UNPACK_MULSUB_2W_4X   2, 13,  2404, 16207, [pd_8192], 4, 8 ; t23, t24
+%endif
+
+    ; m10=t16, m4=t17, m8=t18, m6=t19, m14=t20, m0=t21, m12=t22, m2=t23,
+    ; m13=t24, m3=t25, m15=t26, m1=t27, m9=t28, m7=t29, m11=t30, m5=t31
+
+    SUMSUB_BA             w,  0, 14,  4
+    SUMSUB_BA             w, 12,  2,  4
+    SUMSUB_BA             w,  3, 13,  4
+    SUMSUB_BA             w, 15,  1,  4
+    SUMSUB_BA             w,  7,  9,  4
+    SUMSUB_BA             w, 11,  5,  4
+
+    ; m4=t16, m10=t17, m6=t18, m8=t19, m0=t20, m14=t21, m2=t22, m12=t23,
+    ; m3=t24, m13=t25, m1=t26, m15=t27, m7=t28, m9=t29, m5=t30, m11=t31
+
+    VP9_UNPACK_MULSUB_2W_4X   5, 10, 16069,  3196, [pd_8192], 4, 8 ; t17, t30
+    VP9_UNPACK_MULSUB_2W_4X   9,  6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X   1, 14,  9102, 13623, [pd_8192], 4, 8 ; t21, t26
+    VP9_UNPACK_MULSUB_2W_4X  13,  2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
+%endif
+
+    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+
+    SUMSUB_BA             w,  9,  5,  4
+    SUMSUB_BA             w,  1, 13,  4
+    SUMSUB_BA             w,  0, 12,  4
+    SUMSUB_BA             w, 15,  3,  4
+    SUMSUB_BA             w, 14,  2,  4
+    SUMSUB_BA             w,  6, 10,  4
+    SUMSUB_BA             w,  7, 11,  4
+
+    ; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
+    ; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
+
+    mova                 m8, [tmpq+17*%%str] ; t16
+    ; from 2 stages forward
+    SUMSUB_BA             w,  0,  8,  4
+    SUMSUB_BA             w, 15,  7,  4
+    ; from 3 stages forward
+    SUMSUB_BA             w,  8,  7,  4
+    pmulhrsw             m7, [pw_11585x2]
+    pmulhrsw             m8, [pw_11585x2]
+    ; store t16/t23
+    mova    [tmpq+ 1*%%str], m0     ; t16
+    mova    [tmpq+29*%%str], m7     ; t23
+
+    mova                 m4, [tmpq+21*%%str] ; t19
+    VP9_UNPACK_MULSUB_2W_4X  10,  5, 15137,  6270, [pd_8192], 0, 7 ; t18, t29
+    VP9_UNPACK_MULSUB_2W_4X  11,  4, 15137,  6270, [pd_8192], 0, 7 ; t19, t28
+    VP9_UNPACK_MULSUB_2W_4X   3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27
+    VP9_UNPACK_MULSUB_2W_4X   2, 13, 6270, m15137, [pd_8192], 0, 7 ; t21, t26
+
+    ; m8=t16, m9=t17, m10=t18, m11=t19, m3=t20, m2=t21, m1=t22, m0=t23,
+    ; m15=t24, m14=t25, m13=t26, m12=t27, m4=t28, m5=t29, m6=t30, m7=t31
+
+    SUMSUB_BA             w,  1,  9,  0
+    SUMSUB_BA             w,  2, 10,  0
+    SUMSUB_BA             w,  3, 11,  0
+    SUMSUB_BA             w, 12,  4,  0
+    SUMSUB_BA             w, 13,  5,  0
+    SUMSUB_BA             w, 14,  6,  0
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m11=t20, m10=t21, m9=t22, m8=t23,
+    ; m7=t24, m6=t25, m5=t26, m4=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+    SUMSUB_BA             w,  9,  6,  0
+    SUMSUB_BA             w, 10,  5,  0
+    SUMSUB_BA             w, 11,  4,  0
+
+    pmulhrsw             m6, [pw_11585x2]
+    pmulhrsw             m9, [pw_11585x2]
+    pmulhrsw             m5, [pw_11585x2]
+    pmulhrsw            m10, [pw_11585x2]
+    pmulhrsw             m4, [pw_11585x2]
+    pmulhrsw            m11, [pw_11585x2]
+
+    ; m0=t16, m1=t17, m2=t18, m3=t19, m4=t20, m5=t21, m6=t22, m7=t23,
+    ; m8=t24, m9=t25, m10=t26, m11=t27, m12=t28, m13=t29, m14=t30, m15=t31
+
+    ; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for
+    ; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for
+    ; final sumsub of pass 2
+    mova    [tmpq+ 5*%%str], m1     ; t17
+    mova    [tmpq+ 9*%%str], m2     ; t18
+    mova    [tmpq+13*%%str], m3     ; t19
+
+    ; then do final pass to sumsub+store the two halves
+%if %2 == 1
+    mova    [tmpq+17*%%str], m4     ; t20
+    mova    [tmpq+21*%%str], m5     ; t21
+    mova    [tmpq+25*%%str], m6     ; t22
+
+    mova                 m0, [tmpq+ 0*%%str] ; t0
+    mova                 m1, [tmpq+ 4*%%str] ; t1
+    mova                 m2, [tmpq+ 8*%%str] ; t2
+    mova                 m3, [tmpq+12*%%str] ; t3
+    mova                 m4, [tmpq+16*%%str] ; t4
+    mova                 m5, [tmpq+20*%%str] ; t5
+    mova                 m6, [tmpq+24*%%str] ; t6
+
+    SUMSUB_BA             w, 15,  0, 7
+    mova    [tmpq+ 3*%%str], m0              ; t15
+    mova                 m7, [tmpq+28*%%str] ; t7
+    SUMSUB_BA             w, 14,  1, 0
+    SUMSUB_BA             w, 13,  2, 0
+    SUMSUB_BA             w, 12,  3, 0
+    SUMSUB_BA             w, 11,  4, 0
+    SUMSUB_BA             w, 10,  5, 0
+    SUMSUB_BA             w,  9,  6, 0
+    SUMSUB_BA             w,  8,  7, 0
+
+    TRANSPOSE8x8W        15, 14, 13, 12, 11, 10, 9, 8, 0
+    mova    [tmpq+ 0*%%str], m15
+    mova    [tmpq+ 4*%%str], m14
+    mova    [tmpq+ 8*%%str], m13
+    mova    [tmpq+12*%%str], m12
+    mova    [tmpq+16*%%str], m11
+    mova    [tmpq+20*%%str], m10
+    mova    [tmpq+24*%%str], m9
+    mova    [tmpq+28*%%str], m8
+
+    mova                  m0, [tmpq+ 3*%%str] ; t15
+    TRANSPOSE8x8W          7, 6, 5, 4, 3, 2, 1, 0, 8
+    mova    [tmpq+ 3*%%str], m7
+    mova    [tmpq+ 7*%%str], m6
+    mova    [tmpq+11*%%str], m5
+    mova    [tmpq+15*%%str], m4
+    mova    [tmpq+19*%%str], m3
+    mova    [tmpq+23*%%str], m2
+    mova    [tmpq+27*%%str], m1
+    mova    [tmpq+31*%%str], m0
+
+    mova                m15, [tmpq+ 2*%%str] ; t8
+    mova                m14, [tmpq+ 6*%%str] ; t9
+    mova                m13, [tmpq+10*%%str] ; t10
+    mova                m12, [tmpq+14*%%str] ; t11
+    mova                m11, [tmpq+18*%%str] ; t12
+    mova                m10, [tmpq+22*%%str] ; t13
+    mova                 m9, [tmpq+26*%%str] ; t14
+    mova                 m8, [tmpq+30*%%str] ; t15
+    mova                 m7, [tmpq+ 1*%%str] ; t16
+    mova                 m6, [tmpq+ 5*%%str] ; t17
+    mova                 m5, [tmpq+ 9*%%str] ; t18
+    mova                 m4, [tmpq+13*%%str] ; t19
+    mova                 m3, [tmpq+17*%%str] ; t20
+    mova                 m2, [tmpq+21*%%str] ; t21
+    mova                 m1, [tmpq+25*%%str] ; t22
+
+    SUMSUB_BA             w,  7,  8, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova                 m0, [tmpq+29*%%str] ; t23
+    SUMSUB_BA             w,  6,  9, 8
+    SUMSUB_BA             w,  5, 10, 8
+    SUMSUB_BA             w,  4, 11, 8
+    SUMSUB_BA             w,  3, 12, 8
+    SUMSUB_BA             w,  2, 13, 8
+    SUMSUB_BA             w,  1, 14, 8
+    SUMSUB_BA             w,  0, 15, 8
+
+    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 8
+    mova    [tmpq+ 1*%%str], m0
+    mova    [tmpq+ 5*%%str], m1
+    mova    [tmpq+ 9*%%str], m2
+    mova    [tmpq+13*%%str], m3
+    mova    [tmpq+17*%%str], m4
+    mova    [tmpq+21*%%str], m5
+    mova    [tmpq+25*%%str], m6
+    mova    [tmpq+29*%%str], m7
+
+    mova                 m8, [tmpq+ 2*%%str]
+    TRANSPOSE8x8W         8, 9, 10, 11, 12, 13, 14, 15, 0
+    mova    [tmpq+ 2*%%str], m8
+    mova    [tmpq+ 6*%%str], m9
+    mova    [tmpq+10*%%str], m10
+    mova    [tmpq+14*%%str], m11
+    mova    [tmpq+18*%%str], m12
+    mova    [tmpq+22*%%str], m13
+    mova    [tmpq+26*%%str], m14
+    mova    [tmpq+30*%%str], m15
+%else
+    ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+    ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+    ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
+    ; t20-22 is in m4-6
+    ; t24-31 is in m8-15
+    pxor                m7, m7
+
+%macro %%STORE_2X2 7-8 1 ; src[1-4], tmp[1-2], zero, inc_dst_ptrs
+    SUMSUB_BA            w, %4, %1, %5
+    SUMSUB_BA            w, %3, %2, %5
+    pmulhrsw           m%4, [pw_512]
+    pmulhrsw           m%3, [pw_512]
+    VP9_STORE_2X        %4, %3, %5, %6, %7
+%if %8 == 1
+    add               dstq, stride2q
+%endif
+    pmulhrsw           m%2, [pw_512]
+    pmulhrsw           m%1, [pw_512]
+    VP9_STORE_2X        %2, %1, %5, %6, %7, dst_endq
+%if %8 == 1
+    sub           dst_endq, stride2q
+%endif
+%endmacro
+
+    ; store t0-1 and t30-31
+    mova                m0, [tmpq+ 0*%%str]
+    mova                m1, [tmpq+ 4*%%str]
+    %%STORE_2X2          0,  1, 14, 15, 2, 3, 7
+
+    ; store t2-3 and t28-29
+    mova                m0, [tmpq+ 8*%%str]
+    mova                m1, [tmpq+12*%%str]
+    %%STORE_2X2          0,  1, 12, 13, 2, 3, 7
+
+    ; store t4-5 and t26-27
+    mova                m0, [tmpq+16*%%str]
+    mova                m1, [tmpq+20*%%str]
+    %%STORE_2X2          0,  1, 10, 11, 2, 3, 7
+
+    ; store t6-7 and t24-25
+    mova                m0, [tmpq+24*%%str]
+    mova                m1, [tmpq+28*%%str]
+    %%STORE_2X2          0,  1,  8,  9, 2, 3, 7
+
+    ; store t8-9 and t22-23
+    mova                m0, [tmpq+ 2*%%str]
+    mova                m1, [tmpq+ 6*%%str]
+    mova                m8, [tmpq+29*%%str]
+    %%STORE_2X2          0,  1,  6,  8, 2, 3, 7
+
+    ; store t10-11 and t20-21
+    mova                m0, [tmpq+10*%%str]
+    mova                m1, [tmpq+14*%%str]
+    %%STORE_2X2          0,  1,  4,  5, 2, 3, 7
+
+    ; store t12-13 and t18-19
+    mova                m0, [tmpq+18*%%str]
+    mova                m1, [tmpq+22*%%str]
+    mova                m5, [tmpq+13*%%str]
+    mova                m4, [tmpq+ 9*%%str]
+    %%STORE_2X2          0,  1,  4,  5, 2, 3, 7
+
+    ; store t14-17
+    mova                m0, [tmpq+26*%%str]
+    mova                m1, [tmpq+30*%%str]
+    mova                m5, [tmpq+ 5*%%str]
+    mova                m4, [tmpq+ 1*%%str]
+    %%STORE_2X2          0,  1,  4,  5, 2, 3, 7, 0
+%endif
+%endmacro
+
+%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
+INIT_XMM %1
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
+    cmp eobd, 135
+    jg .idctfull
+    cmp eobd, 34
+    jg .idct16x16
+    cmp eobd, 1
+    jg .idct8x8
+
+    ; dc-only case
+    movd                m0, [blockq]
+    mova                m1, [pw_11585x2]
+    pmulhrsw            m0, m1
+    pmulhrsw            m0, m1
+    SPLATW              m0, m0, q0000
+    pmulhrsw            m0, [pw_512]
+    pxor                m5, m5
+    movd          [blockq], m5
+    DEFINE_ARGS        dst, stride, block, cnt
+%rep 31
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    add               dstq, strideq
+%endrep
+    VP9_STORE_2XFULL    0, 1, 2, 3, 4, 5, mmsize
+    RET
+
+    DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
+.idct8x8:
+    mov               tmpq, rsp
+    VP9_IDCT32_1D   blockq, 1, 8
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    sub          stride30q, stride2q        ; stride*30
+.loop2_8x8:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dst_bakq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 8
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_8x8
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 64,  8, m7
+    RET
+
+.idct16x16:
+    mov               cntd, 2
+    mov               tmpq, rsp
+.loop1_16x16:
+    VP9_IDCT32_1D   blockq, 1, 16
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_16x16
+    sub             blockq, 32
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_16x16:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dst_bakq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2, 16
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_16x16
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 64, 16, m7
+    RET
+
+.idctfull:
+    mov               cntd, 4
+    mov               tmpq, rsp
+.loop1_full:
+    VP9_IDCT32_1D   blockq, 1
+    add             blockq, 16
+    add               tmpq, 512
+    dec               cntd
+    jg .loop1_full
+    sub             blockq, 64
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    mov               tmpq, rsp
+    sub          stride30q, stride2q        ; stride*30
+.loop2_full:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dst_bakq+stride30q]
+    VP9_IDCT32_1D     tmpq, 2
+    add           dst_bakq, 8
+    add               tmpq, 16
+    dec               cntd
+    jg .loop2_full
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 64, 32, m7
+    RET
+%endmacro
+
+VP9_IDCT_IDCT_32x32_ADD_XMM ssse3
+VP9_IDCT_IDCT_32x32_ADD_XMM avx
+
+%endif ; x86-64
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
new file mode 100644
index 0000000000..416f08f090
--- /dev/null
+++ b/libavcodec/x86/vp9lpf.asm
@@ -0,0 +1,816 @@
+;******************************************************************************
+;* VP9 loop filter SIMD optimizations
+;*
+;* Copyright (C) 2013-2014 Clément Bœsch <u pkh me>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+cextern pb_3
+cextern pb_80
+
+pb_4:   times 16 db 0x04
+pb_10:  times 16 db 0x10
+pb_40:  times 16 db 0x40
+pb_81:  times 16 db 0x81
+pb_f8:  times 16 db 0xf8
+pb_fe:  times 16 db 0xfe
+
+cextern pw_4
+cextern pw_8
+
+; with mix functions, two 8-bit thresholds are stored in a 16-bit storage,
+; the following mask is used to splat both in the same register
+mask_mix: times 8 db 0
+          times 8 db 1
+
+mask_mix84: times 8 db 0xff
+            times 8 db 0x00
+mask_mix48: times 8 db 0x00
+            times 8 db 0xff
+
+SECTION .text
+
+; %1 = abs(%2-%3)
+%macro ABSSUB 4 ; dst, src1 (RO), src2 (RO), tmp
+    psubusb             %1, %3, %2
+    psubusb             %4, %2, %3
+    por                 %1, %4
+%endmacro
+
+; %1 = %1<=%2
+%macro CMP_LTE 3-4 ; src/dst, cmp, tmp, pb_80
+%if %0 == 4
+    pxor                %1, %4
+%endif
+    pcmpgtb             %3, %2, %1          ; cmp > src?
+    pcmpeqb             %1, %2              ; cmp == src? XXX: avoid this with a -1/+1 well placed?
+    por                 %1, %3              ; cmp >= src?
+%endmacro
+
+; %1 = abs(%2-%3) <= %4
+%macro ABSSUB_CMP 6-7 [pb_80]; dst, src1, src2, cmp, tmp1, tmp2, [pb_80]
+    ABSSUB              %1, %2, %3, %6      ; dst = abs(src1-src2)
+    CMP_LTE             %1, %4, %6, %7      ; dst <= cmp
+%endmacro
+
+%macro MASK_APPLY 4 ; %1=new_data/dst %2=old_data %3=mask %4=tmp
+    pand                %1, %3              ; new &= mask
+    pandn               %4, %3, %2          ; tmp = ~mask & old
+    por                 %1, %4              ; new&mask | old&~mask
+%endmacro
+
+%macro FILTER_SUBx2_ADDx2 8 ; %1=dst %2=h/l %3=cache %4=sub1 %5=sub2 %6=add1 %7=add2 %8=rshift
+    punpck%2bw          %3, %4, m0
+    psubw               %1, %3
+    punpck%2bw          %3, %5, m0
+    psubw               %1, %3
+    punpck%2bw          %3, %6, m0
+    paddw               %1, %3
+    punpck%2bw          %3, %7, m0
+    paddw               %3, %1
+    psraw               %1, %3, %8
+%endmacro
+
+%macro FILTER_INIT 8 ; tmp1, tmp2, cacheL, cacheH, dstp, filterid, mask, source
+    FILTER%6_INIT       %1, l, %3
+    FILTER%6_INIT       %2, h, %4
+    packuswb            %1, %2
+    MASK_APPLY          %1, %8, %7, %2
+    mova                %5, %1
+%endmacro
+
+%macro FILTER_UPDATE 11-14 ; tmp1, tmp2, cacheL, cacheH, dstp, -, -, +, +, rshift, mask, [source], [preload reg + value]
+%if %0 == 13 ; no source + preload
+    mova                %12, %13
+%elif %0 == 14 ; source + preload
+    mova                %13, %14
+%endif
+    FILTER_SUBx2_ADDx2  %1, l, %3, %6, %7, %8, %9, %10
+    FILTER_SUBx2_ADDx2  %2, h, %4, %6, %7, %8, %9, %10
+    packuswb            %1, %2
+%if %0 == 12 || %0 == 14
+    MASK_APPLY          %1, %12, %11, %2
+%else
+    MASK_APPLY          %1, %5, %11, %2
+%endif
+    mova                %5, %1
+%endmacro
+
+%macro SRSHIFT3B_2X 4 ; reg1, reg2, [pb_10], tmp
+    mova                %4, [pb_f8]
+    pand                %1, %4
+    pand                %2, %4
+    psrlq               %1, 3
+    psrlq               %2, 3
+    pxor                %1, %3
+    pxor                %2, %3
+    psubb               %1, %3
+    psubb               %2, %3
+%endmacro
+
+%macro EXTRACT_POS_NEG 3 ; i8, neg, pos
+    pxor                %3, %3
+    pxor                %2, %2
+    pcmpgtb             %3, %1                          ; i8 < 0 mask
+    psubb               %2, %1                          ; neg values (only the originally - will be kept)
+    pand                %2, %3                          ; negative values of i8 (but stored as +)
+    pandn               %3, %1                          ; positive values of i8
+%endmacro
+
+; clip_u8(u8 + i8)
+%macro SIGN_ADD 5 ; dst, u8, i8, tmp1, tmp2
+    EXTRACT_POS_NEG     %3, %4, %5
+    psubusb             %1, %2, %4                      ; sub the negatives
+    paddusb             %1, %5                          ; add the positives
+%endmacro
+
+; clip_u8(u8 - i8)
+%macro SIGN_SUB 5 ; dst, u8, i8, tmp1, tmp2
+    EXTRACT_POS_NEG     %3, %4, %5
+    psubusb             %1, %2, %5                      ; sub the positives
+    paddusb             %1, %4                          ; add the negatives
+%endmacro
+
+%macro FILTER6_INIT 3 ; %1=dst %2=h/l %3=cache
+    punpck%2bw          %1, m14, m0                     ; p3: B->W
+    paddw               %3, %1, %1                      ; p3*2
+    paddw               %3, %1                          ; p3*3
+    punpck%2bw          %1, m15, m0                     ; p2: B->W
+    paddw               %3, %1                          ; p3*3 + p2
+    paddw               %3, %1                          ; p3*3 + p2*2
+    punpck%2bw          %1, m10, m0                     ; p1: B->W
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1
+    punpck%2bw          %1, m11, m0                     ; p0: B->W
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0
+    punpck%2bw          %1, m12, m0                     ; q0: B->W
+    paddw               %3, %1                          ; p3*3 + p2*2 + p1 + p0 + q0
+    paddw               %3, [pw_4]                      ; p3*3 + p2*2 + p1 + p0 + q0 + 4
+    psraw               %1, %3, 3                       ; (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3
+%endmacro
+
+%macro FILTER14_INIT 3 ; %1=dst %2=h/l %3=cache
+    punpck%2bw          %1, m2, m0                      ; p7: B->W
+    psllw               %3, %1, 3                       ; p7*8
+    psubw               %3, %1                          ; p7*7
+    punpck%2bw          %1, m3, m0                      ; p6: B->W
+    paddw               %3, %1                          ; p7*7 + p6
+    paddw               %3, %1                          ; p7*7 + p6*2
+    punpck%2bw          %1, m8, m0                      ; p5: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5
+    punpck%2bw          %1, m9, m0                      ; p4: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + p4
+    punpck%2bw          %1, m14, m0                     ; p3: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + p4 + p3
+    punpck%2bw          %1, m15, m0                     ; p2: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + .. + p2
+    punpck%2bw          %1, m10, m0                     ; p1: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + .. + p1
+    punpck%2bw          %1, m11, m0                     ; p0: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + .. + p0
+    punpck%2bw          %1, m12, m0                     ; q0: B->W
+    paddw               %3, %1                          ; p7*7 + p6*2 + p5 + .. + p0 + q0
+    paddw               %3, [pw_8]                      ; p7*7 + p6*2 + p5 + .. + p0 + q0 + 8
+    psraw               %1, %3, 4                       ; (p7*7 + p6*2 + p5 + .. + p0 + q0 + 8) >> 4
+%endmacro
+
+%macro TRANSPOSE16x16B 17
+    mova %17, m%16
+    SBUTTERFLY bw,  %1,  %2,  %16
+    SBUTTERFLY bw,  %3,  %4,  %16
+    SBUTTERFLY bw,  %5,  %6,  %16
+    SBUTTERFLY bw,  %7,  %8,  %16
+    SBUTTERFLY bw,  %9,  %10, %16
+    SBUTTERFLY bw,  %11, %12, %16
+    SBUTTERFLY bw,  %13, %14, %16
+    mova m%16,  %17
+    mova  %17, m%14
+    SBUTTERFLY bw,  %15, %16, %14
+    SBUTTERFLY wd,  %1,  %3,  %14
+    SBUTTERFLY wd,  %2,  %4,  %14
+    SBUTTERFLY wd,  %5,  %7,  %14
+    SBUTTERFLY wd,  %6,  %8,  %14
+    SBUTTERFLY wd,  %9,  %11, %14
+    SBUTTERFLY wd,  %10, %12, %14
+    SBUTTERFLY wd,  %13, %15, %14
+    mova m%14,  %17
+    mova  %17, m%12
+    SBUTTERFLY wd,  %14, %16, %12
+    SBUTTERFLY dq,  %1,  %5,  %12
+    SBUTTERFLY dq,  %2,  %6,  %12
+    SBUTTERFLY dq,  %3,  %7,  %12
+    SBUTTERFLY dq,  %4,  %8,  %12
+    SBUTTERFLY dq,  %9,  %13, %12
+    SBUTTERFLY dq,  %10, %14, %12
+    SBUTTERFLY dq,  %11, %15, %12
+    mova m%12, %17
+    mova  %17, m%8
+    SBUTTERFLY dq,  %12, %16, %8
+    SBUTTERFLY qdq, %1,  %9,  %8
+    SBUTTERFLY qdq, %2,  %10, %8
+    SBUTTERFLY qdq, %3,  %11, %8
+    SBUTTERFLY qdq, %4,  %12, %8
+    SBUTTERFLY qdq, %5,  %13, %8
+    SBUTTERFLY qdq, %6,  %14, %8
+    SBUTTERFLY qdq, %7,  %15, %8
+    mova m%8, %17
+    mova %17, m%1
+    SBUTTERFLY qdq, %8,  %16, %1
+    mova m%1, %17
+    SWAP %2,  %9
+    SWAP %3,  %5
+    SWAP %4,  %13
+    SWAP %6,  %11
+    SWAP %8,  %15
+    SWAP %12, %14
+%endmacro
+
+; transpose 16 half lines (high part) to 8 full centered lines
+%macro TRANSPOSE16x8B 16
+    punpcklbw   m%1,  m%2
+    punpcklbw   m%3,  m%4
+    punpcklbw   m%5,  m%6
+    punpcklbw   m%7,  m%8
+    punpcklbw   m%9,  m%10
+    punpcklbw   m%11, m%12
+    punpcklbw   m%13, m%14
+    punpcklbw   m%15, m%16
+    SBUTTERFLY  wd,  %1,  %3,  %2
+    SBUTTERFLY  wd,  %5,  %7,  %2
+    SBUTTERFLY  wd,  %9,  %11, %2
+    SBUTTERFLY  wd,  %13, %15, %2
+    SBUTTERFLY  dq,  %1,  %5,  %2
+    SBUTTERFLY  dq,  %3,  %7,  %2
+    SBUTTERFLY  dq,  %9,  %13, %2
+    SBUTTERFLY  dq,  %11, %15, %2
+    SBUTTERFLY  qdq, %1,  %9,  %2
+    SBUTTERFLY  qdq, %3,  %11, %2
+    SBUTTERFLY  qdq, %5,  %13, %2
+    SBUTTERFLY  qdq, %7,  %15, %2
+    SWAP %5, %1
+    SWAP %6, %9
+    SWAP %7, %1
+    SWAP %8, %13
+    SWAP %9, %3
+    SWAP %10, %11
+    SWAP %11, %1
+    SWAP %12, %15
+%endmacro
+
+%macro DEFINE_REAL_P7_TO_Q7 0-1 0
+%define P7 dst1q + 2*mstrideq  + %1
+%define P6 dst1q +   mstrideq  + %1
+%define P5 dst1q               + %1
+%define P4 dst1q +    strideq  + %1
+%define P3 dstq  + 4*mstrideq  + %1
+%define P2 dstq  +   mstride3q + %1
+%define P1 dstq  + 2*mstrideq  + %1
+%define P0 dstq  +   mstrideq  + %1
+%define Q0 dstq                + %1
+%define Q1 dstq  +   strideq   + %1
+%define Q2 dstq  + 2*strideq   + %1
+%define Q3 dstq  +   stride3q  + %1
+%define Q4 dstq  + 4*strideq   + %1
+%define Q5 dst2q + mstrideq    + %1
+%define Q6 dst2q               + %1
+%define Q7 dst2q +  strideq    + %1
+%endmacro
+
+; ..............AB -> AAAAAAAABBBBBBBB
+%macro SPLATB_MIX 1-2 [mask_mix]
+%if cpuflag(ssse3)
+    pshufb     %1, %2
+%else
+    punpcklbw  %1, %1
+    punpcklwd  %1, %1
+    punpckldq  %1, %1
+%endif
+%endmacro
+
+%macro LOOPFILTER 2 ; %1=v/h %2=size1
+    lea mstrideq, [strideq]
+    neg mstrideq
+
+    lea stride3q, [strideq+2*strideq]
+    mov mstride3q, stride3q
+    neg mstride3q
+
+%ifidn %1, h
+%if %2 > 16
+%define movx movh
+    lea dstq, [dstq + 8*strideq - 4]
+%else
+%define movx movu
+    lea dstq, [dstq + 8*strideq - 8] ; go from top center (h pos) to center left (v pos)
+%endif
+%endif
+
+    lea dst1q, [dstq + 2*mstride3q]                         ; dst1q = &dst[stride * -6]
+    lea dst2q, [dstq + 2* stride3q]                         ; dst2q = &dst[stride * +6]
+
+    DEFINE_REAL_P7_TO_Q7
+
+%ifidn %1, h
+    movx                    m0, [P7]
+    movx                    m1, [P6]
+    movx                    m2, [P5]
+    movx                    m3, [P4]
+    movx                    m4, [P3]
+    movx                    m5, [P2]
+    movx                    m6, [P1]
+    movx                    m7, [P0]
+    movx                    m8, [Q0]
+    movx                    m9, [Q1]
+    movx                   m10, [Q2]
+    movx                   m11, [Q3]
+    movx                   m12, [Q4]
+    movx                   m13, [Q5]
+    movx                   m14, [Q6]
+    movx                   m15, [Q7]
+%define P7 rsp +   0
+%define P6 rsp +  16
+%define P5 rsp +  32
+%define P4 rsp +  48
+%define P3 rsp +  64
+%define P2 rsp +  80
+%define P1 rsp +  96
+%define P0 rsp + 112
+%define Q0 rsp + 128
+%define Q1 rsp + 144
+%define Q2 rsp + 160
+%define Q3 rsp + 176
+%define Q4 rsp + 192
+%define Q5 rsp + 208
+%define Q6 rsp + 224
+%define Q7 rsp + 240
+
+%if %2 == 16
+    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+    mova           [P7],  m0
+    mova           [P6],  m1
+    mova           [P5],  m2
+    mova           [P4],  m3
+%else
+    TRANSPOSE16x8B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%endif
+    mova           [P3],  m4
+    mova           [P2],  m5
+    mova           [P1],  m6
+    mova           [P0],  m7
+    mova           [Q0],  m8
+    mova           [Q1],  m9
+    mova           [Q2], m10
+    mova           [Q3], m11
+%if %2 == 16
+    mova           [Q4], m12
+    mova           [Q5], m13
+    mova           [Q6], m14
+    mova           [Q7], m15
+%endif
+%endif
+
+    ; calc fm mask
+%if %2 == 16
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m2, I, m0                       ; I I I I ...
+    SPLATB_REG          m3, E, m0                       ; E E E E ...
+%else
+%if cpuflag(ssse3)
+    mova                m0, [mask_mix]
+%endif
+    movd                m2, Id
+    movd                m3, Ed
+    SPLATB_MIX          m2, m0
+    SPLATB_MIX          m3, m0
+%endif
+    mova                m0, [pb_80]
+    pxor                m2, m0
+    pxor                m3, m0
+%ifidn %1, v
+    mova                m8, [P3]
+    mova                m9, [P2]
+    mova               m10, [P1]
+    mova               m11, [P0]
+    mova               m12, [Q0]
+    mova               m13, [Q1]
+    mova               m14, [Q2]
+    mova               m15, [Q3]
+%else
+    ; In case of horizontal, P3..Q3 are already present in some registers due
+    ; to the previous transpose, so we just swap registers.
+    SWAP                 8,  4, 12
+    SWAP                 9,  5, 13
+    SWAP                10,  6, 14
+    SWAP                11,  7, 15
+%endif
+    ABSSUB_CMP          m5,  m8,  m9, m2, m6, m7, m0    ; m5 = abs(p3-p2) <= I
+    ABSSUB_CMP          m1,  m9, m10, m2, m6, m7, m0    ; m1 = abs(p2-p1) <= I
+    pand                m5, m1
+    ABSSUB_CMP          m1, m10, m11, m2, m6, m7, m0    ; m1 = abs(p1-p0) <= I
+    pand                m5, m1
+    ABSSUB_CMP          m1, m12, m13, m2, m6, m7, m0    ; m1 = abs(q1-q0) <= I
+    pand                m5, m1
+    ABSSUB_CMP          m1, m13, m14, m2, m6, m7, m0    ; m1 = abs(q2-q1) <= I
+    pand                m5, m1
+    ABSSUB_CMP          m1, m14, m15, m2, m6, m7, m0    ; m1 = abs(q3-q2) <= I
+    pand                m5, m1
+    ABSSUB              m1, m11, m12, m7                ; abs(p0-q0)
+    paddusb             m1, m1                          ; abs(p0-q0) * 2
+    ABSSUB              m2, m10, m13, m7                ; abs(p1-q1)
+    pand                m2, [pb_fe]                     ; drop lsb so shift can work
+    psrlq               m2, 1                           ; abs(p1-q1)/2
+    paddusb             m1, m2                          ; abs(p0-q0)*2 + abs(p1-q1)/2
+    pxor                m1, m0
+    pcmpgtb             m4, m3, m1                      ; E > X?
+    pcmpeqb             m3, m1                          ; E == X?
+    por                 m3, m4                          ; E >= X?
+    pand                m3, m5                          ; fm final value
+
+    ; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
+    ; calc flat8in (if not 44_16) and hev masks
+    mova                m6, [pb_81]                     ; [1 1 1 1 ...] ^ 0x80
+%if %2 != 44
+    ABSSUB_CMP          m2, m8, m11, m6, m4, m5         ; abs(p3 - p0) <= 1
+    mova                m8, [pb_80]
+    ABSSUB_CMP          m1, m9, m11, m6, m4, m5, m8     ; abs(p2 - p0) <= 1
+    pand                m2, m1
+    ABSSUB              m4, m10, m11, m5                ; abs(p1 - p0)
+%if %2 == 16
+%if cpuflag(ssse3)
+    pxor                m0, m0
+%endif
+    SPLATB_REG          m7, H, m0                       ; H H H H ...
+%else
+    movd                m7, Hd
+    SPLATB_MIX          m7
+%endif
+    pxor                m7, m8
+    pxor                m4, m8
+    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
+    CMP_LTE             m4, m6, m5                      ; abs(p1 - p0) <= 1
+    pand                m2, m4                          ; (flat8in)
+    ABSSUB              m4, m13, m12, m1                ; abs(q1 - q0)
+    pxor                m4, m8
+    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
+    por                 m0, m5                          ; hev final value
+    CMP_LTE             m4, m6, m5                      ; abs(q1 - q0) <= 1
+    pand                m2, m4                          ; (flat8in)
+    ABSSUB_CMP          m1, m14, m12, m6, m4, m5, m8    ; abs(q2 - q0) <= 1
+    pand                m2, m1
+    ABSSUB_CMP          m1, m15, m12, m6, m4, m5, m8    ; abs(q3 - q0) <= 1
+    pand                m2, m1                          ; flat8in final value
+%if %2 == 84 || %2 == 48
+    pand                m2, [mask_mix%2]
+%endif
+%else
+    mova                m6, [pb_80]
+    movd                m7, Hd
+    SPLATB_MIX          m7
+    pxor                m7, m6
+    ABSSUB              m4, m10, m11, m1                ; abs(p1 - p0)
+    pxor                m4, m6
+    pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
+    ABSSUB              m4, m13, m12, m1                ; abs(q1 - q0)
+    pxor                m4, m6
+    pcmpgtb             m5, m4, m7                      ; abs(q1 - q0) > H (2/2 hev condition)
+    por                 m0, m5                          ; hev final value
+%endif
+
+%if %2 == 16
+    ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
+    ; calc flat8out mask
+    mova                m8, [P7]
+    mova                m9, [P6]
+    ABSSUB_CMP          m1, m8, m11, m6, m4, m5         ; abs(p7 - p0) <= 1
+    ABSSUB_CMP          m7, m9, m11, m6, m4, m5         ; abs(p6 - p0) <= 1
+    pand                m1, m7
+    mova                m8, [P5]
+    mova                m9, [P4]
+    ABSSUB_CMP          m7, m8, m11, m6, m4, m5         ; abs(p5 - p0) <= 1
+    pand                m1, m7
+    ABSSUB_CMP          m7, m9, m11, m6, m4, m5         ; abs(p4 - p0) <= 1
+    pand                m1, m7
+    mova                m14, [Q4]
+    mova                m15, [Q5]
+    ABSSUB_CMP          m7, m14, m12, m6, m4, m5        ; abs(q4 - q0) <= 1
+    pand                m1, m7
+    ABSSUB_CMP          m7, m15, m12, m6, m4, m5        ; abs(q5 - q0) <= 1
+    pand                m1, m7
+    mova                m14, [Q6]
+    mova                m15, [Q7]
+    ABSSUB_CMP          m7, m14, m12, m6, m4, m5        ; abs(q4 - q0) <= 1
+    pand                m1, m7
+    ABSSUB_CMP          m7, m15, m12, m6, m4, m5        ; abs(q5 - q0) <= 1
+    pand                m1, m7                          ; flat8out final value
+%endif
+
+    ; if (fm) {
+    ;     if (out && in) filter_14()
+    ;     else if (in)   filter_6()
+    ;     else if (hev)  filter_2()
+    ;     else           filter_4()
+    ; }
+    ;
+    ; f14:                                                                            fm &  out &  in
+    ; f6:  fm & ~f14 & in        => fm & ~(out & in) & in                          => fm & ~out &  in
+    ; f2:  fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev          => fm &  ~in &  hev
+    ; f4:  fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm &  ~in & ~hev
+
+    ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
+    ; filter2()
+%if %2 != 44
+    mova                m6, [pb_80]                     ; already in m6 if 44_16
+%endif
+    pxor                m15, m12, m6                    ; q0 ^ 0x80
+    pxor                m14, m11, m6                    ; p0 ^ 0x80
+    psubsb              m15, m14                        ; (signed) q0 - p0
+    pxor                m4, m10, m6                     ; p1 ^ 0x80
+    pxor                m5, m13, m6                     ; q1 ^ 0x80
+    psubsb              m4, m5                          ; (signed) p1 - q1
+    paddsb              m4, m15                         ;   (q0 - p0) + (p1 - q1)
+    paddsb              m4, m15                         ; 2*(q0 - p0) + (p1 - q1)
+    paddsb              m4, m15                         ; 3*(q0 - p0) + (p1 - q1)
+    paddsb              m6, m4, [pb_4]                  ; m6: f1 = clip(f + 4, 127)
+    paddsb              m4, [pb_3]                      ; m4: f2 = clip(f + 3, 127)
+    mova                m14, [pb_10]                    ; will be reused in filter4()
+    SRSHIFT3B_2X        m6, m4, m14, m7                 ; f1 and f2 sign byte shift by 3
+    SIGN_SUB            m7, m12, m6, m5, m9             ; m7 = q0 - f1
+    SIGN_ADD            m8, m11, m4, m5, m9             ; m8 = p0 + f2
+%if %2 != 44
+    pandn               m6, m2, m3                      ;  ~mask(in) & mask(fm)
+    pand                m6, m0                          ; (~mask(in) & mask(fm)) & mask(hev)
+%else
+    pand                m6, m3, m0
+%endif
+    MASK_APPLY          m7, m12, m6, m5                 ; m7 = filter2(q0) & mask / we write it in filter4()
+    MASK_APPLY          m8, m11, m6, m5                 ; m8 = filter2(p0) & mask / we write it in filter4()
+
+    ; (m0: hev, [m1: flat8out], [m2: flat8in], m3: fm, m7..m8: q0' p0', m10..13: p1 p0 q0 q1, m14: pb_10, m15: q0-p0)
+    ; filter4()
+    mova                m4, m15
+    paddsb              m15, m4                         ; 2 * (q0 - p0)
+    paddsb              m15, m4                         ; 3 * (q0 - p0)
+    paddsb              m6, m15, [pb_4]                 ; m6:  f1 = clip(f + 4, 127)
+    paddsb              m15, [pb_3]                     ; m15: f2 = clip(f + 3, 127)
+    SRSHIFT3B_2X        m6, m15, m14, m9                ; f1 and f2 sign byte shift by 3
+%if %2 != 44
+%define p0tmp m7
+%define q0tmp m9
+    pandn               m5, m2, m3                      ;               ~mask(in) & mask(fm)
+    pandn               m0, m5                          ; ~mask(hev) & (~mask(in) & mask(fm))
+%else
+%define p0tmp m1
+%define q0tmp m2
+    pandn               m0, m3
+%endif
+    SIGN_SUB            q0tmp, m12, m6, m4, m14         ; q0 - f1
+    MASK_APPLY          q0tmp, m7, m0, m5               ; filter4(q0) & mask
+    mova                [Q0], q0tmp
+    SIGN_ADD            p0tmp, m11, m15, m4, m14        ; p0 + f2
+    MASK_APPLY          p0tmp, m8, m0, m5               ; filter4(p0) & mask
+    mova                [P0], p0tmp
+    paddb               m6, [pb_80]                     ;
+    pxor                m8, m8                          ;   f=(f1+1)>>1
+    pavgb               m6, m8                          ;
+    psubb               m6, [pb_40]                     ;
+    SIGN_ADD            m7, m10, m6, m8, m9             ; p1 + f
+    SIGN_SUB            m4, m13, m6, m8, m9             ; q1 - f
+    MASK_APPLY          m7, m10, m0, m14                ; m7 = filter4(p1)
+    MASK_APPLY          m4, m13, m0, m14                ; m4 = filter4(q1)
+    mova                [P1], m7
+    mova                [Q1], m4
+
+    ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
+    ; filter6()
+%if %2 != 44
+    pxor                m0, m0
+%if %2 > 16
+    pand                m3, m2
+%else
+    pand                m2, m3                          ;               mask(fm) & mask(in)
+    pandn               m3, m1, m2                      ; ~mask(out) & (mask(fm) & mask(in))
+%endif
+    mova               m14, [P3]
+    mova               m15, [P2]
+    mova                m8, [Q2]
+    mova                m9, [Q3]
+    FILTER_INIT         m4, m5, m6, m7, [P2], 6,                     m3, m15    ; [p2]
+    FILTER_UPDATE       m6, m7, m4, m5, [P1], m14, m15, m10, m13, 3, m3         ; [p1] -p3 -p2 +p1 +q1
+    FILTER_UPDATE       m4, m5, m6, m7, [P0], m14, m10, m11,  m8, 3, m3         ; [p0] -p3 -p1 +p0 +q2
+    FILTER_UPDATE       m6, m7, m4, m5, [Q0], m14, m11, m12,  m9, 3, m3         ; [q0] -p3 -p0 +q0 +q3
+    FILTER_UPDATE       m4, m5, m6, m7, [Q1], m15, m12, m13,  m9, 3, m3         ; [q1] -p2 -q0 +q1 +q3
+    FILTER_UPDATE       m6, m7, m4, m5, [Q2], m10, m13,  m8,  m9, 3, m3,  m8    ; [q2] -p1 -q1 +q2 +q3
+%endif
+
+    ; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
+    ; filter14()
+    ;
+    ;                            m2  m3  m8  m9 m14 m15 m10 m11 m12 m13
+    ;
+    ;                                    q2  q3  p3  p2  p1  p0  q0  q1
+    ; p6  -7                     p7  p6  p5  p4   .   .   .   .   .
+    ; p5  -6  -p7 -p6 +p5 +q1     .   .   .                           .
+    ; p4  -5  -p7 -p5 +p4 +q2     .       .   .                      q2
+    ; p3  -4  -p7 -p4 +p3 +q3     .           .   .                  q3
+    ; p2  -3  -p7 -p3 +p2 +q4     .               .   .              q4
+    ; p1  -2  -p7 -p2 +p1 +q5     .                   .   .          q5
+    ; p0  -1  -p7 -p1 +p0 +q6     .                       .   .      q6
+    ; q0  +0  -p7 -p0 +q0 +q7     .                           .   .  q7
+    ; q1  +1  -p6 -q0 +q1 +q7    q1   .                           .   .
+    ; q2  +2  -p5 -q1 +q2 +q7     .  q2   .                           .
+    ; q3  +3  -p4 -q2 +q3 +q7         .  q3   .                       .
+    ; q4  +4  -p3 -q3 +q4 +q7             .  q4   .                   .
+    ; q5  +5  -p2 -q4 +q5 +q7                 .  q5   .               .
+    ; q6  +6  -p1 -q5 +q6 +q7                     .  q6   .           .
+
+%if %2 == 16
+    pand            m1, m2                                                              ; mask(out) & (mask(fm) & mask(in))
+    mova            m2, [P7]
+    mova            m3, [P6]
+    mova            m8, [P5]
+    mova            m9, [P4]
+    FILTER_INIT     m4, m5, m6, m7, [P6],  14,                   m1,  m3
+    FILTER_UPDATE   m6, m7, m4, m5, [P5],  m2,  m3,  m8, m13, 4, m1,  m8                ; [p5] -p7 -p6 +p5 +q1
+    FILTER_UPDATE   m4, m5, m6, m7, [P4],  m2,  m8,  m9, m13, 4, m1,  m9, m13, [Q2]     ; [p4] -p7 -p5 +p4 +q2
+    FILTER_UPDATE   m6, m7, m4, m5, [P3],  m2,  m9, m14, m13, 4, m1, m14, m13, [Q3]     ; [p3] -p7 -p4 +p3 +q3
+    FILTER_UPDATE   m4, m5, m6, m7, [P2],  m2, m14, m15, m13, 4, m1,      m13, [Q4]     ; [p2] -p7 -p3 +p2 +q4
+    FILTER_UPDATE   m6, m7, m4, m5, [P1],  m2, m15, m10, m13, 4, m1,      m13, [Q5]     ; [p1] -p7 -p2 +p1 +q5
+    FILTER_UPDATE   m4, m5, m6, m7, [P0],  m2, m10, m11, m13, 4, m1,      m13, [Q6]     ; [p0] -p7 -p1 +p0 +q6
+    FILTER_UPDATE   m6, m7, m4, m5, [Q0],  m2, m11, m12, m13, 4, m1,      m13, [Q7]     ; [q0] -p7 -p0 +q0 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q1],  m3, m12,  m2, m13, 4, m1,       m2, [Q1]     ; [q1] -p6 -q0 +q1 +q7
+    FILTER_UPDATE   m6, m7, m4, m5, [Q2],  m8,  m2,  m3, m13, 4, m1,       m3, [Q2]     ; [q2] -p5 -q1 +q2 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q3],  m9,  m3,  m8, m13, 4, m1,  m8,  m8, [Q3]     ; [q3] -p4 -q2 +q3 +q7
+    FILTER_UPDATE   m6, m7, m4, m5, [Q4], m14,  m8,  m9, m13, 4, m1,  m9,  m9, [Q4]     ; [q4] -p3 -q3 +q4 +q7
+    FILTER_UPDATE   m4, m5, m6, m7, [Q5], m15,  m9, m14, m13, 4, m1, m14, m14, [Q5]     ; [q5] -p2 -q4 +q5 +q7
+    FILTER_UPDATE   m6, m7, m4, m5, [Q6], m10, m14, m15, m13, 4, m1, m15, m15, [Q6]     ; [q6] -p1 -q5 +q6 +q7
+%endif
+
+%ifidn %1, h
+%if %2 == 16
+    mova                    m0, [P7]
+    mova                    m1, [P6]
+    mova                    m2, [P5]
+    mova                    m3, [P4]
+    mova                    m4, [P3]
+    mova                    m5, [P2]
+    mova                    m6, [P1]
+    mova                    m7, [P0]
+    mova                    m8, [Q0]
+    mova                    m9, [Q1]
+    mova                   m10, [Q2]
+    mova                   m11, [Q3]
+    mova                   m12, [Q4]
+    mova                   m13, [Q5]
+    mova                   m14, [Q6]
+    mova                   m15, [Q7]
+    TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
+    DEFINE_REAL_P7_TO_Q7
+    movu  [P7],  m0
+    movu  [P6],  m1
+    movu  [P5],  m2
+    movu  [P4],  m3
+    movu  [P3],  m4
+    movu  [P2],  m5
+    movu  [P1],  m6
+    movu  [P0],  m7
+    movu  [Q0],  m8
+    movu  [Q1],  m9
+    movu  [Q2], m10
+    movu  [Q3], m11
+    movu  [Q4], m12
+    movu  [Q5], m13
+    movu  [Q6], m14
+    movu  [Q7], m15
+%elif %2 == 44
+    SWAP 0, 7   ; m0 = p1
+    SWAP 3, 4   ; m3 = q1
+    DEFINE_REAL_P7_TO_Q7 2
+    SBUTTERFLY  bw, 0, 1, 8
+    SBUTTERFLY  bw, 2, 3, 8
+    SBUTTERFLY  wd, 0, 2, 8
+    SBUTTERFLY  wd, 1, 3, 8
+    SBUTTERFLY  dq, 0, 4, 8
+    SBUTTERFLY  dq, 1, 5, 8
+    SBUTTERFLY  dq, 2, 6, 8
+    SBUTTERFLY  dq, 3, 7, 8
+    movd  [P7], m0
+    punpckhqdq m0, m8
+    movd  [P6], m0
+    movd  [Q0], m1
+    punpckhqdq  m1, m9
+    movd  [Q1], m1
+    movd  [P3], m2
+    punpckhqdq  m2, m10
+    movd  [P2], m2
+    movd  [Q4], m3
+    punpckhqdq m3, m11
+    movd  [Q5], m3
+    movd  [P5], m4
+    punpckhqdq m4, m12
+    movd  [P4], m4
+    movd  [Q2], m5
+    punpckhqdq m5, m13
+    movd  [Q3], m5
+    movd  [P1], m6
+    punpckhqdq m6, m14
+    movd  [P0], m6
+    movd  [Q6], m7
+    punpckhqdq m7, m8
+    movd  [Q7], m7
+%else
+    ; the following code do a transpose of 8 full lines to 16 half
+    ; lines (high part). It is inlined to avoid the need of a staging area
+    mova                    m0, [P3]
+    mova                    m1, [P2]
+    mova                    m2, [P1]
+    mova                    m3, [P0]
+    mova                    m4, [Q0]
+    mova                    m5, [Q1]
+    mova                    m6, [Q2]
+    mova                    m7, [Q3]
+    DEFINE_REAL_P7_TO_Q7
+    SBUTTERFLY  bw,  0,  1, 8
+    SBUTTERFLY  bw,  2,  3, 8
+    SBUTTERFLY  bw,  4,  5, 8
+    SBUTTERFLY  bw,  6,  7, 8
+    SBUTTERFLY  wd,  0,  2, 8
+    SBUTTERFLY  wd,  1,  3, 8
+    SBUTTERFLY  wd,  4,  6, 8
+    SBUTTERFLY  wd,  5,  7, 8
+    SBUTTERFLY  dq,  0,  4, 8
+    SBUTTERFLY  dq,  1,  5, 8
+    SBUTTERFLY  dq,  2,  6, 8
+    SBUTTERFLY  dq,  3,  7, 8
+    movh  [P7], m0
+    punpckhqdq m0, m8
+    movh  [P6], m0
+    movh  [Q0], m1
+    punpckhqdq  m1, m9
+    movh  [Q1], m1
+    movh  [P3], m2
+    punpckhqdq  m2, m10
+    movh  [P2], m2
+    movh  [Q4], m3
+    punpckhqdq m3, m11
+    movh  [Q5], m3
+    movh  [P5], m4
+    punpckhqdq m4, m12
+    movh  [P4], m4
+    movh  [Q2], m5
+    punpckhqdq m5, m13
+    movh  [Q3], m5
+    movh  [P1], m6
+    punpckhqdq m6, m14
+    movh  [P0], m6
+    movh  [Q6], m7
+    punpckhqdq m7, m8
+    movh  [Q7], m7
+%endif
+%endif
+
+    RET
+%endmacro
+
+%macro LPF_16_VH 2
+INIT_XMM %2
+cglobal vp9_loop_filter_v_%1_16, 5,10,16,      dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
+    LOOPFILTER v, %1
+cglobal vp9_loop_filter_h_%1_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
+    LOOPFILTER h, %1
+%endmacro
+
+%macro LPF_16_VH_ALL_OPTS 1
+LPF_16_VH %1, sse2
+LPF_16_VH %1, ssse3
+LPF_16_VH %1, avx
+%endmacro
+
+LPF_16_VH_ALL_OPTS 16
+LPF_16_VH_ALL_OPTS 44
+LPF_16_VH_ALL_OPTS 48
+LPF_16_VH_ALL_OPTS 84
+LPF_16_VH_ALL_OPTS 88
+
+%endif ; x86-64
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9mc.asm
index 6488f3092d..59e636da39 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -1,39 +1,38 @@
 ;******************************************************************************
-;* VP9 SIMD optimizations
+;* VP9 MC SIMD optimizations
 ;*
 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-; FIXME share with vp8dsp.asm
-pw_256:   times 8 dw 256
+cextern pw_256
 
 %macro F8_TAPS 8
-times 8 db %1, %2
-times 8 db %3, %4
-times 8 db %5, %6
-times 8 db %7, %8
+times 16 db %1, %2
+times 16 db %3, %4
+times 16 db %5, %6
+times 16 db %7, %8
 %endmacro
-; int8_t ff_filters_ssse3[3][15][4][16]
+; int8_t ff_filters_ssse3[3][15][4][32]
 const filters_ssse3 ; smooth
                     F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                     F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
@@ -87,13 +86,13 @@ SECTION .text
 
 %macro filter_h_fn 1
 %assign %%px mmsize/2
-cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
     mova        m6, [pw_256]
     mova        m7, [filteryq+ 0]
 %if ARCH_X86_64 && mmsize > 8
-    mova        m8, [filteryq+16]
-    mova        m9, [filteryq+32]
-    mova       m10, [filteryq+48]
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
 %endif
 .loop:
     movh        m0, [srcq-3]
@@ -115,9 +114,9 @@ cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
     pmaddubsw   m4, m9
     pmaddubsw   m1, m10
 %else
-    pmaddubsw   m2, [filteryq+16]
-    pmaddubsw   m4, [filteryq+32]
-    pmaddubsw   m1, [filteryq+48]
+    pmaddubsw   m2, [filteryq+32]
+    pmaddubsw   m4, [filteryq+64]
+    pmaddubsw   m1, [filteryq+96]
 %endif
     paddw       m0, m2
     paddw       m4, m1
@@ -145,30 +144,91 @@ INIT_XMM ssse3
 filter_h_fn put
 filter_h_fn avg
 
+%if ARCH_X86_64
+%macro filter_hx2_fn 1
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+    mova       m13, [pw_256]
+    mova        m8, [filteryq+ 0]
+    mova        m9, [filteryq+32]
+    mova       m10, [filteryq+64]
+    mova       m11, [filteryq+96]
+.loop:
+    movu        m0, [srcq-3]
+    movu        m1, [srcq-2]
+    movu        m2, [srcq-1]
+    movu        m3, [srcq+0]
+    movu        m4, [srcq+1]
+    movu        m5, [srcq+2]
+    movu        m6, [srcq+3]
+    movu        m7, [srcq+4]
+    add       srcq, sstrideq
+    SBUTTERFLY  bw, 0, 1, 12
+    SBUTTERFLY  bw, 2, 3, 12
+    SBUTTERFLY  bw, 4, 5, 12
+    SBUTTERFLY  bw, 6, 7, 12
+    pmaddubsw   m0, m8
+    pmaddubsw   m1, m8
+    pmaddubsw   m2, m9
+    pmaddubsw   m3, m9
+    pmaddubsw   m4, m10
+    pmaddubsw   m5, m10
+    pmaddubsw   m6, m11
+    pmaddubsw   m7, m11
+    paddw       m0, m2
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m7
+    paddsw      m0, m4
+    paddsw      m1, m5
+    pmulhrsw    m0, m13
+    pmulhrsw    m1, m13
+    packuswb    m0, m1
+%ifidn %1, avg
+    pavgb       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_hx2_fn put
+filter_hx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
 %macro filter_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
-cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
 %else
-cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
     mov   filteryq, r5mp
 %define hd r4mp
 %endif
-    sub       srcq, sstrideq
-    lea  sstride3q, [sstrideq*3]
-    sub       srcq, sstrideq
     mova        m6, [pw_256]
-    sub       srcq, sstrideq
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
     mova        m7, [filteryq+ 0]
-    lea      src4q, [srcq+sstrideq*4]
 %if ARCH_X86_64 && mmsize > 8
-    mova        m8, [filteryq+16]
-    mova        m9, [filteryq+32]
-    mova       m10, [filteryq+48]
+    mova        m8, [filteryq+32]
+    mova        m9, [filteryq+64]
+    mova       m10, [filteryq+96]
 %endif
 .loop:
-    ; FIXME maybe reuse loads from previous rows, or just more generally
-    ; unroll this to prevent multiple loads of the same data?
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
     movh        m0, [srcq]
     movh        m1, [srcq+sstrideq]
     movh        m2, [srcq+sstrideq*2]
@@ -189,9 +249,9 @@ cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, sr
     pmaddubsw   m4, m9
     pmaddubsw   m1, m10
 %else
-    pmaddubsw   m2, [filteryq+16]
-    pmaddubsw   m4, [filteryq+32]
-    pmaddubsw   m1, [filteryq+48]
+    pmaddubsw   m2, [filteryq+32]
+    pmaddubsw   m4, [filteryq+64]
+    pmaddubsw   m1, [filteryq+96]
 %endif
     paddw       m0, m2
     paddw       m4, m1
@@ -219,6 +279,76 @@ INIT_XMM ssse3
 filter_v_fn put
 filter_v_fn avg
 
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1
+%assign %%px mmsize
+cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+    mova       m13, [pw_256]
+    lea  sstride3q, [sstrideq*3]
+    lea      src4q, [srcq+sstrideq]
+    sub       srcq, sstride3q
+    mova        m8, [filteryq+ 0]
+    mova        m9, [filteryq+32]
+    mova       m10, [filteryq+64]
+    mova       m11, [filteryq+96]
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu        m0, [srcq]
+    movu        m1, [srcq+sstrideq]
+    movu        m2, [srcq+sstrideq*2]
+    movu        m3, [srcq+sstride3q]
+    movu        m4, [src4q]
+    movu        m5, [src4q+sstrideq]
+    movu        m6, [src4q+sstrideq*2]
+    movu        m7, [src4q+sstride3q]
+    add       srcq, sstrideq
+    add      src4q, sstrideq
+    SBUTTERFLY  bw, 0, 1, 12
+    SBUTTERFLY  bw, 2, 3, 12
+    SBUTTERFLY  bw, 4, 5, 12
+    SBUTTERFLY  bw, 6, 7, 12
+    pmaddubsw   m0, m8
+    pmaddubsw   m1, m8
+    pmaddubsw   m2, m9
+    pmaddubsw   m3, m9
+    pmaddubsw   m4, m10
+    pmaddubsw   m5, m10
+    pmaddubsw   m6, m11
+    pmaddubsw   m7, m11
+    paddw       m0, m2
+    paddw       m1, m3
+    paddw       m4, m6
+    paddw       m5, m7
+    paddsw      m0, m4
+    paddsw      m1, m5
+    pmulhrsw    m0, m13
+    pmulhrsw    m1, m13
+    packuswb    m0, m1
+%ifidn %1, avg
+    pavgb       m0, [dstq]
+%endif
+    mova    [dstq], m0
+    add       dstq, dstrideq
+    dec         hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+filter_vx2_fn put
+filter_vx2_fn avg
+%endif
+
+%endif ; ARCH_X86_64
+
 %macro fpel_fn 6
 %if %2 == 4
 %define %%srcfn movh
@@ -228,12 +358,12 @@ filter_v_fn avg
 %define %%dstfn mova
 %endif
 
-%if %2 <= 16
-cglobal %1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3
+%if %2 <= mmsize
+cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
     lea  sstride3q, [sstrideq*3]
     lea  dstride3q, [dstrideq*3]
 %else
-cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
+cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
 %endif
 .loop:
     %%srcfn     m0, [srcq]
@@ -259,10 +389,12 @@ cglobal %1%2, 5, 5, 4, dst, src, dstride, sstride, h
 
 %define d16 16
 %define s16 16
+%define d32 32
+%define s32 32
 INIT_MMX mmx
 fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
 fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
-INIT_MMX sse
+INIT_MMX mmxext
 fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4
 fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4
 INIT_XMM sse
@@ -273,5 +405,15 @@ INIT_XMM sse2
 fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
 fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2
 fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1
+INIT_YMM avx
+fpel_fn put, 32, strideq, strideq*2, stride3q, 4
+fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2
+%endif
 %undef s16
 %undef d16
+%undef s32
+%undef d32
diff --git a/libavcodec/x86/w64xmmtest.c b/libavcodec/x86/w64xmmtest.c
index 2f064cad7b..25e833fef3 100644
--- a/libavcodec/x86/w64xmmtest.c
+++ b/libavcodec/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
  * check XMM registers for clobbers on Win64
  * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -65,6 +65,13 @@ wrap(avcodec_encode_audio2(AVCodecContext *avctx,
                     got_packet_ptr);
 }
 
+wrap(avcodec_encode_video(AVCodecContext *avctx,
+                          uint8_t *buf, int buf_size,
+                          const AVFrame *pict))
+{
+    testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
+}
+
 wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
                              uint8_t *buf, int buf_size,
                              const AVSubtitle *sub))
diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h
index 13a4e85890..e8586fe524 100644
--- a/libavcodec/x86/xvididct.h
+++ b/libavcodec/x86/xvididct.h
@@ -1,20 +1,20 @@
 /*
  * XVID MPEG-4 VIDEO CODEC
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index e4f7345795..8c227844b5 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c
index e371142974..57aa8c0690 100644
--- a/libavcodec/x86/xvididct_mmx.c
+++ b/libavcodec/x86/xvididct_mmx.c
@@ -22,20 +22,20 @@
  *
  * conversion to gcc syntax by Michael Niedermayer
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
+ * along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -46,6 +46,7 @@
 #include "libavutil/mem.h"
 
 #include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
 
 #include "idctdsp.h"
 #include "xvididct.h"
@@ -497,13 +498,13 @@ void ff_xvid_idct_mmx(short *block)
 void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_xvid_idct_mmx(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
+    ff_put_pixels_clamped(block, dest, line_size);
 }
 
 void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_xvid_idct_mmx(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
+    ff_add_pixels_clamped(block, dest, line_size);
 }
 
 #endif /* HAVE_MMX_INLINE */
@@ -536,13 +537,13 @@ void ff_xvid_idct_mmxext(short *block)
 void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_xvid_idct_mmxext(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
+    ff_put_pixels_clamped(block, dest, line_size);
 }
 
 void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_xvid_idct_mmxext(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
+    ff_add_pixels_clamped(block, dest, line_size);
 }
 
 #endif /* HAVE_MMXEXT_INLINE */
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
index d4f01693ea..51a5d9d6b0 100644
--- a/libavcodec/x86/xvididct_sse2.c
+++ b/libavcodec/x86/xvididct_sse2.c
@@ -9,7 +9,7 @@
  *
  * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
  * Vertical pass is an implementation of the scheme:
  *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
@@ -23,25 +23,26 @@
  *
  * More details at http://skal.planet-d.net/coding/dct.html
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
- * along with Libav; if not, write to the Free Software Foundation,
+ * along with FFmpeg; if not, write to the Free Software Foundation,
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 
+#include "libavcodec/idctdsp.h"
+
 #include "idctdsp.h"
 #include "xvididct.h"
 
@@ -146,7 +147,7 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
 
 #endif
 
-#define ROUND(x) "paddd   "MANGLE(x)
+#define ROUND(x) "paddd   "x
 
 #define JZ(reg, to)                         \
     "testl     "reg","reg"            \n\t" \
@@ -342,17 +343,17 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
     "movdqa   %%xmm6, 4*16("dct")     \n\t" \
     "movdqa   "SREG2", 7*16("dct")    \n\t"
 
-inline void ff_xvid_idct_sse2(short *block)
+av_extern_inline void ff_xvid_idct_sse2(short *block)
 {
     __asm__ volatile (
         "movq     "MANGLE (m127) ", %%mm0                              \n\t"
-        iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),          PUT_EVEN(ROW0))
-        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
-        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
+        iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(MANGLE(walkenIdctRounders)),        PUT_EVEN(ROW0))
+        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND("1*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW1))
+        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND("2*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW2))
 
         TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
         JZ("%%eax", "1f")
-        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
+        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND("3*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW3))
 
         TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
         TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
@@ -367,20 +368,20 @@ inline void ff_xvid_idct_sse2(short *block)
         "2:                                                          \n\t"
         iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
         "3:                                                          \n\t"
-        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
+        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND("4*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW5))
         JZ("%%edx", "1f")
         "4:                                                          \n\t"
-        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
+        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_EVEN(ROW6))
         JZ("%%esi", "1f")
         "5:                                                          \n\t"
-        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
+        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND("5*16+"MANGLE(walkenIdctRounders)), PUT_ODD(ROW7))
 #if ARCH_X86_32
         iLLM_HEAD
 #endif
         iLLM_PASS("%0")
         "6:                                                          \n\t"
         : "+r" (block)
-        :
+        : NAMED_CONSTRAINTS_ARRAY(m127,iTab1,walkenIdctRounders,iTab2,iTab3,iTab4,tan3,tan1,tan2,sqrt2)
         : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                        "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
 #if ARCH_X86_64
@@ -393,13 +394,13 @@ inline void ff_xvid_idct_sse2(short *block)
 void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
 {
     ff_xvid_idct_sse2(block);
-    ff_put_pixels_clamped_mmx(block, dest, line_size);
+    ff_put_pixels_clamped(block, dest, line_size);
 }
 
 void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
 {
     ff_xvid_idct_sse2(block);
-    ff_add_pixels_clamped_mmx(block, dest, line_size);
+    ff_add_pixels_clamped(block, dest, line_size);
 }
 
 #endif /* HAVE_SSE2_INLINE */