8 files changed, 1993 insertions, 0 deletions
diff --git a/libswresample/x86/Makefile b/libswresample/x86/Makefile
new file mode 100644
index 0000000000..be44df56aa
--- /dev/null
+++ b/libswresample/x86/Makefile
@@ -0,0 +1,9 @@
+YASM-OBJS                       += x86/audio_convert.o\
+                                   x86/rematrix.o\
+                                   x86/resample.o\
+# end of the YASM-OBJS list (the '\' above continues onto this comment line); these are the objects of the .asm sources
+OBJS                            += x86/audio_convert_init.o\
+                                   x86/rematrix_init.o\
+                                   x86/resample_init.o\
+# end of the OBJS list of C init objects; the next line appends w64xmmtest.o to a variable keyed on CONFIG_XMM_CLOBBER_TEST
+OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
diff --git a/libswresample/x86/audio_convert.asm b/libswresample/x86/audio_convert.asm
new file mode 100644
index 0000000000..d441636d3c
--- /dev/null
+++ b/libswresample/x86/audio_convert.asm
@@ -0,0 +1,739 @@
+;******************************************************************************
+;* Copyright (c) 2012 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA 32
+flt2pm31: times 8 dd 4.6566129e-10 ; 2^-31 repeated 8 times (32 bytes): scale factor for int32 to float
+flt2p31 : times 8 dd 2147483648.0 ; 2^31 repeated 8 times: scale factor for float to int32
+flt2p15 : times 8 dd 32768.0 ; 2^15 repeated 8 times: scale factor for float to int16
+
+word_unpack_shuf : db  0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15 ; pshufb control: even-indexed words go to the low 8 bytes, odd-indexed words to the high 8 bytes
+
+SECTION .text
+
+
+; Args shared by PACK_2CH, UNPACK_2CH and CONV: 1=to type, 2=from type, 3=a/u (selects mova or movu), 4=log2(bytes per output sample), 5=log2(bytes per input sample), 6=conversion macro run every iteration, 7=init macro run once before the loop
+%macro PACK_2CH 5-7 ; interleave two planar channels src[0], src[1] into dst[0], converting the sample type on the way
+cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
+    mov src2q   , [srcq+gprsize] ; src2 = src[1], second channel plane
+    mov srcq    , [srcq] ; src = src[0], first channel plane
+    mov dstq    , [dstq] ; dst = dst[0], interleaved output
+%ifidn %3, a
+    test dstq, mmsize-1 ; aligned variant: if any pointer is not mmsize-aligned, jump into the body of the unaligned variant
+        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+    test src2q, mmsize-1
+        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+%else ; unaligned variant: emit the entry label that the aligned variant jumps to
+pack_2ch_%2_to_%1_u_int %+ SUFFIX:
+%endif
+    lea     srcq , [srcq  + (1<<%5)*lenq] ; move every pointer to the end of its buffer
+    lea     src2q, [src2q + (1<<%5)*lenq]
+    lea     dstq , [dstq  + (2<<%4)*lenq] ; output holds two samples per frame
+    neg     lenq ; len becomes a negative frame index that counts up towards zero
+    %7 m0,m1,m2,m3,m4,m5
+.next:
+%if %4 >= %5 ; output sample at least as wide as input: interleave first, then convert
+    mov%3     m0, [         srcq +(1<<%5)*lenq]
+    mova      m1, m0
+    mov%3     m2, [         src2q+(1<<%5)*lenq]
+%if %5 == 1 ; 16-bit input: interleave words
+    punpcklwd m0, m2
+    punpckhwd m1, m2
+%else ; otherwise interleave dwords
+    punpckldq m0, m2
+    punpckhdq m1, m2
+%endif
+    %6 m0,m1,m2,m3,m4,m5
+%else ; output narrower than input: load two registers per channel, convert, then interleave words
+    mov%3     m0, [         srcq +(1<<%5)*lenq]
+    mov%3     m1, [mmsize + srcq +(1<<%5)*lenq]
+    mov%3     m2, [         src2q+(1<<%5)*lenq]
+    mov%3     m3, [mmsize + src2q+(1<<%5)*lenq]
+    %6 m0,m1,m2,m3,m4,m5
+    mova      m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    SWAP 1,2
+%endif
+    mov%3 [           dstq+(2<<%4)*lenq], m0
+    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
+%if %4 > %5 ; widening conversion produced four output registers
+    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
+    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
+    add lenq, 4*mmsize/(2<<%4) ; advance by the number of frames just written
+%else
+    add lenq, 2*mmsize/(2<<%4) ; advance by the number of frames just written
+%endif
+        jl .next ; keep going while the index is still negative
+    REP_RET
+%endmacro
+
+%macro UNPACK_2CH 5-7 ; split interleaved stereo src[0] into planar dst[0] and dst[1], converting the sample type; same argument list as PACK_2CH
+cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
+    mov dst2q   , [dstq+gprsize] ; dst2 = dst[1], second output plane
+    mov srcq    , [srcq] ; src = src[0], interleaved input
+    mov dstq    , [dstq] ; dst = dst[0], first output plane
+%ifidn %3, a
+    test dstq, mmsize-1 ; aligned variant: if any pointer is not mmsize-aligned, jump into the body of the unaligned variant
+        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
+    test dst2q, mmsize-1
+        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
+%else ; unaligned variant: emit the entry label that the aligned variant jumps to
+unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
+%endif
+    lea     srcq , [srcq  + (2<<%5)*lenq] ; input holds two samples per frame
+    lea     dstq , [dstq  + (1<<%4)*lenq] ; move every pointer to the end of its buffer
+    lea     dst2q, [dst2q + (1<<%4)*lenq]
+    neg     lenq ; len becomes a negative frame index that counts up towards zero
+    %7 m0,m1,m2,m3,m4,m5
+    mova      m6, [word_unpack_shuf] ; shuffle mask; only the ssse3 16-bit path below reads m6
+.next:
+    mov%3     m0, [           srcq +(2<<%5)*lenq]
+    mov%3     m2, [  mmsize + srcq +(2<<%5)*lenq]
+%if %5 == 1 ; 16-bit input
+%ifidn SUFFIX, _ssse3
+    pshufb    m0, m6 ; low half = even words (channel 0), high half = odd words (channel 1)
+    mova      m1, m0
+    pshufb    m2, m6
+    punpcklqdq m0,m2 ; m0 = channel 0 samples from both registers
+    punpckhqdq m1,m2 ; m1 = channel 1 samples from both registers
+%else
+    mova      m1, m0 ; three rounds of word interleaving below leave channel 0 in m0 and channel 1 in m1
+    punpcklwd m0,m2
+    punpckhwd m1,m2
+
+    mova      m2, m0
+    punpcklwd m0,m1
+    punpckhwd m2,m1
+
+    mova      m1, m0
+    punpcklwd m0,m2
+    punpckhwd m1,m2
+%endif
+%else ; 32-bit input
+    mova      m1, m0
+    shufps    m0, m2, 10001000b ; dwords 0 and 2 of each source: channel 0
+    shufps    m1, m2, 11011101b ; dwords 1 and 3 of each source: channel 1
+%endif
+%if %4 < %5 ; narrowing conversion needs two registers per channel, so split a second pair
+    mov%3     m2, [2*mmsize + srcq +(2<<%5)*lenq]
+    mova      m3, m2
+    mov%3     m4, [3*mmsize + srcq +(2<<%5)*lenq]
+    shufps    m2, m4, 10001000b
+    shufps    m3, m4, 11011101b
+    SWAP 1,2
+%endif
+    %6 m0,m1,m2,m3,m4,m5
+    mov%3 [           dstq+(1<<%4)*lenq], m0
+%if %4 > %5 ; widening conversion: m0,m1 belong to the first plane and m2,m3 to the second
+    mov%3 [          dst2q+(1<<%4)*lenq], m2
+    mov%3 [ mmsize +  dstq+(1<<%4)*lenq], m1
+    mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
+    add lenq, 2*mmsize/(1<<%4) ; advance by the number of frames just written
+%else
+    mov%3 [          dst2q+(1<<%4)*lenq], m1
+    add lenq, mmsize/(1<<%4) ; advance by the number of frames just written
+%endif
+        jl .next ; keep going while the index is still negative
+    REP_RET
+%endmacro
+
+%macro CONV 5-7 ; convert len samples from src[0] to dst[0] without changing the channel layout; same argument list as PACK_2CH
+cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
+    mov srcq    , [srcq] ; src = src[0]
+    mov dstq    , [dstq] ; dst = dst[0]
+%ifidn %3, a
+    test dstq, mmsize-1 ; aligned variant: if either pointer is not mmsize-aligned, jump into the body of the unaligned variant
+        jne %2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne %2_to_%1_u_int %+ SUFFIX
+%else ; unaligned variant: emit the entry label that the aligned variant jumps to
+%2_to_%1_u_int %+ SUFFIX:
+%endif
+    lea     srcq , [srcq  + (1<<%5)*lenq] ; move both pointers to the end of their buffers
+    lea     dstq , [dstq  + (1<<%4)*lenq]
+    neg     lenq ; len becomes a negative sample index that counts up towards zero
+    %7 m0,m1,m2,m3,m4,m5
+.next:
+    mov%3     m0, [           srcq +(1<<%5)*lenq]
+    mov%3     m1, [  mmsize + srcq +(1<<%5)*lenq]
+%if %4 < %5 ; narrowing conversion consumes four input registers
+    mov%3     m2, [2*mmsize + srcq +(1<<%5)*lenq]
+    mov%3     m3, [3*mmsize + srcq +(1<<%5)*lenq]
+%endif
+    %6 m0,m1,m2,m3,m4,m5
+    mov%3 [           dstq+(1<<%4)*lenq], m0
+    mov%3 [  mmsize + dstq+(1<<%4)*lenq], m1
+%if %4 > %5 ; widening conversion produces four output registers
+    mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
+    mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
+    add lenq, 4*mmsize/(1<<%4) ; advance by the number of samples just written
+%else
+    add lenq, 2*mmsize/(1<<%4) ; advance by the number of samples just written
+%endif
+        jl .next ; keep going while the index is still negative
+%if mmsize == 8 ; 8-byte registers are MMX: clear the MMX state before returning
+    emms
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+%macro PACK_6CH 8 ; interleave six planar 32-bit channels src[0..5] into dst[0]; args: 1=to, 2=from, 3=a/u, 4 and 5 unused here, 6=vector register count, 7=conversion macro, 8=init macro
+cglobal pack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, src1, src2, src3, src4, src5, len
+%if ARCH_X86_64
+    mov     lend, r2d ; copy the third argument into the register named len before r2 is reused as src1
+%else
+    %define lend dword r2m
+%endif
+    mov    src1q, [srcq+1*gprsize] ; fetch the plane pointers src[1..5]
+    mov    src2q, [srcq+2*gprsize]
+    mov    src3q, [srcq+3*gprsize]
+    mov    src4q, [srcq+4*gprsize]
+    mov    src5q, [srcq+5*gprsize]
+    mov     srcq, [srcq] ; src = src[0]
+    mov     dstq, [dstq] ; dst = dst[0]
+%ifidn %3, a
+    test dstq, mmsize-1 ; aligned variant: if any pointer is not mmsize-aligned, jump into the body of the unaligned variant
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test src1q, mmsize-1
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test src2q, mmsize-1
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test src3q, mmsize-1
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test src4q, mmsize-1
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test src5q, mmsize-1
+        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
+%else ; unaligned variant: emit the entry label that the aligned variant jumps to
+pack_6ch_%2_to_%1_u_int %+ SUFFIX:
+%endif
+    sub    src1q, srcq ; turn src1..src5 into offsets from src so only src has to advance in the loop
+    sub    src2q, srcq
+    sub    src3q, srcq
+    sub    src4q, srcq
+    sub    src5q, srcq
+    %8 x,x,x,x,m7,x
+.loop:
+    mov%3     m0, [srcq      ] ; one register of samples from each of the six planes
+    mov%3     m1, [srcq+src1q]
+    mov%3     m2, [srcq+src2q]
+    mov%3     m3, [srcq+src3q]
+    mov%3     m4, [srcq+src4q]
+    mov%3     m5, [srcq+src5q]
+%if cpuflag(sse) ; SSE and later: 6x4 to 4x6 transpose of dwords; results end up in m0,m6,m4,m1,m2,m5 in output order
+    SBUTTERFLYPS 0, 1, 6
+    SBUTTERFLYPS 2, 3, 6
+    SBUTTERFLYPS 4, 5, 6
+
+%if cpuflag(avx)
+    blendps   m6, m4, m0, 1100b
+%else
+    movaps    m6, m4
+    shufps    m4, m0, q3210
+    SWAP 4,6
+%endif
+    movlhps   m0, m2
+    movhlps   m4, m2
+%if cpuflag(avx)
+    blendps   m2, m5, m1, 1100b
+%else
+    movaps    m2, m5
+    shufps    m5, m1, q3210
+    SWAP 2,5
+%endif
+    movlhps   m1, m3
+    movhlps   m5, m3
+
+    %7 m0,m6,x,x,m7,m3
+    %7 m4,m1,x,x,m7,m3
+    %7 m2,m5,x,x,m7,m3
+
+    mov %+ %3 %+ ps [dstq   ], m0
+    mov %+ %3 %+ ps [dstq+16], m6
+    mov %+ %3 %+ ps [dstq+32], m4
+    mov %+ %3 %+ ps [dstq+48], m1
+    mov %+ %3 %+ ps [dstq+64], m2
+    mov %+ %3 %+ ps [dstq+80], m5
+%else ; mmx
+    SBUTTERFLY dq, 0, 1, 6
+    SBUTTERFLY dq, 2, 3, 6
+    SBUTTERFLY dq, 4, 5, 6
+
+    movq   [dstq   ], m0 ; no conversion macro is applied on this path
+    movq   [dstq+ 8], m2
+    movq   [dstq+16], m4
+    movq   [dstq+24], m1
+    movq   [dstq+32], m3
+    movq   [dstq+40], m5
+%endif
+    add      srcq, mmsize ; one register consumed per plane
+    add      dstq, mmsize*6 ; six registers written
+    sub      lend, mmsize/4 ; samples per channel handled this iteration (4 bytes each)
+    jg .loop
+%if mmsize == 8 ; 8-byte registers are MMX: clear the MMX state before returning
+    emms
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+%macro UNPACK_6CH 8 ; split interleaved six-channel 32-bit src[0] into planes dst[0..5]; same argument list as PACK_6CH
+cglobal unpack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, dst1, dst2, dst3, dst4, dst5, len
+%if ARCH_X86_64
+    mov     lend, r2d ; copy the third argument into the register named len before r2 is reused as dst1
+%else
+    %define lend dword r2m
+%endif
+    mov    dst1q, [dstq+1*gprsize] ; fetch the plane pointers dst[1..5]
+    mov    dst2q, [dstq+2*gprsize]
+    mov    dst3q, [dstq+3*gprsize]
+    mov    dst4q, [dstq+4*gprsize]
+    mov    dst5q, [dstq+5*gprsize]
+    mov     dstq, [dstq] ; dst = dst[0]
+    mov     srcq, [srcq] ; src = src[0]
+%ifidn %3, a
+    test dstq, mmsize-1 ; aligned variant: if any pointer is not mmsize-aligned, jump into the body of the unaligned variant
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test dst1q, mmsize-1
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test dst2q, mmsize-1
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test dst3q, mmsize-1
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test dst4q, mmsize-1
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+    test dst5q, mmsize-1
+        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
+%else ; unaligned variant: emit the entry label that the aligned variant jumps to
+unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
+%endif
+    sub    dst1q, dstq ; turn dst1..dst5 into offsets from dst so only dst has to advance in the loop
+    sub    dst2q, dstq
+    sub    dst3q, dstq
+    sub    dst4q, dstq
+    sub    dst5q, dstq
+    %8 x,x,x,x,m7,x
+.loop:
+    mov%3     m0, [srcq   ] ; six consecutive registers of interleaved input (fixed 16-byte steps)
+    mov%3     m1, [srcq+16]
+    mov%3     m2, [srcq+32]
+    mov%3     m3, [srcq+48]
+    mov%3     m4, [srcq+64]
+    mov%3     m5, [srcq+80]
+
+    SBUTTERFLYPS 0, 3, 6
+    SBUTTERFLYPS 1, 4, 6
+    SBUTTERFLYPS 2, 5, 6
+    SBUTTERFLYPS 0, 4, 6
+    SBUTTERFLYPS 3, 2, 6
+    SBUTTERFLYPS 1, 5, 6
+    SWAP 1, 4
+    SWAP 2, 3
+
+    %7 m0,m1,x,x,m7,m6
+    %7 m2,m3,x,x,m7,m6
+    %7 m4,m5,x,x,m7,m6
+
+    mov %+ %3 %+ ps [dstq      ], m0
+    mov %+ %3 %+ ps [dstq+dst1q], m1
+    mov %+ %3 %+ ps [dstq+dst2q], m2
+    mov %+ %3 %+ ps [dstq+dst3q], m3
+    mov %+ %3 %+ ps [dstq+dst4q], m4
+    mov %+ %3 %+ ps [dstq+dst5q], m5
+
+    add      srcq, mmsize*6 ; six registers consumed
+    add      dstq, mmsize ; one register written per plane
+    sub      lend, mmsize/4 ; samples per channel handled this iteration (4 bytes each)
+    jg .loop
+    REP_RET
+%endmacro
+
+%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32) ; GPR count: 10 on x86-64; on x86-32 it is 6, or 7 when HAVE_ALIGNED_STACK is 1
+
+%macro PACK_8CH 8 ; interleave eight planar 32-bit channels src[0..7] into dst[0]; same argument list as PACK_6CH
+cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, %6, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
+    mov     dstq, [dstq] ; dst = dst[0]
+%if ARCH_X86_32 ; x86-32: too few GPRs, so some pointers share r0 with dst and are kept in stack slots
+    DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
+    %define lend dword r2m
+    %define src1q r0q
+    %define src1m dword [rsp+32] ; stack slot for the src1 offset
+%if HAVE_ALIGNED_STACK == 0 ; one register fewer: src4 shares r0 as well
+    DEFINE_ARGS dst, src, src2, src3, src5, src6
+    %define src4q r0q
+    %define src4m dword [rsp+36] ; stack slot for src4
+%endif
+    %define src7q r0q
+    %define src7m dword [rsp+40] ; stack slot for src7
+    mov     dstm, dstq ; park dst[0] in dstm so r0 can be reused
+%endif
+    mov    src7q, [srcq+7*gprsize] ; fetch the plane pointers, highest first
+    mov    src6q, [srcq+6*gprsize]
+%if ARCH_X86_32
+    mov    src7m, src7q ; spill src7 before r0 is overwritten
+%endif
+    mov    src5q, [srcq+5*gprsize]
+    mov    src4q, [srcq+4*gprsize]
+    mov    src3q, [srcq+3*gprsize]
+%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
+    mov    src4m, src4q ; spill src4 before r0 is overwritten
+%endif
+    mov    src2q, [srcq+2*gprsize]
+    mov    src1q, [srcq+1*gprsize]
+    mov     srcq, [srcq] ; src = src[0]
+%ifidn %3, a
+%if ARCH_X86_32 ; aligned variant: test every pointer (register or spilled copy); any misalignment jumps into the unaligned variant
+    test dstmp, mmsize-1
+%else
+    test dstq, mmsize-1
+%endif
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+    test src1q, mmsize-1
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+    test src2q, mmsize-1
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+    test src3q, mmsize-1
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
+    test src4m, mmsize-1
+%else
+    test src4q, mmsize-1
+%endif
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+    test src5q, mmsize-1
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+    test src6q, mmsize-1
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+%if ARCH_X86_32
+    test src7m, mmsize-1
+%else
+    test src7q, mmsize-1
+%endif
+        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
+%else ; unaligned variant: emit the entry label that the aligned variant jumps to
+pack_8ch_%2_to_%1_u_int %+ SUFFIX:
+%endif
+    sub    src1q, srcq ; turn src1..src7 into offsets from src so only src has to advance in the loop
+    sub    src2q, srcq
+    sub    src3q, srcq
+%if ARCH_X86_64 || HAVE_ALIGNED_STACK
+    sub    src4q, srcq
+%else
+    sub    src4m, srcq ; src4 lives in its stack slot here
+%endif
+    sub    src5q, srcq
+    sub    src6q, srcq
+%if ARCH_X86_64
+    sub    src7q, srcq
+%else
+    mov src1m, src1q ; spill the src1 offset; r0 holds it again at every loop entry
+    sub src7m, srcq ; src7 lives in its stack slot here
+%endif
+
+%if ARCH_X86_64 ; x86-64: the init macro loads the scale constant into m9
+    %8 x,x,x,x,m9,x
+%elifidn %1, int32
+    %define m9 [flt2p31] ; x86-32: m9 is a memory operand instead; int32 output uses the 2^31 scale
+%else
+    %define m9 [flt2pm31] ; x86-32: other outputs use the 2^-31 scale
+%endif
+
+.loop:
+    mov%3     m0, [srcq      ] ; one register of samples from each of the eight planes
+    mov%3     m1, [srcq+src1q]
+    mov%3     m2, [srcq+src2q]
+%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
+    mov    src4q, src4m ; reload the src4 offset into r0
+%endif
+    mov%3     m3, [srcq+src3q]
+    mov%3     m4, [srcq+src4q]
+    mov%3     m5, [srcq+src5q]
+%if ARCH_X86_32
+    mov    src7q, src7m ; reload the src7 offset into r0
+%endif
+    mov%3     m6, [srcq+src6q]
+    mov%3     m7, [srcq+src7q]
+
+%if ARCH_X86_64
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
+
+    %7 m0,m1,x,x,m9,m8
+    %7 m2,m3,x,x,m9,m8
+    %7 m4,m5,x,x,m9,m8
+    %7 m6,m7,x,x,m9,m8
+
+    mov%3 [dstq], m0
+%else
+    mov     dstq, dstm ; r0 becomes the output pointer again
+
+    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1
+
+    %7 m0,m1,x,x,m9,m2
+    mova     m2, [rsp] ; fetch row 2 from the stack scratch (presumably left there by TRANSPOSE8x4D; m2 was the temporary just above)
+    mov%3   [dstq], m0 ; store row 0 now because m0 is the temporary for the remaining conversions
+    %7 m2,m3,x,x,m9,m0
+    %7 m4,m5,x,x,m9,m0
+    %7 m6,m7,x,x,m9,m0
+
+%endif
+
+    mov%3 [dstq+16],  m1
+    mov%3 [dstq+32],  m2
+    mov%3 [dstq+48],  m3
+    mov%3 [dstq+64],  m4
+    mov%3 [dstq+80],  m5
+    mov%3 [dstq+96],  m6
+    mov%3 [dstq+112], m7
+
+    add      srcq, mmsize ; one register consumed per plane
+    add      dstq, mmsize*8 ; eight registers written
+%if ARCH_X86_32
+    mov      dstm, dstq ; save the advanced output pointer
+    mov      src1q, src1m ; r0 holds the src1 offset again for the next iteration
+%endif
+    sub      lend, mmsize/4 ; samples per channel handled this iteration (4 bytes each)
+    jg .loop
+    REP_RET
+%endmacro
+
+%macro INT16_TO_INT32_N 6 ; widen the int16 samples in m0,m1 to int32 in m0..m3; arguments are ignored, registers are fixed
+    pxor      m2, m2
+    pxor      m3, m3
+    punpcklwd m2, m1 ; zero word below each sample: every dword becomes sample << 16
+    punpckhwd m3, m1
+    SWAP 4,0 ; the old m0 data is now named m4 so that m0 can be rebuilt from it
+    pxor      m0, m0
+    pxor      m1, m1
+    punpcklwd m0, m4
+    punpckhwd m1, m4
+%endmacro
+
+%macro INT32_TO_INT16_N 6 ; narrow the int32 samples in m0..m3 to int16 in m0,m1; arguments are ignored, registers are fixed
+    psrad     m0, 16 ; keep the top 16 bits of every sample
+    psrad     m1, 16
+    psrad     m2, 16
+    psrad     m3, 16
+    packssdw  m0, m1
+    packssdw  m2, m3
+    SWAP 1,2 ; rename so the second packed result is called m1
+%endmacro
+
+%macro INT32_TO_FLOAT_INIT 6 ; load the 2^-31 scale into the register passed as argument 5
+    mova      %5, [flt2pm31]
+%endmacro
+%macro INT32_TO_FLOAT_N 6 ; convert the int32 samples in arguments 1 and 2 to float and scale them by argument 5
+    cvtdq2ps  %1, %1
+    cvtdq2ps  %2, %2
+    mulps %1, %1, %5
+    mulps %2, %2, %5
+%endmacro
+
+%macro FLOAT_TO_INT32_INIT 6 ; load the 2^31 scale into the register passed as argument 5
+    mova      %5, [flt2p31]
+%endmacro
+%macro FLOAT_TO_INT32_N 6 ; convert the floats in arguments 1 and 2 to int32; argument 5 = 2^31 scale, argument 6 = temporary
+    mulps %1, %5
+    mulps %2, %5
+    cvtps2dq  %6, %1 ; out-of-range inputs convert to 0x80000000
+    cmpps %1, %1, %5, 5 ; predicate 5 is not-less-than: all ones (-1) where the scaled value is not below 2^31
+    paddd %1, %6 ; adding -1 turns 0x80000000 into 0x7FFFFFFF, clipping positive overflow
+    cvtps2dq  %6, %2
+    cmpps %2, %2, %5, 5
+    paddd %2, %6
+%endmacro
+
+%macro INT16_TO_FLOAT_INIT 6 ; load the 2^-31 scale into m5 (fixed register, arguments ignored)
+    mova      m5, [flt2pm31]
+%endmacro
+%macro INT16_TO_FLOAT_N 6 ; convert the int16 samples in m0,m1 to float in m0..m3
+    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
+    cvtdq2ps  m0, m0 ; values are sample << 16 here, so the 2^-31 scale yields sample / 32768
+    cvtdq2ps  m1, m1
+    cvtdq2ps  m2, m2
+    cvtdq2ps  m3, m3
+    mulps m0, m0, m5
+    mulps m1, m1, m5
+    mulps m2, m2, m5
+    mulps m3, m3, m5
+%endmacro
+
+%macro FLOAT_TO_INT16_INIT 6 ; load the 2^15 scale into m5 (fixed register, arguments ignored)
+    mova      m5, [flt2p15]
+%endmacro
+%macro FLOAT_TO_INT16_N 6 ; convert the floats in m0..m3 to int16 in m0,m1
+    mulps m0, m5
+    mulps m1, m5
+    mulps m2, m5
+    mulps m3, m5
+    cvtps2dq  m0, m0
+    cvtps2dq  m1, m1
+    packssdw  m0, m1 ; signed saturation to the int16 range
+    cvtps2dq  m1, m2 ; m1 is free again after the pack, reuse it for the second half
+    cvtps2dq  m3, m3
+    packssdw  m1, m3
+%endmacro
+
+%macro NOP_N 0-6 ; empty conversion/init macro: accepts up to six arguments and emits nothing
+%endmacro
+
+INIT_MMX mmx ; MMX versions; each function is instantiated as u first, then a, because the a variant jumps to a label inside the u variant
+CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
+CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
+CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
+CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
+
+PACK_6CH float, float, u, 2, 2, 0, NOP_N, NOP_N
+PACK_6CH float, float, a, 2, 2, 0, NOP_N, NOP_N
+
+INIT_XMM sse ; SSE versions: six-channel pack/unpack without sample conversion
+PACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
+PACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N
+
+UNPACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
+UNPACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N
+
+INIT_XMM sse2 ; SSE2 versions: all plain conversions plus 2, 6 and 8 channel pack/unpack
+CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
+CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
+CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
+CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
+
+PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
+PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
+PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
+PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
+PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
+PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
+PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
+PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
+
+UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
+UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
+UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
+UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
+UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
+UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
+UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
+UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
+
+CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+
+PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+
+UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+
+PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+
+UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+
+PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
+PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N
+
+PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+
+INIT_XMM ssse3 ; SSSE3 versions: only the 16-bit-input stereo unpackers, which take the pshufb path in UNPACK_2CH
+UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
+UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
+UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
+UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
+UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+
+%if HAVE_AVX_EXTERNAL ; AVX versions are assembled only when HAVE_AVX_EXTERNAL is nonzero
+INIT_XMM avx
+PACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
+PACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N
+
+UNPACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
+UNPACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N
+
+PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+
+UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+
+PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
+PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N
+
+PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+
+INIT_YMM avx ; ymm version of int32 to float only
+CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+%endif
+
+%if HAVE_AVX2_EXTERNAL ; AVX2 version is assembled only when HAVE_AVX2_EXTERNAL is nonzero
+INIT_YMM avx2 ; ymm version of float to int32 only
+CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+%endif
diff --git a/libswresample/x86/audio_convert_init.c b/libswresample/x86/audio_convert_init.c
new file mode 100644
index 0000000000..bb89cf604b
--- /dev/null
+++ b/libswresample/x86/audio_convert_init.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2012 Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of libswresample
+ *
+ * libswresample is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libswresample is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libswresample; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libswresample/swresample_internal.h"
+#include "libswresample/audioconvert.h"
+
+#define PROTO(pre, in, out, cap) void ff ## pre ## in## _to_ ##out## _a_ ##cap(uint8_t **dst, const uint8_t **src, int len); /* one prototype: ff<pre><in>_to_<out>_a_<cap> */
+#define PROTO2(pre, out, cap) PROTO(pre, int16, out, cap) PROTO(pre, int32, out, cap) PROTO(pre, float, out, cap) /* all three input types for one output type */
+#define PROTO3(pre, cap) PROTO2(pre, int16, cap) PROTO2(pre, int32, cap) PROTO2(pre, float, cap) /* all nine type pairs for one CPU suffix */
+#define PROTO4(pre) PROTO3(pre, mmx) PROTO3(pre, sse) PROTO3(pre, sse2) PROTO3(pre, ssse3) PROTO3(pre, sse4) PROTO3(pre, avx) PROTO3(pre, avx2) /* the same for all seven CPU suffixes */
+PROTO4(_) /* plain conversions: ff_<in>_to_<out>_a_<cap>; declarations only, not every name is implemented in audio_convert.asm */
+PROTO4(_pack_2ch_) /* ff_pack_2ch_<in>_to_<out>_a_<cap> */
+PROTO4(_pack_6ch_) /* ff_pack_6ch_<in>_to_<out>_a_<cap> */
+PROTO4(_pack_8ch_) /* ff_pack_8ch_<in>_to_<out>_a_<cap> */
+PROTO4(_unpack_2ch_) /* ff_unpack_2ch_<in>_to_<out>_a_<cap> */
+PROTO4(_unpack_6ch_) /* ff_unpack_6ch_<in>_to_<out>_a_<cap> */
+
+av_cold void swri_audio_convert_init_x86(struct AudioConvert *ac, /* ac->simd_f receives the selected asm routine, or stays NULL when nothing matches */
+                                 enum AVSampleFormat out_fmt, /* destination sample format */
+                                 enum AVSampleFormat in_fmt, /* source sample format */
+                                 int channels){ /* channel count; only 2, 6 and 8 have dedicated pack/unpack routines */
+    int mm_flags = av_get_cpu_flags();
+
+    ac->simd_f= NULL; /* default: no SIMD routine; every matching test below overwrites it, so the last match wins */
+
+//FIXME add memcpy case
+
+#define MULTI_CAPS_FUNC(flag, cap) \
+    if (EXTERNAL_##flag(mm_flags)) {\
+        if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_S16 || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S16P)\
+            ac->simd_f =  ff_int16_to_int32_a_ ## cap;\
+        if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_S32P)\
+            ac->simd_f =  ff_int32_to_int16_a_ ## cap;\
+    }
+
+MULTI_CAPS_FUNC(MMX, mmx) /* int16 <-> int32 with the same layout (both packed or both planar) on each side */
+MULTI_CAPS_FUNC(SSE2, sse2) /* tested after MMX, so the SSE2 routine replaces the MMX one when both match */
+
+    if(EXTERNAL_MMX(mm_flags)) { /* MMX: six-channel planar to packed copy */
+        if(channels == 6) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_float_to_float_a_mmx; /* no sample conversion, so the float routine also serves S32 */
+        }
+    }
+    if(EXTERNAL_SSE(mm_flags)) { /* SSE: six-channel pack and unpack without conversion */
+        if(channels == 6) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_float_to_float_a_sse;
+
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_6ch_float_to_float_a_sse;
+        }
+    }
+    if(EXTERNAL_SSE2(mm_flags)) { /* SSE2: layout-preserving conversions first, then channel-count specific routines that override them */
+        if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
+            ac->simd_f =  ff_int32_to_float_a_sse2;
+        if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S16 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S16P)
+            ac->simd_f =  ff_int16_to_float_a_sse2;
+        if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLTP)
+            ac->simd_f =  ff_float_to_int32_a_sse2;
+        if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP)
+            ac->simd_f =  ff_float_to_int16_a_sse2;
+
+        if(channels == 2) { /* stereo: planar to packed (pack) and packed to planar (unpack) */
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_2ch_int32_to_int32_a_sse2; /* no sample conversion, so the int32 routine also serves float */
+            if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f =  ff_pack_2ch_int16_to_int16_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f =  ff_pack_2ch_int16_to_int32_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_2ch_int32_to_int16_a_sse2;
+
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_2ch_int32_to_int32_a_sse2; /* no sample conversion, so the int32 routine also serves float */
+            if(   out_fmt == AV_SAMPLE_FMT_S16P  && in_fmt == AV_SAMPLE_FMT_S16)
+                ac->simd_f =  ff_unpack_2ch_int16_to_int16_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32P  && in_fmt == AV_SAMPLE_FMT_S16)
+                ac->simd_f =  ff_unpack_2ch_int16_to_int32_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S16P  && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_2ch_int32_to_int16_a_sse2;
+
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_2ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_2ch_float_to_int32_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f =  ff_pack_2ch_int16_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_2ch_float_to_int16_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_2ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32P  && in_fmt == AV_SAMPLE_FMT_FLT)
+                ac->simd_f =  ff_unpack_2ch_float_to_int32_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_S16)
+                ac->simd_f =  ff_unpack_2ch_int16_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S16P  && in_fmt == AV_SAMPLE_FMT_FLT)
+                ac->simd_f =  ff_unpack_2ch_float_to_int16_a_sse2;
+        }
+        if(channels == 6) { /* six channels: only the converting variants; the non-converting ones were chosen in the SSE block */
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_6ch_float_to_int32_a_sse2;
+
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_6ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32P  && in_fmt == AV_SAMPLE_FMT_FLT)
+                ac->simd_f =  ff_unpack_6ch_float_to_int32_a_sse2;
+        }
+        if(channels == 8) { /* eight channels: pack only, no unpack routine is selected anywhere in this function */
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_8ch_float_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_8ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_8ch_float_to_int32_a_sse2;
+        }
+    }
+    if(EXTERNAL_SSSE3(mm_flags)) { /* SSSE3: replaces the SSE2 stereo unpackers for 16-bit input */
+        if(channels == 2) {
+            if(   out_fmt == AV_SAMPLE_FMT_S16P  && in_fmt == AV_SAMPLE_FMT_S16)
+                ac->simd_f =  ff_unpack_2ch_int16_to_int16_a_ssse3;
+            if(   out_fmt == AV_SAMPLE_FMT_S32P  && in_fmt == AV_SAMPLE_FMT_S16)
+                ac->simd_f =  ff_unpack_2ch_int16_to_int32_a_ssse3;
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_S16)
+                ac->simd_f =  ff_unpack_2ch_int16_to_float_a_ssse3;
+        }
+    }
+    if(EXTERNAL_AVX_FAST(mm_flags)) { /* AVX, gated by the _FAST flavour of the check: int32 to float conversion */
+        if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
+            ac->simd_f =  ff_int32_to_float_a_avx;
+    }
+    if(EXTERNAL_AVX(mm_flags)) { /* AVX: six and eight channel routines replace the SSE/SSE2 choices */
+        if(channels == 6) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_float_to_float_a_avx;
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_6ch_int32_to_float_a_avx;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_6ch_float_to_int32_a_avx;
+
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_6ch_float_to_float_a_avx;
+            if(   out_fmt == AV_SAMPLE_FMT_FLTP  && in_fmt == AV_SAMPLE_FMT_S32)
+                ac->simd_f =  ff_unpack_6ch_int32_to_float_a_avx;
+            if(   out_fmt == AV_SAMPLE_FMT_S32P  && in_fmt == AV_SAMPLE_FMT_FLT)
+                ac->simd_f =  ff_unpack_6ch_float_to_int32_a_avx;
+        }
+        if(channels == 8) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_8ch_float_to_float_a_avx;
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_8ch_int32_to_float_a_avx;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_8ch_float_to_int32_a_avx;
+        }
+    }
+    if(EXTERNAL_AVX2_FAST(mm_flags)) { /* AVX2, gated by the _FAST flavour of the check: float to int32 conversion */
+        if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLTP)
+            ac->simd_f =  ff_float_to_int32_a_avx2;
+    }
+}
diff --git a/libswresample/x86/rematrix.asm b/libswresample/x86/rematrix.asm
new file mode 100644
index 0000000000..7984b9a729
--- /dev/null
+++ b/libswresample/x86/rematrix.asm
@@ -0,0 +1,250 @@
+;******************************************************************************
+;* Copyright (c) 2012 Michael Niedermayer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+
+SECTION_RODATA 32
+dw1: times 8  dd 1 ; eight dwords of 1: shifted in MIX2_INT16 to build the rounding term
+w1 : times 16 dw 1 ; sixteen words of 1: rounding base and interleave partner in MIX1_INT16
+
+SECTION .text
+
+%macro MIX2_FLT 1 ; %1 = a (aligned) or u (unaligned): out[i] = coeffp[index1]*in1[i] + coeffp[index2]*in2[i] on floats
+cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
+%ifidn %1, a
+    test in1q, mmsize-1 ; aligned entry point: if any of the three pointers is not mmsize-aligned ...
+        jne mix_2_1_float_u_int %+ SUFFIX ; ... continue in the u variant's body (same cglobal layout, so registers match)
+    test in2q, mmsize-1
+        jne mix_2_1_float_u_int %+ SUFFIX
+    test outq, mmsize-1
+        jne mix_2_1_float_u_int %+ SUFFIX
+%else
+mix_2_1_float_u_int %+ SUFFIX: ; fallback target of the a variant, placed right after the u prologue
+%endif
+    VBROADCASTSS m4, [coeffpq + 4*index1q] ; m4 = coeffp[index1] in every lane
+    VBROADCASTSS m5, [coeffpq + 4*index2q] ; m5 = coeffp[index2] in every lane
+    shl lend    , 2 ; len: float count -> byte count
+    add in1q    , lenq ; move all three pointers to the end of their buffers ...
+    add in2q    , lenq
+    add outq    , lenq
+    neg lenq ; ... and index them with a negative byte offset that counts up to zero
+.next: ; NOTE(review): no tail handling, presumably len is a multiple of 2*mmsize/4 floats - confirm against the caller
+%ifidn %1, a
+    mulps        m0, m4, [in1q + lenq         ] ; aligned variant multiplies straight from memory
+    mulps        m1, m5, [in2q + lenq         ]
+    mulps        m2, m4, [in1q + lenq + mmsize]
+    mulps        m3, m5, [in2q + lenq + mmsize]
+%else
+    movu         m0, [in1q + lenq         ] ; unaligned variant loads with movu first
+    movu         m1, [in2q + lenq         ]
+    movu         m2, [in1q + lenq + mmsize]
+    movu         m3, [in2q + lenq + mmsize]
+    mulps        m0, m0, m4
+    mulps        m1, m1, m5
+    mulps        m2, m2, m4
+    mulps        m3, m3, m5
+%endif
+    addps        m0, m0, m1 ; sum of the two scaled inputs, first register
+    addps        m2, m2, m3 ; sum of the two scaled inputs, second register
+    mov%1  [outq + lenq         ], m0 ; mova or movu depending on the variant
+    mov%1  [outq + lenq + mmsize], m2
+    add        lenq, mmsize*2 ; two registers are processed per iteration
+        jl .next ; keep going while the offset is still negative
+    REP_RET
+%endmacro
+
+%macro MIX1_FLT 1 ; %1 = a (aligned) or u (unaligned): out[i] = coeffp[index]*in[i] on floats
+cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
+%ifidn %1, a
+    test inq, mmsize-1 ; aligned entry point: a misaligned in or out pointer ...
+        jne mix_1_1_float_u_int %+ SUFFIX ; ... is handled by the u variant's body (same cglobal layout)
+    test outq, mmsize-1
+        jne mix_1_1_float_u_int %+ SUFFIX
+%else
+mix_1_1_float_u_int %+ SUFFIX: ; fallback target of the a variant, placed right after the u prologue
+%endif
+    VBROADCASTSS m2, [coeffpq + 4*indexq] ; m2 = coeffp[index] in every lane
+    shl lenq    , 2 ; len: float count -> byte count
+    add inq     , lenq ; point both buffers at their ends ...
+    add outq    , lenq
+    neg lenq ; ... and walk a negative byte offset up to zero
+.next: ; NOTE(review): no tail handling, presumably len is a multiple of 2*mmsize/4 floats - confirm against the caller
+%ifidn %1, a
+    mulps        m0, m2, [inq + lenq         ] ; aligned variant multiplies straight from memory
+    mulps        m1, m2, [inq + lenq + mmsize]
+%else
+    movu         m0, [inq + lenq         ] ; unaligned variant loads with movu first
+    movu         m1, [inq + lenq + mmsize]
+    mulps        m0, m0, m2
+    mulps        m1, m1, m2
+%endif
+    mov%1  [outq + lenq         ], m0 ; mova or movu depending on the variant
+    mov%1  [outq + lenq + mmsize], m1
+    add        lenq, mmsize*2 ; two registers are processed per iteration
+        jl .next ; keep going while the offset is still negative
+    REP_RET
+%endmacro
+
+%macro MIX1_INT16 1 ; %1 = a or u: out[i] = sat16((in[i]*coeff + round) >> shift), coeff/shift taken from coeffp[index]
+cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
+%ifidn %1, a
+    test inq, mmsize-1 ; aligned entry point: a misaligned in or out pointer ...
+        jne mix_1_1_int16_u_int %+ SUFFIX ; ... is handled by the u variant's body (same cglobal layout)
+    test outq, mmsize-1
+        jne mix_1_1_int16_u_int %+ SUFFIX
+%else
+mix_1_1_int16_u_int %+ SUFFIX: ; fallback target of the a variant, placed right after the u prologue
+%endif
+    movd   m4, [coeffpq + 4*indexq] ; 32-bit table entry: low word = coefficient, high word = shift
+    SPLATW m5, m4 ; m5 = coefficient (low word of the entry) in every word
+    psllq  m4, 32 ; these two shifts isolate the high word of the entry ...
+    psrlq  m4, 48 ; ... so m4 = shift count for the psllw/psrad below
+    mova   m0, [w1]
+    psllw  m0, m4 ; 1 << shift in every word ...
+    psrlw  m0, 1 ; ... halved: rounding term 1 << (shift-1)
+    punpcklwd m5, m0 ; m5 = (coefficient, rounding) word pairs for pmaddwd
+    add lenq    , lenq ; len: int16 count -> byte count
+    add inq     , lenq ; point both buffers at their ends ...
+    add outq    , lenq
+    neg lenq ; ... and walk a negative byte offset up to zero
+.next: ; NOTE(review): no tail handling, presumably len is a multiple of mmsize samples - confirm against the caller
+    mov%1        m0, [inq + lenq         ]
+    mov%1        m2, [inq + lenq + mmsize]
+    mova         m1, m0
+    mova         m3, m2
+    punpcklwd    m0, [w1] ; pair every sample with a constant 1 ...
+    punpckhwd    m1, [w1]
+    punpcklwd    m2, [w1]
+    punpckhwd    m3, [w1]
+    pmaddwd      m0, m5 ; ... so each dword becomes sample*coefficient + 1*rounding
+    pmaddwd      m1, m5
+    pmaddwd      m2, m5
+    pmaddwd      m3, m5
+    psrad        m0, m4 ; scale back by the shift stored with the coefficient
+    psrad        m1, m4
+    psrad        m2, m4
+    psrad        m3, m4
+    packssdw     m0, m1 ; back to int16 with signed saturation
+    packssdw     m2, m3
+    mov%1  [outq + lenq         ], m0
+    mov%1  [outq + lenq + mmsize], m2
+    add        lenq, mmsize*2 ; two registers are processed per iteration
+        jl .next ; keep going while the offset is still negative
+%if mmsize == 8
+    emms ; 8-byte (MMX) build: clear the MMX state before returning
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+%macro MIX2_INT16 1 ; %1 = a or u: out[i] = sat16((in1[i]*c1 + in2[i]*c2 + round) >> shift), c1/c2 from coeffp[index1/index2]
+cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
+%ifidn %1, a
+    test in1q, mmsize-1 ; aligned entry point: if any of the three pointers is not mmsize-aligned ...
+        jne mix_2_1_int16_u_int %+ SUFFIX ; ... continue in the u variant's body (same cglobal layout)
+    test in2q, mmsize-1
+        jne mix_2_1_int16_u_int %+ SUFFIX
+    test outq, mmsize-1
+        jne mix_2_1_int16_u_int %+ SUFFIX
+%else
+mix_2_1_int16_u_int %+ SUFFIX: ; fallback target of the a variant, placed right after the u prologue
+%endif
+    movd   m4, [coeffpq + 4*index1q] ; 32-bit entry for in1: low word = coefficient, high word = shift
+    movd   m6, [coeffpq + 4*index2q] ; 32-bit entry for in2, only its coefficient word is used
+    SPLATW m5, m4 ; m5 = c1 in every word
+    SPLATW m6, m6 ; m6 = c2 in every word
+    psllq  m4, 32 ; isolate the high word of the first entry ...
+    psrlq  m4, 48 ; ... m4 = shift (presumably both entries share it - confirm against the table builder)
+    mova   m7, [dw1]
+    pslld  m7, m4 ; 1 << shift in every dword ...
+    psrld  m7, 1 ; ... halved: rounding term 1 << (shift-1)
+    punpcklwd m5, m6 ; m5 = (c1, c2) word pairs for pmaddwd
+    add lend    , lend ; len: int16 count -> byte count
+    add in1q    , lenq ; point all three buffers at their ends ...
+    add in2q    , lenq
+    add outq    , lenq
+    neg lenq ; ... and walk a negative byte offset up to zero
+.next: ; NOTE(review): no tail handling, presumably len is a multiple of mmsize samples - confirm against the caller
+    mov%1        m0, [in1q + lenq         ]
+    mov%1        m2, [in2q + lenq         ]
+    mova         m1, m0
+    punpcklwd    m0, m2 ; interleave in1/in2 samples into (in1, in2) word pairs
+    punpckhwd    m1, m2
+
+    mov%1        m2, [in1q + lenq + mmsize]
+    mov%1        m6, [in2q + lenq + mmsize] ; m6 is free again: c2 already lives in m5
+    mova         m3, m2
+    punpcklwd    m2, m6
+    punpckhwd    m3, m6
+
+    pmaddwd      m0, m5 ; each dword = in1*c1 + in2*c2
+    pmaddwd      m1, m5
+    pmaddwd      m2, m5
+    pmaddwd      m3, m5
+    paddd        m0, m7 ; add the rounding term before shifting
+    paddd        m1, m7
+    paddd        m2, m7
+    paddd        m3, m7
+    psrad        m0, m4 ; scale back by the shift stored with the coefficient
+    psrad        m1, m4
+    psrad        m2, m4
+    psrad        m3, m4
+    packssdw     m0, m1 ; back to int16 with signed saturation
+    packssdw     m2, m3
+    mov%1  [outq + lenq         ], m0
+    mov%1  [outq + lenq + mmsize], m2
+    add        lenq, mmsize*2 ; two registers are processed per iteration
+        jl .next ; keep going while the offset is still negative
+%if mmsize == 8
+    emms ; 8-byte (MMX) build: clear the MMX state before returning
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+
+INIT_MMX mmx ; int16 mixers, MMX build (the macros end with emms when mmsize == 8)
+MIX1_INT16 u ; every u variant defines the *_u_int label that the matching a variant jumps to
+MIX1_INT16 a
+MIX2_INT16 u
+MIX2_INT16 a
+
+INIT_XMM sse ; float mixers, SSE build
+MIX2_FLT u
+MIX2_FLT a
+MIX1_FLT u
+MIX1_FLT a
+
+INIT_XMM sse2 ; int16 mixers, SSE2 build
+MIX1_INT16 u
+MIX1_INT16 a
+MIX2_INT16 u
+MIX2_INT16 a
+
+%if HAVE_AVX_EXTERNAL ; float mixers, AVX build, only when the assembler supports AVX
+INIT_YMM avx
+MIX2_FLT u
+MIX2_FLT a
+MIX1_FLT u
+MIX1_FLT a
+%endif
diff --git a/libswresample/x86/rematrix_init.c b/libswresample/x86/rematrix_init.c
new file mode 100644
index 0000000000..5f2c5fe170
--- /dev/null
+++ b/libswresample/x86/rematrix_init.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2012 Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of libswresample
+ *
+ * libswresample is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libswresample is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libswresample; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libswresample/swresample_internal.h"
+
+mix_1_1_func_type ff_mix_1_1_a_float_sse;  /* prototypes of the aligned-entry asm mixers,  */
+mix_2_1_func_type ff_mix_2_1_a_float_sse;  /* spelled out so the symbol names are greppable */
+mix_1_1_func_type ff_mix_1_1_a_float_avx;
+mix_2_1_func_type ff_mix_2_1_a_float_avx;
+mix_1_1_func_type ff_mix_1_1_a_int16_mmx;
+mix_2_1_func_type ff_mix_2_1_a_int16_mmx;
+mix_1_1_func_type ff_mix_1_1_a_int16_sse2;
+mix_2_1_func_type ff_mix_2_1_a_int16_sse2;
+
+av_cold int swri_rematrix_init_x86(struct SwrContext *s){
+#if HAVE_YASM
+    const int cpu_flags = av_get_cpu_flags();
+    const int nb_in     = av_get_channel_layout_nb_channels(s->in_ch_layout);
+    const int nb_out    = av_get_channel_layout_nb_channels(s->out_ch_layout);
+    const int num       = nb_in * nb_out; /* number of mixing coefficients */
+    int row, col;
+
+    s->mix_1_1_simd = NULL; /* stay NULL unless a SIMD mixer is selected below */
+    s->mix_2_1_simd = NULL;
+
+    if (s->midbuf.fmt == AV_SAMPLE_FMT_S16P) {
+        if (EXTERNAL_SSE2(cpu_flags)) {          /* SSE2 takes precedence over MMX */
+            s->mix_1_1_simd = ff_mix_1_1_a_int16_sse2;
+            s->mix_2_1_simd = ff_mix_2_1_a_int16_sse2;
+        } else if (EXTERNAL_MMX(cpu_flags)) {
+            s->mix_1_1_simd = ff_mix_1_1_a_int16_mmx;
+            s->mix_2_1_simd = ff_mix_2_1_a_int16_mmx;
+        }
+        s->native_simd_matrix = av_mallocz_array(num, 2 * sizeof(int16_t));
+        s->native_simd_one    = av_mallocz(2 * sizeof(int16_t));
+        if (!(s->native_simd_matrix && s->native_simd_one))
+            return AVERROR(ENOMEM);
+
+        for (row = 0; row < nb_out; row++) {
+            const int *coeffs = (const int *)s->native_matrix + row * nb_in;
+            int16_t   *packed = (int16_t *)s->native_simd_matrix + 2 * row * nb_in;
+            int peak = 0, sh;
+            for (col = 0; col < nb_in; col++)
+                peak = FFMAX(peak, FFABS(coeffs[col]));
+            sh = FFMAX(av_log2(peak) - 14, 0);     /* one down-scale per output row */
+            for (col = 0; col < nb_in; col++) {
+                packed[2 * col]     = (coeffs[col] + (1 << sh >> 1)) >> sh;
+                packed[2 * col + 1] = 15 - sh;     /* right shift stored next to the coefficient */
+            }
+        }
+        ((int16_t *)s->native_simd_one)[0] = 16384;
+        ((int16_t *)s->native_simd_one)[1] = 14;
+    } else if (s->midbuf.fmt == AV_SAMPLE_FMT_FLTP) {
+        if (EXTERNAL_AVX_FAST(cpu_flags)) {      /* AVX takes precedence over SSE */
+            s->mix_1_1_simd = ff_mix_1_1_a_float_avx;
+            s->mix_2_1_simd = ff_mix_2_1_a_float_avx;
+        } else if (EXTERNAL_SSE(cpu_flags)) {
+            s->mix_1_1_simd = ff_mix_1_1_a_float_sse;
+            s->mix_2_1_simd = ff_mix_2_1_a_float_sse;
+        }
+        s->native_simd_matrix = av_mallocz_array(num, sizeof(float));
+        s->native_simd_one    = av_mallocz(sizeof(float));
+        if (!(s->native_simd_matrix && s->native_simd_one))
+            return AVERROR(ENOMEM);
+        /* the float tables are copied unchanged */
+        memcpy(s->native_simd_matrix, s->native_matrix, num * sizeof(float));
+        memcpy(s->native_simd_one, s->native_one, sizeof(float));
+    }
+#endif
+
+    return 0;
+}
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
new file mode 100644
index 0000000000..4989aa6991
--- /dev/null
+++ b/libswresample/x86/resample.asm
@@ -0,0 +1,605 @@
+;******************************************************************************
+;* Copyright (c) 2012 Michael Niedermayer
+;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
+;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64 ; "pointer" is the struc reservation for a pointer-sized member: a qword here ...
+%define pointer resq
+%else ; ... and a dword on x86-32
+%define pointer resd
+%endif
+
+struc ResampleContext ; field offsets used by the code below; presumably mirrors the start of the C ResampleContext - keep in sync
+    .av_class:              pointer 1 ; not accessed in this file, reserves the slot
+    .filter_bank:           pointer 1 ; base address of the filter coefficients
+    .filter_length:         resd 1 ; scaled by the sample size to get the per-output byte count
+    .filter_alloc:          resd 1 ; multiplied by index to select a filter inside filter_bank
+    .ideal_dst_incr:        resd 1 ; not accessed in this file, reserves the slot
+    .dst_incr:              resd 1 ; not accessed in this file, reserves the slot
+    .dst_incr_div:          resd 1 ; added to index after every output sample
+    .dst_incr_mod:          resd 1 ; added to frac after every output sample
+    .index:                 resd 1 ; loaded on entry, written back when update_ctx is non-zero
+    .frac:                  resd 1 ; loaded on entry, written back when update_ctx is non-zero
+    .src_incr:              resd 1 ; frac wraps (and index is incremented) once it reaches this value
+    .compensation_distance: resd 1 ; not accessed in this file, reserves the slot
+    .phase_shift:           resd 1 ; index >> phase_shift = number of source samples to advance
+    .phase_mask:            resd 1 ; index & phase_mask = index kept for the next output
+
+    ; there's a few more here but we only care about the first few
+endstruc
+
+SECTION_RODATA
+
+pf_1:      dd 1.0 ; float 1.0, passed as the 1.0 constant of the float instantiations
+pdbl_1:    dq 1.0 ; double 1.0, passed as the 1.0 constant of the double instantiation
+pd_0x4000: dd 0x4000 ; initial int16 accumulator value: half an LSB ahead of the >> 15
+
+SECTION .text
+
+%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
+; int resample_common_$format(ResampleContext *ctx, $format *dst, const $format *src, int size, int update_ctx)
+; Non-interpolating path: each output sample is the dot product of src with the filter_bank entry selected by index.
+%if ARCH_X86_64 ; unix64 and win64
+cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
+                                      dst_incr_mod, size, min_filter_count_x4, \
+                                      min_filter_len_x4, dst_incr_div, src_incr, \
+                                      phase_mask, dst_end, filter_bank
+
+    ; use red-zone for variable storage (NOTE(review): the same slots below rsp are used on WIN64 too - confirm that is safe)
+%define ctx_stackq            [rsp-0x8]
+%define src_stackq            [rsp-0x10]
+%if WIN64
+%define update_context_stackd r4m
+%else ; unix64
+%define update_context_stackd [rsp-0x14]
+%endif
+
+    ; load as many variables in registers as possible; for the rest, store
+    ; on stack so that we have 'ctx' available as one extra register
+    mov                        sized, r3d ; the size argument arrives in r3, which is named phase_shift here
+    mov                  phase_maskd, [ctxq+ResampleContext.phase_mask]
+%if UNIX64
+    mov        update_context_stackd, r4d ; spill update_ctx, its register gets the name index
+%endif
+    mov                       indexd, [ctxq+ResampleContext.index]
+    mov                        fracd, [ctxq+ResampleContext.frac]
+    mov                dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
+    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
+    mov                    src_incrd, [ctxq+ResampleContext.src_incr]
+    mov                   ctx_stackq, ctxq ; keep ctx for the write-back at the end
+    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
+    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
+    shl           min_filter_len_x4d, %3 ; filter length: samples -> bytes
+    lea                     dst_endq, [dstq+sizeq*%2] ; dst_end = dst + size samples
+
+%if UNIX64
+    mov                          ecx, [ctxq+ResampleContext.phase_shift]
+    mov                          edi, [ctxq+ResampleContext.filter_alloc] ; last use of ctx: its register becomes filter_alloc
+
+    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
+                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
+                src_incr, phase_mask, dst_end, filter_bank
+%elif WIN64
+    mov                          R9d, [ctxq+ResampleContext.filter_alloc]
+    mov                          ecx, [ctxq+ResampleContext.phase_shift] ; last use of ctx: its register becomes phase_shift
+
+    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
+                filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
+                src_incr, phase_mask, dst_end, filter_bank
+%endif
+
+    neg           min_filter_len_x4q ; negative byte count: the inner loop counts it up to zero
+    sub                 filter_bankq, min_filter_len_x4q ; bias both pointers by +length so [ptr + negative offset] starts at element 0
+    sub                         srcq, min_filter_len_x4q
+    mov                   src_stackq, srcq ; remember the (biased) start to compute the consumed count
+%else ; x86-32
+cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
+                                     index, min_filter_length_x4, filter_bank
+
+    ; push temp variables to stack
+%define ctx_stackq            r0mp
+%define src_stackq            r2mp
+%define update_context_stackd r4m
+
+    mov                         dstq, r1mp
+    mov                           r3, r3mp ; r3 = size ...
+    lea                           r3, [dstq+r3*%2] ; ... turned into dst_end
+    PUSH                              dword [ctxq+ResampleContext.dst_incr_div] ; eight pushes in total, released by ADD rsp, 0x20 below
+    PUSH                              dword [ctxq+ResampleContext.dst_incr_mod]
+    PUSH                              dword [ctxq+ResampleContext.filter_alloc]
+    PUSH                              r3
+    PUSH                              dword [ctxq+ResampleContext.phase_mask]
+    PUSH                              dword [ctxq+ResampleContext.src_incr]
+    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
+    mov                       indexd, [ctxq+ResampleContext.index]
+    shl        min_filter_length_x4d, %3 ; filter length: samples -> bytes
+    mov                        fracd, [ctxq+ResampleContext.frac]
+    neg        min_filter_length_x4q ; negative byte count for the inner loop
+    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
+    sub                         r2mp, min_filter_length_x4q ; bias the src argument in place, this slot doubles as src_stackq
+    sub                 filter_bankq, min_filter_length_x4q
+    PUSH                              min_filter_length_x4q
+    PUSH                              filter_bankq
+    mov                 phase_shiftd, [ctxq+ResampleContext.phase_shift]
+
+    DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter
+
+%define filter_bankq          dword [rsp+0x0]
+%define min_filter_length_x4q dword [rsp+0x4]
+%define src_incrd             dword [rsp+0x8]
+%define phase_maskd           dword [rsp+0xc]
+%define dst_endq              dword [rsp+0x10]
+%define filter_allocd         dword [rsp+0x14]
+%define dst_incr_modd         dword [rsp+0x18]
+%define dst_incr_divd         dword [rsp+0x1c]
+
+    mov                         srcq, r2mp
+%endif
+
+.loop: ; one output sample per iteration; do-while shape, so size is presumably > 0 - confirm against the caller
+    mov                      filterd, filter_allocd
+    imul                     filterd, indexd ; element offset of the filter for the current index
+%if ARCH_X86_64
+    mov         min_filter_count_x4q, min_filter_len_x4q
+    lea                      filterq, [filter_bankq+filterq*%2]
+%else ; x86-32
+    mov         min_filter_count_x4q, filter_bankq ; borrow the counter register to fetch filter_bank from the stack
+    lea                      filterq, [min_filter_count_x4q+filterq*%2]
+    mov         min_filter_count_x4q, min_filter_length_x4q
+%endif
+%ifidn %1, int16
+    movd                          m0, [pd_0x4000] ; start the accumulator at the rounding term for the >> 15
+%else ; float/double
+    xorps                         m0, m0, m0
+%endif
+
+    align 16
+.inner_loop: ; accumulate src[k] * filter[k], mmsize bytes per iteration
+    movu                          m1, [srcq+min_filter_count_x4q*1]
+%ifidn %1, int16
+%if cpuflag(xop)
+    vpmadcswd                     m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
+    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
+    paddd                         m0, m1
+%endif
+%else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
+    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
+    addp%4                        m0, m0, m1
+%endif ; cpuflag
+%endif
+    add         min_filter_count_x4q, mmsize
+    js .inner_loop ; until the negative byte offset reaches zero
+
+%ifidn %1, int16
+    HADDD                         m0, m1 ; horizontal sum of the dword partial sums
+    psrad                         m0, 15
+    add                        fracd, dst_incr_modd
+    packssdw                      m0, m0 ; saturate to int16
+    add                       indexd, dst_incr_divd
+    movd                      [dstq], m0 ; writes a full dword although dst advances by one sample - NOTE(review): confirm dst has room
+%else ; float/double
+    ; horizontal sum & store
+%if mmsize == 32
+    vextractf128                 xm1, m0, 0x1
+    addps                        xm0, xm1
+%endif
+    movhlps                      xm1, xm0
+%ifidn %1, float
+    addps                        xm0, xm1
+    shufps                       xm1, xm0, xm0, q0001
+%endif
+    add                        fracd, dst_incr_modd
+    addp%4                       xm0, xm1
+    add                       indexd, dst_incr_divd
+    movs%4                    [dstq], xm0 ; store one scalar result
+%endif
+    cmp                        fracd, src_incrd
+    jl .skip
+    sub                        fracd, src_incrd ; frac wrapped: carry one step into index
+    inc                       indexd
+
+%if UNIX64
+    DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
+                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
+                src_incr, phase_mask, dst_end, filter_bank
+%elif WIN64
+    DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
+                index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
+                src_incr, phase_mask, dst_end, filter_bank
+%else ; x86-32
+    DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
+%endif
+
+.skip: ; index_incr reuses a register that is dead here (filter on x86-64, min_filter_count_x4 on x86-32)
+    mov                  index_incrd, indexd
+    add                         dstq, %2 ; next output sample
+    and                       indexd, phase_maskd ; keep only the masked part of index
+    sar                  index_incrd, phase_shiftb ; the part above phase_shift = source samples to advance
+    lea                         srcq, [srcq+index_incrq*%2]
+    cmp                         dstq, dst_endq
+    jne .loop
+
+%if ARCH_X86_64
+    DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
+%else ; x86-32
+    DEFINE_ARGS src, ctx, update_context, frac, index
+%endif
+
+    cmp  dword update_context_stackd, 0
+    jz .skip_store
+    ; strictly speaking, the function should always return the consumed
+    ; number of samples; however, we only use the value if update_context
+    ; is true, so let's just leave it uninitialized otherwise
+    mov                         ctxq, ctx_stackq
+    movifnidn                    rax, srcq
+    mov [ctxq+ResampleContext.frac ], fracd
+    sub                          rax, src_stackq ; bytes advanced since entry ...
+    mov [ctxq+ResampleContext.index], indexd
+    shr                          rax, %3 ; ... converted to samples: the return value
+
+.skip_store:
+%if ARCH_X86_32
+    ADD                          rsp, 0x20 ; drop the eight dwords pushed in the prologue
+%endif
+    RET
+
+; int resample_linear_$format(ResampleContext *ctx, $format *dst, const $format *src, int size, int update_ctx)
+; Interpolating path: evaluates the filters at index and at the following one, then blends them by frac / src_incr.
+%if ARCH_X86_64 ; unix64 and win64
+%if UNIX64
+cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
+                                      size, dst_incr_mod, min_filter_count_x4, \
+                                      min_filter_len_x4, dst_incr_div, src_incr, \
+                                      src, dst_end, filter_bank
+
+    mov                         srcq, r2mp ; move src out of its argument register (named phase_mask here, filter2 later)
+%else ; win64
+cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
+                                      size, dst_incr_mod, min_filter_count_x4, \
+                                      min_filter_len_x4, dst_incr_div, src_incr, \
+                                      dst, dst_end, filter_bank
+
+    mov                         dstq, r1mp ; move dst out of its argument register (named phase_mask here, filter2 later)
+%endif
+
+    ; use red-zone for variable storage (NOTE(review): the same slots below rsp are used on WIN64 too - confirm that is safe)
+%define ctx_stackq            [rsp-0x8]
+%define src_stackq            [rsp-0x10]
+%define phase_mask_stackd     [rsp-0x14]
+%if WIN64
+%define update_context_stackd r4m
+%else ; unix64
+%define update_context_stackd [rsp-0x18]
+%endif
+
+    ; load as many variables in registers as possible; for the rest, store
+    ; on stack so that we have 'ctx' available as one extra register
+    mov                        sized, r3d ; the size argument arrives in r3, which is named phase_shift here
+    mov                  phase_maskd, [ctxq+ResampleContext.phase_mask]
+%if UNIX64
+    mov        update_context_stackd, r4d ; spill update_ctx, its register gets the name index
+%endif
+    mov                       indexd, [ctxq+ResampleContext.index]
+    mov                        fracd, [ctxq+ResampleContext.frac]
+    mov                dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
+    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
+    mov                    src_incrd, [ctxq+ResampleContext.src_incr]
+    mov                   ctx_stackq, ctxq ; keep ctx for the write-back at the end
+    mov            phase_mask_stackd, phase_maskd ; phase_mask lives on the stack, its register is reused as filter2
+    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
+%ifidn %1, int16
+    movd                          m4, [pd_0x4000] ; m4 = rounding term, copied into both accumulators per sample
+%else ; float/double
+    cvtsi2s%4                    xm0, src_incrd
+    movs%4                       xm4, [%5]
+    divs%4                       xm4, xm0 ; xm4 = 1.0 / src_incr, computed once
+%endif
+    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
+    shl           min_filter_len_x4d, %3 ; filter length: samples -> bytes
+    lea                     dst_endq, [dstq+sizeq*%2] ; dst_end = dst + size samples
+
+%if UNIX64
+    mov                          ecx, [ctxq+ResampleContext.phase_shift]
+    mov                          edi, [ctxq+ResampleContext.filter_alloc] ; last use of ctx: its register becomes filter_alloc
+
+    DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, src, dst_end, filter_bank
+%elif WIN64
+    mov                          R9d, [ctxq+ResampleContext.filter_alloc]
+    mov                          ecx, [ctxq+ResampleContext.phase_shift] ; last use of ctx: its register becomes phase_shift
+
+    DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, dst, dst_end, filter_bank
+%endif
+
+    neg           min_filter_len_x4q ; negative byte count: the inner loop counts it up to zero
+    sub                 filter_bankq, min_filter_len_x4q ; bias both pointers by +length so [ptr + negative offset] starts at element 0
+    sub                         srcq, min_filter_len_x4q
+    mov                   src_stackq, srcq ; remember the (biased) start to compute the consumed count
+%else ; x86-32
+cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
+                                     frac, index, dst, filter_bank
+
+    ; push temp variables to stack
+%define ctx_stackq            r0mp
+%define src_stackq            r2mp
+%define update_context_stackd r4m
+
+    mov                         dstq, r1mp
+    mov                           r3, r3mp ; r3 = size ...
+    lea                           r3, [dstq+r3*%2] ; ... turned into dst_end
+    PUSH                              dword [ctxq+ResampleContext.dst_incr_div] ; ten pushes in total, released by ADD rsp, 0x28 below
+    PUSH                              r3
+    mov                           r3, dword [ctxq+ResampleContext.filter_alloc]
+    PUSH                              dword [ctxq+ResampleContext.dst_incr_mod]
+    PUSH                              r3
+    shl                           r3, %3 ; filter_alloc in bytes = distance from filter1 to filter2
+    PUSH                              r3
+    mov                           r3, dword [ctxq+ResampleContext.src_incr]
+    PUSH                              dword [ctxq+ResampleContext.phase_mask]
+    PUSH                              r3d
+%ifidn %1, int16
+    movd                          m4, [pd_0x4000] ; m4 = rounding term, copied into both accumulators per sample
+%else ; float/double
+    cvtsi2s%4                    xm0, r3d
+    movs%4                       xm4, [%5]
+    divs%4                       xm4, xm0 ; xm4 = 1.0 / src_incr, computed once
+%endif
+    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
+    mov                       indexd, [ctxq+ResampleContext.index]
+    shl        min_filter_length_x4d, %3 ; filter length: samples -> bytes
+    mov                        fracd, [ctxq+ResampleContext.frac]
+    neg        min_filter_length_x4q ; negative byte count for the inner loop
+    mov                 filter_bankq, [ctxq+ResampleContext.filter_bank]
+    sub                         r2mp, min_filter_length_x4q ; bias the src argument in place, this slot doubles as src_stackq
+    sub                 filter_bankq, min_filter_length_x4q
+    PUSH                              min_filter_length_x4q
+    PUSH                              filter_bankq
+    PUSH                              dword [ctxq+ResampleContext.phase_shift]
+
+    DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
+
+%define phase_shift_stackd    dword [rsp+0x0]
+%define filter_bankq          dword [rsp+0x4]
+%define min_filter_length_x4q dword [rsp+0x8]
+%define src_incrd             dword [rsp+0xc]
+%define phase_mask_stackd     dword [rsp+0x10]
+%define filter_alloc_x4q      dword [rsp+0x14]
+%define filter_allocd         dword [rsp+0x18]
+%define dst_incr_modd         dword [rsp+0x1c]
+%define dst_endq              dword [rsp+0x20]
+%define dst_incr_divd         dword [rsp+0x24]
+
+    mov                         srcq, r2mp
+%endif
+
+.loop: ; one output sample per iteration; do-while shape, so size is presumably > 0 - confirm against the caller
+    mov                     filter1d, filter_allocd
+    imul                    filter1d, indexd ; element offset of the filter for the current index
+%if ARCH_X86_64
+    mov         min_filter_count_x4q, min_filter_len_x4q
+    lea                     filter1q, [filter_bankq+filter1q*%2]
+    lea                     filter2q, [filter1q+filter_allocq*%2] ; filter2 = the filter that follows filter1 in the bank
+%else ; x86-32
+    mov         min_filter_count_x4q, filter_bankq ; borrow the counter register to fetch filter_bank from the stack
+    lea                     filter1q, [min_filter_count_x4q+filter1q*%2]
+    mov         min_filter_count_x4q, min_filter_length_x4q
+    mov                     filter2q, filter1q
+    add                     filter2q, filter_alloc_x4q ; filter2 = the filter that follows filter1 in the bank
+%endif
+%ifidn %1, int16
+    mova                          m0, m4 ; both accumulators start at the rounding term
+    mova                          m2, m4
+%else ; float/double
+    xorps                         m0, m0, m0
+    xorps                         m2, m2, m2
+%endif
+
+    align 16
+.inner_loop: ; m0 accumulates src*filter1, m2 accumulates src*filter2
+    movu                          m1, [srcq+min_filter_count_x4q*1]
+%ifidn %1, int16
+%if cpuflag(xop)
+    vpmadcswd                     m2, m1, [filter2q+min_filter_count_x4q*1], m2
+    vpmadcswd                     m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
+    pmaddwd                       m3, m1, [filter2q+min_filter_count_x4q*1]
+    pmaddwd                       m1, [filter1q+min_filter_count_x4q*1]
+    paddd                         m2, m3
+    paddd                         m0, m1
+%endif ; cpuflag
+%else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m2, m1, [filter2q+min_filter_count_x4q*1], m2
+    fmaddp%4                      m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
+    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
+    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
+    addp%4                        m2, m2, m3
+    addp%4                        m0, m0, m1
+%endif ; cpuflag
+%endif
+    add         min_filter_count_x4q, mmsize
+    js .inner_loop ; until the negative byte offset reaches zero
+
+%ifidn %1, int16
+%if mmsize == 16
+%if cpuflag(xop)
+    vphadddq                      m2, m2
+    vphadddq                      m0, m0
+%endif
+    pshufd                        m3, m2, q0032
+    pshufd                        m1, m0, q0032
+    paddd                         m2, m3
+    paddd                         m0, m1
+%endif
+%if notcpuflag(xop)
+    PSHUFLW                       m3, m2, q0032
+    PSHUFLW                       m1, m0, q0032
+    paddd                         m2, m3
+    paddd                         m0, m1
+%endif
+    psubd                         m2, m0 ; m2 = v2 - val
+    ; This is probably a really bad idea on atom and other machines with a
+    ; long transfer latency between GPRs and XMMs. However, it does
+    ; make the clip a lot simpler...
+    movd                         eax, m2
+    add                       indexd, dst_incr_divd
+    imul                              fracd ; edx:eax = (v2 - val) * frac
+    idiv                              src_incrd ; eax = (v2 - val) * frac / src_incr
+    movd                          m1, eax
+    add                        fracd, dst_incr_modd
+    paddd                         m0, m1 ; val += interpolation term
+    psrad                         m0, 15
+    packssdw                      m0, m0 ; saturate to int16
+    movd                      [dstq], m0 ; writes a full dword although dst advances by one sample - NOTE(review): confirm dst has room
+
+    ; note that for imul/idiv, I need to move filter to edx/eax for each:
+    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
+    ; - win64: eax=r6[filter1], edx=r1[todo]
+    ; - unix64: eax=r6[filter1], edx=r2[todo]
+%else ; float/double
+    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
+%if mmsize == 32
+    vextractf128                 xm1, m0, 0x1
+    vextractf128                 xm3, m2, 0x1
+    addps                        xm0, xm1
+    addps                        xm2, xm3
+%endif
+    cvtsi2s%4                    xm1, fracd
+    subp%4                       xm2, xm0 ; v2 - val, still as per-lane partial sums
+    mulp%4                       xm1, xm4 ; frac * (1.0 / src_incr)
+    shufp%4                      xm1, xm1, q0000 ; broadcast the blend factor to all lanes
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                     xm0, xm2, xm1, xm0
+%else
+    mulp%4                       xm2, xm1
+    addp%4                       xm0, xm2
+%endif ; cpuflag
+
+    ; horizontal sum & store
+    movhlps                      xm1, xm0
+%ifidn %1, float
+    addps                        xm0, xm1
+    shufps                       xm1, xm0, xm0, q0001
+%endif
+    add                        fracd, dst_incr_modd
+    addp%4                       xm0, xm1
+    add                       indexd, dst_incr_divd
+    movs%4                    [dstq], xm0 ; store one scalar result
+%endif
+    cmp                        fracd, src_incrd
+    jl .skip
+    sub                        fracd, src_incrd ; frac wrapped: carry one step into index
+    inc                       indexd
+
+%if UNIX64
+    DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, src, dst_end, filter_bank
+%elif WIN64
+    DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, dst, dst_end, filter_bank
+%else ; x86-32
+    DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
+%endif
+
+.skip: ; index_incr reuses a register that is dead here (filter1 on x86-64, filter2 on x86-32)
+%if ARCH_X86_32
+    mov                 phase_shiftd, phase_shift_stackd ; x86-32 reloads the shift count into the old counter register
+%endif
+    mov                  index_incrd, indexd
+    add                         dstq, %2 ; next output sample
+    and                       indexd, phase_mask_stackd ; keep only the masked part of index
+    sar                  index_incrd, phase_shiftb ; the part above phase_shift = source samples to advance
+    lea                         srcq, [srcq+index_incrq*%2]
+    cmp                         dstq, dst_endq
+    jne .loop
+
+%if UNIX64
+    DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, src, dst_end, filter_bank
+%elif WIN64
+    DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
+                dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
+                dst_incr_div, src_incr, dst, dst_end, filter_bank
+%else ; x86-32
+    DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
+%endif
+
+    cmp  dword update_context_stackd, 0
+    jz .skip_store
+    ; strictly speaking, the function should always return the consumed
+    ; number of samples; however, we only use the value if update_context
+    ; is true, so let's just leave it uninitialized otherwise
+    mov                         ctxq, ctx_stackq
+    movifnidn                    rax, srcq
+    mov [ctxq+ResampleContext.frac ], fracd
+    sub                          rax, src_stackq ; bytes advanced since entry ...
+    mov [ctxq+ResampleContext.index], indexd
+    shr                          rax, %3 ; ... converted to samples: the return value
+
+.skip_store:
+%if ARCH_X86_32
+    ADD                          rsp, 0x28 ; drop the ten dwords pushed in the prologue
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse
+RESAMPLE_FNS float, 4, 2, s, pf_1 ; arguments: format, bytes per sample, log2(bytes per sample), op suffix, 1.0 constant
+
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS float, 4, 2, s, pf_1 ; takes the fmaddp path of the inner loops
+%endif
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+RESAMPLE_FNS float, 4, 2, s, pf_1 ; takes the fmaddp path of the inner loops
+%endif
+
+%if ARCH_X86_32 ; the MMXEXT int16 version is only assembled for x86-32
+INIT_MMX mmxext
+RESAMPLE_FNS int16, 2, 1
+%endif
+
+INIT_XMM sse2
+RESAMPLE_FNS int16, 2, 1 ; int16 takes no op suffix and no 1.0 constant
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+RESAMPLE_FNS int16, 2, 1 ; takes the vpmadcswd path of the inner loops
+%endif
+
+INIT_XMM sse2
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
diff --git a/libswresample/x86/resample_init.c b/libswresample/x86/resample_init.c
new file mode 100644
index 0000000000..9d7d5cf89e
--- /dev/null
+++ b/libswresample/x86/resample_init.c
@@ -0,0 +1,90 @@
+/*
+ * audio resampling
+ * Copyright (c) 2004-2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * audio resampling
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "libavutil/x86/cpu.h"
+#include "libswresample/resample.h"
+/* RESAMPLE_FUNCS(type, opt) declares ff_resample_common_<type>_<opt> and ff_resample_linear_<type>_<opt>. */
+#define RESAMPLE_FUNCS(type, opt) \
+int ff_resample_common_##type##_##opt(ResampleContext *c, void *dst, \
+                                      const void *src, int sz, int upd); \
+int ff_resample_linear_##type##_##opt(ResampleContext *c, void *dst, \
+                                      const void *src, int sz, int upd)
+/* Declarations only; the definitions presumably live in x86/resample.asm - TODO confirm. */
+RESAMPLE_FUNCS(int16,  mmxext); /* used below only under ARCH_X86_32 */
+RESAMPLE_FUNCS(int16,  sse2);
+RESAMPLE_FUNCS(int16,  xop);
+RESAMPLE_FUNCS(float,  sse);
+RESAMPLE_FUNCS(float,  avx);
+RESAMPLE_FUNCS(float,  fma3);
+RESAMPLE_FUNCS(float,  fma4);
+RESAMPLE_FUNCS(double, sse2);
+
+av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
+{
+    int av_unused mm_flags = av_get_cpu_flags();
+
+    switch(c->format){
+    case AV_SAMPLE_FMT_S16P:
+        if (ARCH_X86_32 && EXTERNAL_MMXEXT(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_int16_mmxext
+                                        : ff_resample_common_int16_mmxext;
+        }
+        if (EXTERNAL_SSE2(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_int16_sse2
+                                        : ff_resample_common_int16_sse2;
+        }
+        if (EXTERNAL_XOP(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_int16_xop
+                                        : ff_resample_common_int16_xop;
+        }
+        break;
+    case AV_SAMPLE_FMT_FLTP:
+        if (EXTERNAL_SSE(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_float_sse
+                                        : ff_resample_common_float_sse;
+        }
+        if (EXTERNAL_AVX_FAST(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_float_avx
+                                        : ff_resample_common_float_avx;
+        }
+        if (EXTERNAL_FMA3_FAST(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_float_fma3
+                                        : ff_resample_common_float_fma3;
+        }
+        if (EXTERNAL_FMA4(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_float_fma4
+                                        : ff_resample_common_float_fma4;
+        }
+        break;
+    case AV_SAMPLE_FMT_DBLP:
+        if (EXTERNAL_SSE2(mm_flags)) {
+            c->dsp.resample = c->linear ? ff_resample_linear_double_sse2
+                                        : ff_resample_common_double_sse2;
+        }
+        break;
+    }
+}
diff --git a/libswresample/x86/w64xmmtest.c b/libswresample/x86/w64xmmtest.c
new file mode 100644
index 0000000000..9cddb4a858
--- /dev/null
+++ b/libswresample/x86/w64xmmtest.c
@@ -0,0 +1,29 @@
+/*
+ * check XMM registers for clobbers on Win64
+ * Copyright (c) 2013 Martin Storsjo
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libswresample/swresample.h"
+#include "libavutil/x86/w64xmmtest.h"
+/* Clobber-test wrapper for swr_convert(); this file is built only when CONFIG_XMM_CLOBBER_TEST is set. */
+wrap(swr_convert(struct SwrContext *s, uint8_t **out, int out_count,
+                 const uint8_t **in , int in_count))
+{
+    testxmmclobbers(swr_convert, s, out, out_count, in, in_count); /* all five args forwarded in order */
+}