summaryrefslogtreecommitdiff
path: root/libavutil/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libavutil/x86')
-rw-r--r--libavutil/x86/Makefile4
-rw-r--r--libavutil/x86/asm.h51
-rw-r--r--libavutil/x86/bswap.h8
-rw-r--r--libavutil/x86/cpu.c12
-rw-r--r--libavutil/x86/cpu.h8
-rw-r--r--libavutil/x86/cpuid.asm8
-rw-r--r--libavutil/x86/emms.asm8
-rw-r--r--libavutil/x86/emms.h12
-rw-r--r--libavutil/x86/float_dsp.asm153
-rw-r--r--libavutil/x86/float_dsp_init.c97
-rw-r--r--libavutil/x86/intreadwrite.h8
-rw-r--r--libavutil/x86/lls.asm11
-rw-r--r--libavutil/x86/lls_init.c14
-rw-r--r--libavutil/x86/pixelutils.asm165
-rw-r--r--libavutil/x86/pixelutils.h26
-rw-r--r--libavutil/x86/pixelutils_init.c58
-rw-r--r--libavutil/x86/timer.h9
-rw-r--r--libavutil/x86/w64xmmtest.h8
-rw-r--r--libavutil/x86/x86inc.asm23
-rw-r--r--libavutil/x86/x86util.asm109
20 files changed, 608 insertions, 184 deletions
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 1e19082233..ad3bdfc29d 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -2,7 +2,11 @@ OBJS += x86/cpu.o \
x86/float_dsp_init.o \
x86/lls_init.o \
+OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \
+
YASM-OBJS += x86/cpuid.o \
x86/emms.o \
x86/float_dsp.o \
x86/lls.o \
+
+YASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index e30f5dbaf7..616ad6c96f 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -1,20 +1,20 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,6 +25,7 @@
#include "config.h"
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
+typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
#if ARCH_X86_64
# define OPSIZE "q"
@@ -107,6 +108,46 @@ typedef int x86_reg;
# define LOCAL_MANGLE(a) #a
#endif
-#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
+# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+# define NAMED_CONSTRAINTS_ADD(...)
+# define NAMED_CONSTRAINTS(...)
+# define NAMED_CONSTRAINTS_ARRAY_ADD(...)
+# define NAMED_CONSTRAINTS_ARRAY(...)
+#else
+ /* When direct symbol references are used in code passed to a compiler that does not support them
+ * then these references need to be converted to named asm constraints instead.
+ * Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol.
+ * In order for this to work there must also be a corresponding entry in the asm-interface. To add this
+ * entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the
+ * corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ).
+ * If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list.
+ */
+# define MANGLE(a) "%["#a"]"
+ // Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work
+# define FE_0(P,X) P(X)
+# define FE_1(P,X,X1) P(X), FE_0(P,X1)
+# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
+# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
+# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
+# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
+# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
+# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
+# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
+# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
+# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
+# define GET_FE(A) GET_FE_IMPL A
+# define GET_FE_GLUE(x, y) x y
+# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__))
+# define NAME_CONSTRAINT(x) [x] "m"(x)
+ // Parameters are a list of each symbol reference required
+# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
+ // Same but without comma for when there are no previously defined constraints
+# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
+ // Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables
+# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x)
+# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
+# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
+#endif
#endif /* AVUTIL_X86_ASM_H */
diff --git a/libavutil/x86/bswap.h b/libavutil/x86/bswap.h
index c73be9af81..08e2a62520 100644
--- a/libavutil/x86/bswap.h
+++ b/libavutil/x86/bswap.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 8be6d94742..2b62e92479 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -3,20 +3,20 @@
* (c)1997-99 by H. Dietz and R. Fisher
* Converted to C and improved by Fabrice Bellard.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -45,7 +45,7 @@
"cpuid \n\t" \
"xchg %%"REG_b", %%"REG_S \
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \
- : "0" (index))
+ : "0" (index), "2"(0))
#define xgetbv(index, eax, edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
@@ -143,7 +143,7 @@ int ff_get_cpu_flags_x86(void)
if (max_std_level >= 7) {
cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
- if (ebx & 0x00000020)
+ if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
rval |= AV_CPU_FLAG_AVX2;
#endif /* HAVE_AVX2 */
/* BMI1/2 don't need OS support */
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 50da30e389..bc64b1b3bd 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/cpuid.asm b/libavutil/x86/cpuid.asm
index 1cb8e94ea3..c3f7866ec7 100644
--- a/libavutil/x86/cpuid.asm
+++ b/libavutil/x86/cpuid.asm
@@ -4,20 +4,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/emms.asm b/libavutil/x86/emms.asm
index a6851acc99..0aad34af3f 100644
--- a/libavutil/x86/emms.asm
+++ b/libavutil/x86/emms.asm
@@ -1,20 +1,20 @@
;*****************************************************************************
;* Copyright (C) 2013 Martin Storsjo
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/emms.h b/libavutil/x86/emms.h
index 2ed9e5d09d..a529b6bbbe 100644
--- a/libavutil/x86/emms.h
+++ b/libavutil/x86/emms.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,7 @@
#include "config.h"
#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
void avpriv_emms_yasm(void);
@@ -33,7 +34,8 @@ void avpriv_emms_yasm(void);
*/
static av_always_inline void emms_c(void)
{
- __asm__ volatile ("emms" ::: "memory");
+ if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
+ __asm__ volatile ("emms" ::: "memory");
}
#elif HAVE_MMX && HAVE_MM_EMPTY
# include <mmintrin.h>
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index d96249978a..ec3d22b230 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -1,20 +1,22 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
-;* This file is part of Libav.
+;* Copyright 2006 Loren Merritt
;*
-;* Libav is free software; you can redistribute it and/or
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -48,8 +50,10 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
@@ -57,33 +61,48 @@ VECTOR_FMUL
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
-cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
-cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
- mova xmm0, xmm2
+ SWAP 0, 2
%endif
- shufps xmm0, xmm0, 0
+ shufps xm0, xm0, 0
%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
+ vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*4-64]
.loop:
-%assign a 0
-%rep 32/mmsize
- mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
- mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
- addps m1, m1, [dstq+lenq+(a+0)*mmsize]
- addps m2, m2, [dstq+lenq+(a+1)*mmsize]
- mova [dstq+lenq+(a+0)*mmsize], m1
- mova [dstq+lenq+(a+1)*mmsize], m2
-%assign a a+2
-%endrep
+%if cpuflag(fma3)
+ mova m1, [dstq+lenq]
+ mova m2, [dstq+lenq+1*mmsize]
+ fmaddps m1, m0, [srcq+lenq], m1
+ fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
+%else ; cpuflag
+ mulps m1, m0, [srcq+lenq]
+ mulps m2, m0, [srcq+lenq+1*mmsize]
+%if mmsize < 32
+ mulps m3, m0, [srcq+lenq+2*mmsize]
+ mulps m4, m0, [srcq+lenq+3*mmsize]
+%endif ; mmsize
+ addps m1, m1, [dstq+lenq]
+ addps m2, m2, [dstq+lenq+1*mmsize]
+%if mmsize < 32
+ addps m3, m3, [dstq+lenq+2*mmsize]
+ addps m4, m4, [dstq+lenq+3*mmsize]
+%endif ; mmsize
+%endif ; cpuflag
+ mova [dstq+lenq], m1
+ mova [dstq+lenq+1*mmsize], m2
+%if mmsize < 32
+ mova [dstq+lenq+2*mmsize], m3
+ mova [dstq+lenq+3*mmsize], m4
+%endif ; mmsize
sub lenq, 64
jge .loop
REP_RET
@@ -91,8 +110,14 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
INIT_XMM sse
VECTOR_FMAC_SCALAR
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
@@ -141,16 +166,11 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
VBROADCASTSD m0, mulm
%else
%if WIN64
- movlhps xmm2, xmm2
-%if cpuflag(avx)
- vinsertf128 ymm2, ymm2, xmm2, 1
-%endif
SWAP 0, 2
-%else
- movlhps xmm0, xmm0
-%if cpuflag(avx)
- vinsertf128 ymm0, ymm0, xmm0, 1
%endif
+ movlhps xm0, xm0
+%if cpuflag(avx)
+ vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
lea lenq, [lend*8-2*mmsize]
@@ -172,20 +192,85 @@ VECTOR_DMUL_SCALAR
%endif
;-----------------------------------------------------------------------------
+; vector_fmul_window(float *dst, const float *src0,
+; const float *src1, const float *win, int len);
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL_WINDOW 0
+cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
+ shl lend, 2
+ lea len1q, [lenq - mmsize]
+ add src0q, lenq
+ add dstq, lenq
+ add winq, lenq
+ neg lenq
+.loop
+ mova m0, [winq + lenq]
+ mova m4, [src0q + lenq]
+%if cpuflag(sse)
+ mova m1, [winq + len1q]
+ mova m5, [src1q + len1q]
+ shufps m1, m1, 0x1b
+ shufps m5, m5, 0x1b
+ mova m2, m0
+ mova m3, m1
+ mulps m2, m4
+ mulps m3, m5
+ mulps m1, m4
+ mulps m0, m5
+ addps m2, m3
+ subps m1, m0
+ shufps m2, m2, 0x1b
+%else
+ pswapd m1, [winq + len1q]
+ pswapd m5, [src1q + len1q]
+ mova m2, m0
+ mova m3, m1
+ pfmul m2, m4
+ pfmul m3, m5
+ pfmul m1, m4
+ pfmul m0, m5
+ pfadd m2, m3
+ pfsub m1, m0
+ pswapd m2, m2
+%endif
+ mova [dstq + lenq], m1
+ mova [dstq + len1q], m2
+ sub len1q, mmsize
+ add lenq, mmsize
+ jl .loop
+%if mmsize == 8
+ femms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnowext
+VECTOR_FMUL_WINDOW
+INIT_XMM sse
+VECTOR_FMUL_WINDOW
+
+;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+ mova m2, [src2q + lenq]
+ mova m3, [src2q + lenq + mmsize]
+ fmaddps m0, m0, [src1q + lenq], m2
+ fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
+%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
@@ -196,8 +281,14 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL_ADD
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
@@ -233,8 +324,10 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL_REVERSE
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
+%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
@@ -272,8 +365,8 @@ cglobal butterflies_float, 3,3,3, src0, src1, len
test lenq, lenq
jz .end
shl lenq, 2
- lea src0q, [src0q + lenq]
- lea src1q, [src1q + lenq]
+ add src0q, lenq
+ add src1q, lenq
neg lenq
.loop:
mova m0, [src0q + lenq]
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index a04d91c923..64b3a4d6bd 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -33,6 +33,8 @@ void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
+void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
+ int len);
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
@@ -42,10 +44,17 @@ void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
double mul, int len);
+void ff_vector_fmul_window_3dnowext(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+void ff_vector_fmul_window_sse(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
const float *src2, int len);
+void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
+ const float *src2, int len);
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
@@ -56,88 +65,18 @@ float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void ff_butterflies_float_sse(float *src0, float *src1, int len);
-#if HAVE_6REGS && HAVE_INLINE_ASM
-static void vector_fmul_window_3dnowext(float *dst, const float *src0,
- const float *src1, const float *win,
- int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 8;
- __asm__ volatile (
- "1: \n"
- "pswapd (%5, %1), %%mm1 \n"
- "movq (%5, %0), %%mm0 \n"
- "pswapd (%4, %1), %%mm5 \n"
- "movq (%3, %0), %%mm4 \n"
- "movq %%mm0, %%mm2 \n"
- "movq %%mm1, %%mm3 \n"
- "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
- "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
- "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
- "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
- "pfadd %%mm3, %%mm2 \n"
- "pfsub %%mm0, %%mm1 \n"
- "pswapd %%mm2, %%mm2 \n"
- "movq %%mm1, (%2, %0) \n"
- "movq %%mm2, (%2, %1) \n"
- "sub $8, %1 \n"
- "add $8, %0 \n"
- "jl 1b \n"
- "femms \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-
-static void vector_fmul_window_sse(float *dst, const float *src0,
- const float *src1, const float *win, int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 16;
- __asm__ volatile (
- "1: \n"
- "movaps (%5, %1), %%xmm1 \n"
- "movaps (%5, %0), %%xmm0 \n"
- "movaps (%4, %1), %%xmm5 \n"
- "movaps (%3, %0), %%xmm4 \n"
- "shufps $0x1b, %%xmm1, %%xmm1 \n"
- "shufps $0x1b, %%xmm5, %%xmm5 \n"
- "movaps %%xmm0, %%xmm2 \n"
- "movaps %%xmm1, %%xmm3 \n"
- "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
- "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
- "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
- "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
- "addps %%xmm3, %%xmm2 \n"
- "subps %%xmm0, %%xmm1 \n"
- "shufps $0x1b, %%xmm2, %%xmm2 \n"
- "movaps %%xmm1, (%2, %0) \n"
- "movaps %%xmm2, (%2, %1) \n"
- "sub $16, %1 \n"
- "add $16, %0 \n"
- "jl 1b \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
-
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_6REGS && HAVE_INLINE_ASM
- if (INLINE_AMD3DNOWEXT(cpu_flags)) {
- fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
+ if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+ fdsp->vector_fmul_window = ff_vector_fmul_window_3dnowext;
}
- if (INLINE_SSE(cpu_flags)) {
- fdsp->vector_fmul_window = vector_fmul_window_sse;
- }
-#endif
if (EXTERNAL_SSE(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
+ fdsp->vector_fmul_window = ff_vector_fmul_window_sse;
fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
@@ -153,4 +92,8 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
+ if (EXTERNAL_FMA3(cpu_flags)) {
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
+ fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
+ }
}
diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
index 635096e569..4061d19231 100644
--- a/libavutil/x86/intreadwrite.h
+++ b/libavutil/x86/intreadwrite.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index eab85ed050..769befb769 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -3,20 +3,20 @@
;*
;* Copyright (c) 2013 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -125,6 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
.ret:
REP_RET
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
@@ -194,7 +195,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
jle .loop2x1
.ret:
REP_RET
-
+%endif
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 888bc54a39..f53190488a 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -3,29 +3,29 @@
*
* Copyright (c) 2013 Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/lls.h"
#include "libavutil/x86/cpu.h"
-void ff_update_lls_sse2(LLSModel *m, double *var);
-void ff_update_lls_avx(LLSModel *m, double *var);
-double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
+void ff_update_lls_sse2(LLSModel *m, const double *var);
+void ff_update_lls_avx(LLSModel *m, const double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
av_cold void ff_init_lls_x86(LLSModel *m)
{
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
new file mode 100644
index 0000000000..7522f24a42
--- /dev/null
+++ b/libavutil/x86/pixelutils.asm
@@ -0,0 +1,165 @@
+;******************************************************************************
+;* Pixel utilities SIMD
+;*
+;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (C) 2014 Clément Bœsch <u pkh me>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION_TEXT
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmx
+cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
+ pxor m7, m7
+ pxor m6, m6
+%rep 4
+ mova m0, [src1q]
+ mova m2, [src1q + stride1q]
+ mova m1, [src2q]
+ mova m3, [src2q + stride2q]
+ psubusb m4, m0, m1
+ psubusb m5, m2, m3
+ psubusb m1, m0
+ psubusb m3, m2
+ por m1, m4
+ por m3, m5
+ punpcklbw m0, m1, m7
+ punpcklbw m2, m3, m7
+ punpckhbw m1, m7
+ punpckhbw m3, m7
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ paddw m6, m0
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+%endrep
+ psrlq m0, m6, 32
+ paddw m6, m0
+ psrlq m0, m6, 16
+ paddw m6, m0
+ movd eax, m6
+ movzx eax, ax
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
+ pxor m2, m2
+%rep 4
+ mova m0, [src1q]
+ mova m1, [src1q + stride1q]
+ psadbw m0, [src2q]
+ psadbw m1, [src2q + stride2q]
+ paddw m2, m0
+ paddw m2, m1
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+%endrep
+ movd eax, m2
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
+ pxor m2, m2
+%rep 16
+ mova m0, [src1q]
+ mova m1, [src1q + 8]
+ psadbw m0, [src2q]
+ psadbw m1, [src2q + 8]
+ paddw m2, m0
+ paddw m2, m1
+ add src1q, stride1q
+ add src2q, stride2q
+%endrep
+ movd eax, m2
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_16x16_sse(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
+ movu m4, [src1q]
+ movu m2, [src2q]
+ movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m4, m2
+ psadbw m1, m3
+ paddw m4, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ movu m0, [src1q]
+ movu m2, [src2q]
+ movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m0, m2
+ psadbw m1, m3
+ paddw m4, m0
+ paddw m4, m1
+%endrep
+ movhlps m0, m4
+ paddw m4, m0
+ movd eax, m4
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_[au]_16x16_sse(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+%macro SAD_XMM_16x16 1
+INIT_XMM sse2
+cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
+ mov%1 m2, [src2q]
+ psadbw m2, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ mov%1 m0, [src2q]
+ psadbw m0, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m0
+ paddw m2, m1
+%endrep
+ movhlps m0, m2
+ paddw m2, m0
+ movd eax, m2
+ RET
+%endmacro
+
+SAD_XMM_16x16 a
+SAD_XMM_16x16 u
diff --git a/libavutil/x86/pixelutils.h b/libavutil/x86/pixelutils.h
new file mode 100644
index 0000000000..876cf46053
--- /dev/null
+++ b/libavutil/x86/pixelutils.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_PIXELUTILS_H
+#define AVUTIL_X86_PIXELUTILS_H
+
+#include "libavutil/pixelutils.h"
+
+void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned);
+
+#endif /* AVUTIL_X86_PIXELUTILS_H */
diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c
new file mode 100644
index 0000000000..d60051067a
--- /dev/null
+++ b/libavutil/x86/pixelutils_init.c
@@ -0,0 +1,58 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "pixelutils.h"
+#include "cpu.h"
+
+int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ sad[2] = ff_pixelutils_sad_8x8_mmx;
+ }
+
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ sad[2] = ff_pixelutils_sad_8x8_mmxext;
+ sad[3] = ff_pixelutils_sad_16x16_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ switch (aligned) {
+ case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned
+ case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned
+ case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned
+ }
+ }
+}
diff --git a/libavutil/x86/timer.h b/libavutil/x86/timer.h
index cdd67ddbb5..5b24b511c1 100644
--- a/libavutil/x86/timer.h
+++ b/libavutil/x86/timer.h
@@ -1,20 +1,20 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,6 +25,7 @@
#if HAVE_INLINE_ASM
+#define FF_TIMER_UNITS "decicycles"
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
diff --git a/libavutil/x86/w64xmmtest.h b/libavutil/x86/w64xmmtest.h
index b4ce7d3daf..9df499f7f6 100644
--- a/libavutil/x86/w64xmmtest.h
+++ b/libavutil/x86/w64xmmtest.h
@@ -2,20 +2,20 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index e3ed3813e0..8a13064bdc 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -856,6 +856,14 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
INIT_CPUFLAGS %1
%endmacro
+; FIXME: INIT_AVX can be replaced by INIT_XMM avx
+%macro INIT_AVX 0
+ INIT_XMM
+ %assign avx_enabled 1
+ %define PALIGNR PALIGNR_SSSE3
+ %define RESET_MM_PERMUTATION INIT_AVX
+%endmacro
+
%macro INIT_YMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_YMM %1
@@ -1402,21 +1410,6 @@ AVX_INSTR pfmul, 1, 0, 1
%undef i
%undef j
-%macro FMA_INSTR 3
- %macro %1 4-7 %1, %2, %3
- %if cpuflag(xop)
- v%5 %1, %2, %3, %4
- %else
- %6 %1, %2, %3
- %7 %1, %4
- %endif
- %endmacro
-%endmacro
-
-FMA_INSTR pmacsdd, pmulld, paddd
-FMA_INSTR pmacsww, pmullw, paddw
-FMA_INSTR pmadcswd, pmaddwd, paddd
-
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 11779cf07f..2d02f75069 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -6,20 +6,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -69,6 +69,15 @@
SWAP %2, %3
%endmacro
+%macro TRANSPOSE2x4x4B 5
+ SBUTTERFLY bw, %1, %2, %5
+ SBUTTERFLY bw, %3, %4, %5
+ SBUTTERFLY wd, %1, %3, %5
+ SBUTTERFLY wd, %2, %4, %5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+%endmacro
+
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -273,6 +282,44 @@
%endif
%endmacro
+%macro HADDD 2 ; sum junk
+%if sizeof%1 == 32
+%define %2 xmm%2
+ vextracti128 %2, %1, 1
+%define %1 xmm%1
+ paddd %1, %2
+%endif
+%if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+ vphadddq %1, %1
+%endif
+ movhlps %2, %1
+ paddd %1, %2
+%endif
+%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
+ PSHUFLW %2, %1, q0032
+%else ; mmx
+ mova %2, %1
+ psrlq %2, 32
+%endif
+ paddd %1, %2
+%endif
+%undef %1
+%undef %2
+%endmacro
+
+%macro HADDW 2 ; reg, tmp
+%if cpuflag(xop) && sizeof%1 == 16
+ vphaddwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
+ pmaddwd %1, [pw_1]
+ HADDD %1, %2
+%endif
+%endmacro
+
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
@@ -302,11 +349,19 @@
%endif
%endmacro
-%macro PAVGB 2
+%macro PAVGB 2-4
%if cpuflag(mmxext)
pavgb %1, %2
%elif cpuflag(3dnow)
pavgusb %1, %2
+%elif cpuflag(mmx)
+ movu %3, %2
+ por %3, %1
+ pxor %1, %2
+ pand %1, %4
+ psrlq %1, 1
+ psubb %3, %1
+ SWAP %1, %3
%endif
%endmacro
@@ -552,7 +607,9 @@
%endmacro
%macro SPLATW 2-3 0
-%if mmsize == 16
+%if cpuflag(avx2) && %3 == 0
+ vpbroadcastw %1, %2
+%elif mmsize == 16
pshuflw %1, %2, (%3)*0x55
punpcklqdq %1, %1
%elif cpuflag(mmxext)
@@ -584,6 +641,11 @@
%endif
%endmacro
+%macro CLIPUB 3 ;(dst, min, max)
+ pmaxub %1, %2
+ pminub %1, %3
+%endmacro
+
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3
@@ -666,6 +728,25 @@
%endif
%endmacro
+%macro PMA_EMU 4
+ %macro %1 5-8 %2, %3, %4
+ %if cpuflag(xop)
+ v%6 %1, %2, %3, %4
+ %elifidn %1, %4
+ %7 %5, %2, %3
+ %8 %1, %4, %5
+ %else
+ %7 %1, %2, %3
+ %8 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+PMA_EMU PMACSWW, pmacsww, pmullw, paddw
+PMA_EMU PMACSDD, pmacsdd, pmulld, paddd ; sse4 emulation
+PMA_EMU PMACSDQL, pmacsdql, pmuldq, paddq ; sse4 emulation
+PMA_EMU PMADCSWD, pmadcswd, pmaddwd, paddd
+
; Wrapper for non-FMA version of fmaddps
%macro FMULADD_PS 5
%if cpuflag(fma3) || cpuflag(fma4)
@@ -678,3 +759,19 @@
addps %1, %4
%endif
%endmacro
+
+%macro LSHIFT 2
+%if mmsize > 8
+ pslldq %1, %2
+%else
+ psllq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro RSHIFT 2
+%if mmsize > 8
+ psrldq %1, %2
+%else
+ psrlq %1, 8*(%2)
+%endif
+%endmacro