summaryrefslogtreecommitdiff
path: root/libavutil/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libavutil/x86')
-rw-r--r--libavutil/x86/Makefile16
-rw-r--r--libavutil/x86/asm.h55
-rw-r--r--libavutil/x86/bswap.h44
-rw-r--r--libavutil/x86/cpu.c60
-rw-r--r--libavutil/x86/cpu.h17
-rw-r--r--libavutil/x86/cpuid.asm8
-rw-r--r--libavutil/x86/emms.asm8
-rw-r--r--libavutil/x86/emms.h20
-rw-r--r--libavutil/x86/fixed_dsp.asm48
-rw-r--r--libavutil/x86/fixed_dsp_init.c35
-rw-r--r--libavutil/x86/float_dsp.asm286
-rw-r--r--libavutil/x86/float_dsp_init.c121
-rw-r--r--libavutil/x86/imgutils.asm8
-rw-r--r--libavutil/x86/imgutils_init.c8
-rw-r--r--libavutil/x86/intmath.h139
-rw-r--r--libavutil/x86/intreadwrite.h8
-rw-r--r--libavutil/x86/lls.asm66
-rw-r--r--libavutil/x86/lls_init.c18
-rw-r--r--libavutil/x86/pixelutils.asm386
-rw-r--r--libavutil/x86/pixelutils.h26
-rw-r--r--libavutil/x86/pixelutils_init.c94
-rw-r--r--libavutil/x86/timer.h9
-rw-r--r--libavutil/x86/w64xmmtest.h13
-rw-r--r--libavutil/x86/x86inc.asm354
-rw-r--r--libavutil/x86/x86util.asm357
25 files changed, 1875 insertions, 329 deletions
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index b619478610..5f5242b5bd 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,10 +1,18 @@
OBJS += x86/cpu.o \
+ x86/fixed_dsp_init.o \
x86/float_dsp_init.o \
x86/imgutils_init.o \
x86/lls_init.o \
+OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \
+
+EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o
+
X86ASM-OBJS += x86/cpuid.o \
- x86/emms.o \
- x86/float_dsp.o \
- x86/imgutils.o \
- x86/lls.o \
+ $(EMMS_OBJS__yes_) \
+ x86/fixed_dsp.o \
+ x86/float_dsp.o \
+ x86/imgutils.o \
+ x86/lls.o \
+
+X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index f005a3a38a..9bff42d628 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -1,20 +1,20 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -38,7 +38,8 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
# define FF_PTR_SIZE "8"
typedef int64_t x86_reg;
-# define FF_REG_SP "rsp"
+/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
+# define FF_REG_sp "rsp"
# define FF_REG_BP "rbp"
# define FF_REGBP rbp
# define FF_REGa rax
@@ -59,7 +60,7 @@ typedef int64_t x86_reg;
# define FF_PTR_SIZE "4"
typedef int32_t x86_reg;
-# define FF_REG_SP "esp"
+# define FF_REG_sp "esp"
# define FF_REG_BP "ebp"
# define FF_REGBP ebp
# define FF_REGa eax
@@ -108,6 +109,46 @@ typedef int x86_reg;
# define LOCAL_MANGLE(a) #a
#endif
-#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
+# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
+# define NAMED_CONSTRAINTS_ADD(...)
+# define NAMED_CONSTRAINTS(...)
+# define NAMED_CONSTRAINTS_ARRAY_ADD(...)
+# define NAMED_CONSTRAINTS_ARRAY(...)
+#else
+ /* When direct symbol references are used in code passed to a compiler that does not support them
+ * then these references need to be converted to named asm constraints instead.
+ * Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol.
+ * In order for this to work there must also be a corresponding entry in the asm-interface. To add this
+ * entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the
+ * corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ).
+ * If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list.
+ */
+# define MANGLE(a) "%["#a"]"
+ // Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work
+# define FE_0(P,X) P(X)
+# define FE_1(P,X,X1) P(X), FE_0(P,X1)
+# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
+# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
+# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
+# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
+# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
+# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
+# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
+# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
+# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
+# define GET_FE(A) GET_FE_IMPL A
+# define GET_FE_GLUE(x, y) x y
+# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__))
+# define NAME_CONSTRAINT(x) [x] "m"(x)
+ // Parameters are a list of each symbol reference required
+# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
+ // Same but without comma for when there are no previously defined constraints
+# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
+ // Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables
+# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x)
+# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
+# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__)
+#endif
#endif /* AVUTIL_X86_ASM_H */
diff --git a/libavutil/x86/bswap.h b/libavutil/x86/bswap.h
index c73be9af81..ffa59e4c82 100644
--- a/libavutil/x86/bswap.h
+++ b/libavutil/x86/bswap.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,21 +25,47 @@
#define AVUTIL_X86_BSWAP_H
#include <stdint.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
#include "config.h"
#include "libavutil/attributes.h"
-#if HAVE_INLINE_ASM
+#if defined(_MSC_VER)
+
+#define av_bswap16 av_bswap16
+static av_always_inline av_const uint16_t av_bswap16(uint16_t x)
+{
+ return _rotr16(x, 8);
+}
+
+#define av_bswap32 av_bswap32
+static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
+{
+ return _byteswap_ulong(x);
+}
+
+#if ARCH_X86_64
+#define av_bswap64 av_bswap64
+static inline uint64_t av_const av_bswap64(uint64_t x)
+{
+ return _byteswap_uint64(x);
+}
+#endif
+
+
+#elif HAVE_INLINE_ASM
-#if !AV_GCC_VERSION_AT_LEAST(4,1)
+#if AV_GCC_VERSION_AT_MOST(4,0)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
__asm__("rorw $8, %w0" : "+r"(x));
return x;
}
-#endif /* !AV_GCC_VERSION_AT_LEAST(4,1) */
+#endif /* AV_GCC_VERSION_AT_MOST(4,0) */
-#if !AV_GCC_VERSION_AT_LEAST(4,5)
+#if AV_GCC_VERSION_AT_MOST(4,4) || defined(__INTEL_COMPILER)
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
@@ -55,7 +81,7 @@ static inline uint64_t av_const av_bswap64(uint64_t x)
return x;
}
#endif
-#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */
+#endif /* AV_GCC_VERSION_AT_MOST(4,4) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index fa50eeb9dd..bcd41a50a2 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -3,20 +3,20 @@
* (c)1997-99 by H. Dietz and R. Fisher
* Converted to C and improved by Fabrice Bellard.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -42,10 +42,10 @@
#define cpuid(index, eax, ebx, ecx, edx) \
__asm__ volatile ( \
"mov %%"FF_REG_b", %%"FF_REG_S" \n\t" \
- "cpuid \n\t" \
+ "cpuid \n\t" \
"xchg %%"FF_REG_b", %%"FF_REG_S \
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \
- : "0" (index))
+ : "0" (index), "2"(0))
#define xgetbv(index, eax, edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
@@ -97,6 +97,7 @@ int ff_get_cpu_flags_x86(void)
int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
int family = 0, model = 0;
union { int i[3]; char c[12]; } vendor;
+ int xcr0_lo = 0, xcr0_hi = 0;
if (!cpuid_test())
return 0; /* CPUID not supported */
@@ -126,12 +127,14 @@ int ff_get_cpu_flags_x86(void)
rval |= AV_CPU_FLAG_SSE4;
if (ecx & 0x00100000 )
rval |= AV_CPU_FLAG_SSE42;
+ if (ecx & 0x02000000 )
+ rval |= AV_CPU_FLAG_AESNI;
#if HAVE_AVX
/* Check OXSAVE and AVX bits */
if ((ecx & 0x18000000) == 0x18000000) {
/* Check for OS support */
- xgetbv(0, eax, edx);
- if ((eax & 0x6) == 0x6) {
+ xgetbv(0, xcr0_lo, xcr0_hi);
+ if ((xcr0_lo & 0x6) == 0x6) {
rval |= AV_CPU_FLAG_AVX;
if (ecx & 0x00001000)
rval |= AV_CPU_FLAG_FMA3;
@@ -143,8 +146,15 @@ int ff_get_cpu_flags_x86(void)
if (max_std_level >= 7) {
cpuid(7, eax, ebx, ecx, edx);
#if HAVE_AVX2
- if (ebx & 0x00000020)
+ if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
rval |= AV_CPU_FLAG_AVX2;
+#if HAVE_AVX512 /* F, CD, BW, DQ, VL */
+ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
+ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
+ rval |= AV_CPU_FLAG_AVX512;
+
+ }
+#endif /* HAVE_AVX512 */
#endif /* HAVE_AVX2 */
/* BMI1/2 don't need OS support */
if (ebx & 0x00000008) {
@@ -180,13 +190,11 @@ int ff_get_cpu_flags_x86(void)
/* Similar to the above but for AVX functions on AMD processors.
This is necessary only for functions using YMM registers on Bulldozer
- based CPUs as they lack 256-bit execution units. SSE/AVX functions
- using XMM registers are always faster on them.
+ and Jaguar based CPUs as they lack 256-bit execution units. SSE/AVX
+ functions using XMM registers are always faster on them.
AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
- used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW.
- TODO: Confirm if Excavator is affected or not by this once it's
- released, and update the check if necessary. Same for btver2. */
- if (family == 0x15 && (rval & AV_CPU_FLAG_AVX))
+ used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
+ if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
rval |= AV_CPU_FLAG_AVXSLOW;
}
@@ -238,9 +246,27 @@ size_t ff_get_cpu_max_align_x86(void)
{
int flags = av_get_cpu_flags();
- if (flags & AV_CPU_FLAG_AVX)
+ if (flags & AV_CPU_FLAG_AVX512)
+ return 64;
+ if (flags & (AV_CPU_FLAG_AVX2 |
+ AV_CPU_FLAG_AVX |
+ AV_CPU_FLAG_XOP |
+ AV_CPU_FLAG_FMA4 |
+ AV_CPU_FLAG_FMA3 |
+ AV_CPU_FLAG_AVXSLOW))
return 32;
- if (flags & AV_CPU_FLAG_SSE)
+ if (flags & (AV_CPU_FLAG_AESNI |
+ AV_CPU_FLAG_SSE42 |
+ AV_CPU_FLAG_SSE4 |
+ AV_CPU_FLAG_SSSE3 |
+ AV_CPU_FLAG_SSE3 |
+ AV_CPU_FLAG_SSE2 |
+ AV_CPU_FLAG_SSE |
+ AV_CPU_FLAG_ATOM |
+ AV_CPU_FLAG_SSSE3SLOW |
+ AV_CPU_FLAG_SSE3SLOW |
+ AV_CPU_FLAG_SSE2SLOW))
return 16;
+
return 8;
}
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 6373e57db2..937c697fa0 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -48,6 +48,8 @@
#define X86_FMA3(flags) CPUEXT(flags, FMA3)
#define X86_FMA4(flags) CPUEXT(flags, FMA4)
#define X86_AVX2(flags) CPUEXT(flags, AVX2)
+#define X86_AESNI(flags) CPUEXT(flags, AESNI)
+#define X86_AVX512(flags) CPUEXT(flags, AVX512)
#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT)
@@ -70,8 +72,14 @@
#define EXTERNAL_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, AVX)
#define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP)
#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3)
+#define EXTERNAL_FMA3_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, FMA3, AVX)
+#define EXTERNAL_FMA3_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, FMA3, AVX)
#define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4)
#define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
+#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX)
+#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
+#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
+#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
@@ -96,6 +104,7 @@
#define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3)
#define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4)
#define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2)
+#define INLINE_AESNI(flags) CPUEXT_SUFFIX(flags, _INLINE, AESNI)
void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);
void ff_cpu_xgetbv(int op, int *eax, int *edx);
diff --git a/libavutil/x86/cpuid.asm b/libavutil/x86/cpuid.asm
index 1cb8e94ea3..c3f7866ec7 100644
--- a/libavutil/x86/cpuid.asm
+++ b/libavutil/x86/cpuid.asm
@@ -4,20 +4,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/emms.asm b/libavutil/x86/emms.asm
index 05557de5da..8611762d73 100644
--- a/libavutil/x86/emms.asm
+++ b/libavutil/x86/emms.asm
@@ -1,20 +1,20 @@
;*****************************************************************************
;* Copyright (C) 2013 Martin Storsjo
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/emms.h b/libavutil/x86/emms.h
index 4b3ebbe7a4..c21e34b451 100644
--- a/libavutil/x86/emms.h
+++ b/libavutil/x86/emms.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,7 @@
#include "config.h"
#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
void avpriv_emms_asm(void);
@@ -30,10 +31,19 @@ void avpriv_emms_asm(void);
* Empty mmx state.
* this must be called between any dsp function and float/double code.
* for example sin(); dsp->idct_put(); emms_c(); cos()
+ * Note, *alloc() and *free() also use float code in some libc implementations
+ * thus this also applies to them or any function using them.
*/
static av_always_inline void emms_c(void)
{
- __asm__ volatile ("emms" ::: "memory");
+/* Some inlined functions may also use mmx instructions regardless of
+ * runtime cpuflags. With that in mind, we unconditionally empty the
+ * mmx state if the target cpu chosen at configure time supports it.
+ */
+#if !defined(__MMX__)
+ if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
+#endif
+ __asm__ volatile ("emms" ::: "memory");
}
#elif HAVE_MMX && HAVE_MM_EMPTY
# include <mmintrin.h>
diff --git a/libavutil/x86/fixed_dsp.asm b/libavutil/x86/fixed_dsp.asm
new file mode 100644
index 0000000000..979dd5c334
--- /dev/null
+++ b/libavutil/x86/fixed_dsp.asm
@@ -0,0 +1,48 @@
+;*****************************************************************************
+;* x86-optimized Float DSP functions
+;*
+;* Copyright 2016 James Almer
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void ff_butterflies_fixed(float *src0, float *src1, int len);
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal butterflies_fixed, 3,3,3, src0, src1, len
+ shl lend, 2
+ add src0q, lenq
+ add src1q, lenq
+ neg lenq
+
+align 16
+.loop:
+ mova m0, [src0q + lenq]
+ mova m1, [src1q + lenq]
+ mova m2, m0
+ paddd m0, m1
+ psubd m2, m1
+ mova [src0q + lenq], m0
+ mova [src1q + lenq], m2
+ add lenq, mmsize
+ jl .loop
+ RET
diff --git a/libavutil/x86/fixed_dsp_init.c b/libavutil/x86/fixed_dsp_init.c
new file mode 100644
index 0000000000..303a2eb922
--- /dev/null
+++ b/libavutil/x86/fixed_dsp_init.c
@@ -0,0 +1,35 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/fixed_dsp.h"
+#include "cpu.h"
+
+void ff_butterflies_fixed_sse2(int *src0, int *src1, int len);
+
+av_cold void ff_fixed_dsp_init_x86(AVFixedDSPContext *fdsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ fdsp->butterflies_fixed = ff_butterflies_fixed_sse2;
+ }
+}
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index d96249978a..517fd63638 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -1,25 +1,30 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
-;* This file is part of Libav.
+;* Copyright 2006 Loren Merritt
;*
-;* Libav is free software; you can redistribute it and/or
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
+SECTION_RODATA 32
+pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -48,8 +53,43 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
+%endif
+
+;-----------------------------------------------------------------------------
+; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
+;-----------------------------------------------------------------------------
+%macro VECTOR_DMUL 0
+cglobal vector_dmul, 4,4,4, dst, src0, src1, len
+ lea lend, [lenq*8 - mmsize*4]
+ALIGN 16
+.loop:
+ movaps m0, [src0q + lenq + 0*mmsize]
+ movaps m1, [src0q + lenq + 1*mmsize]
+ movaps m2, [src0q + lenq + 2*mmsize]
+ movaps m3, [src0q + lenq + 3*mmsize]
+ mulpd m0, m0, [src1q + lenq + 0*mmsize]
+ mulpd m1, m1, [src1q + lenq + 1*mmsize]
+ mulpd m2, m2, [src1q + lenq + 2*mmsize]
+ mulpd m3, m3, [src1q + lenq + 3*mmsize]
+ movaps [dstq + lenq + 0*mmsize], m0
+ movaps [dstq + lenq + 1*mmsize], m1
+ movaps [dstq + lenq + 2*mmsize], m2
+ movaps [dstq + lenq + 3*mmsize], m3
+
+ sub lenq, mmsize*4
+ jge .loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+VECTOR_DMUL
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+VECTOR_DMUL
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
@@ -57,33 +97,48 @@ VECTOR_FMUL
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
-cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
-cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
- mova xmm0, xmm2
+ SWAP 0, 2
%endif
- shufps xmm0, xmm0, 0
+ shufps xm0, xm0, 0
%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
+ vinsertf128 m0, m0, xm0, 1
%endif
%endif
lea lenq, [lend*4-64]
.loop:
-%assign a 0
-%rep 32/mmsize
- mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
- mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
- addps m1, m1, [dstq+lenq+(a+0)*mmsize]
- addps m2, m2, [dstq+lenq+(a+1)*mmsize]
- mova [dstq+lenq+(a+0)*mmsize], m1
- mova [dstq+lenq+(a+1)*mmsize], m2
-%assign a a+2
-%endrep
+%if cpuflag(fma3)
+ mova m1, [dstq+lenq]
+ mova m2, [dstq+lenq+1*mmsize]
+ fmaddps m1, m0, [srcq+lenq], m1
+ fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
+%else ; cpuflag
+ mulps m1, m0, [srcq+lenq]
+ mulps m2, m0, [srcq+lenq+1*mmsize]
+%if mmsize < 32
+ mulps m3, m0, [srcq+lenq+2*mmsize]
+ mulps m4, m0, [srcq+lenq+3*mmsize]
+%endif ; mmsize
+ addps m1, m1, [dstq+lenq]
+ addps m2, m2, [dstq+lenq+1*mmsize]
+%if mmsize < 32
+ addps m3, m3, [dstq+lenq+2*mmsize]
+ addps m4, m4, [dstq+lenq+3*mmsize]
+%endif ; mmsize
+%endif ; cpuflag
+ mova [dstq+lenq], m1
+ mova [dstq+lenq+1*mmsize], m2
+%if mmsize < 32
+ mova [dstq+lenq+2*mmsize], m3
+ mova [dstq+lenq+3*mmsize], m4
+%endif ; mmsize
sub lenq, 64
jge .loop
REP_RET
@@ -91,8 +146,14 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
INIT_XMM sse
VECTOR_FMAC_SCALAR
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
@@ -124,6 +185,69 @@ INIT_XMM sse
VECTOR_FMUL_SCALAR
;------------------------------------------------------------------------------
+; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
+; int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_DMAC_SCALAR 0
+%if ARCH_X86_32
+cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
+ mov lenq, lenaddrm
+ VBROADCASTSD m0, mulm
+%else
+%if UNIX64
+cglobal vector_dmac_scalar, 3,3,5, dst, src, len
+%else
+cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
+ SWAP 0, 2
+%endif
+ movlhps xm0, xm0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xm0, 1
+%endif
+%endif
+ lea lenq, [lend*8-mmsize*4]
+.loop:
+%if cpuflag(fma3)
+ movaps m1, [dstq+lenq]
+ movaps m2, [dstq+lenq+1*mmsize]
+ movaps m3, [dstq+lenq+2*mmsize]
+ movaps m4, [dstq+lenq+3*mmsize]
+ fmaddpd m1, m0, [srcq+lenq], m1
+ fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2
+ fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3
+ fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4
+%else ; cpuflag
+ mulpd m1, m0, [srcq+lenq]
+ mulpd m2, m0, [srcq+lenq+1*mmsize]
+ mulpd m3, m0, [srcq+lenq+2*mmsize]
+ mulpd m4, m0, [srcq+lenq+3*mmsize]
+ addpd m1, m1, [dstq+lenq]
+ addpd m2, m2, [dstq+lenq+1*mmsize]
+ addpd m3, m3, [dstq+lenq+2*mmsize]
+ addpd m4, m4, [dstq+lenq+3*mmsize]
+%endif ; cpuflag
+ movaps [dstq+lenq], m1
+ movaps [dstq+lenq+1*mmsize], m2
+ movaps [dstq+lenq+2*mmsize], m3
+ movaps [dstq+lenq+3*mmsize], m4
+ sub lenq, mmsize*4
+ jge .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+VECTOR_DMAC_SCALAR
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+VECTOR_DMAC_SCALAR
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_DMAC_SCALAR
+%endif
+
+;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
@@ -141,24 +265,19 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
VBROADCASTSD m0, mulm
%else
%if WIN64
- movlhps xmm2, xmm2
-%if cpuflag(avx)
- vinsertf128 ymm2, ymm2, xmm2, 1
-%endif
SWAP 0, 2
-%else
- movlhps xmm0, xmm0
-%if cpuflag(avx)
- vinsertf128 ymm0, ymm0, xmm0, 1
%endif
+ movlhps xm0, xm0
+%if cpuflag(avx)
+ vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
lea lenq, [lend*8-2*mmsize]
.loop:
mulpd m1, m0, [srcq+lenq ]
mulpd m2, m0, [srcq+lenq+mmsize]
- mova [dstq+lenq ], m1
- mova [dstq+lenq+mmsize], m2
+ movaps [dstq+lenq ], m1
+ movaps [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
REP_RET
@@ -172,20 +291,85 @@ VECTOR_DMUL_SCALAR
%endif
;-----------------------------------------------------------------------------
+; vector_fmul_window(float *dst, const float *src0,
+; const float *src1, const float *win, int len);
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL_WINDOW 0
+cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
+ shl lend, 2
+ lea len1q, [lenq - mmsize]
+ add src0q, lenq
+ add dstq, lenq
+ add winq, lenq
+ neg lenq
+.loop:
+ mova m0, [winq + lenq]
+ mova m4, [src0q + lenq]
+%if cpuflag(sse)
+ mova m1, [winq + len1q]
+ mova m5, [src1q + len1q]
+ shufps m1, m1, 0x1b
+ shufps m5, m5, 0x1b
+ mova m2, m0
+ mova m3, m1
+ mulps m2, m4
+ mulps m3, m5
+ mulps m1, m4
+ mulps m0, m5
+ addps m2, m3
+ subps m1, m0
+ shufps m2, m2, 0x1b
+%else
+ pswapd m1, [winq + len1q]
+ pswapd m5, [src1q + len1q]
+ mova m2, m0
+ mova m3, m1
+ pfmul m2, m4
+ pfmul m3, m5
+ pfmul m1, m4
+ pfmul m0, m5
+ pfadd m2, m3
+ pfsub m1, m0
+ pswapd m2, m2
+%endif
+ mova [dstq + lenq], m1
+ mova [dstq + len1q], m2
+ sub len1q, mmsize
+ add lenq, mmsize
+ jl .loop
+%if mmsize == 8
+ femms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnowext
+VECTOR_FMUL_WINDOW
+INIT_XMM sse
+VECTOR_FMUL_WINDOW
+
+;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+ mova m2, [src2q + lenq]
+ mova m3, [src2q + lenq + mmsize]
+ fmaddps m0, m0, [src1q + lenq], m2
+ fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
+%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
@@ -196,8 +380,14 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL_ADD
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
@@ -205,10 +395,16 @@ VECTOR_FMUL_ADD
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
+%if cpuflag(avx2)
+ movaps m2, [pd_reverse]
+%endif
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
-%if cpuflag(avx)
+%if cpuflag(avx2)
+ vpermps m0, m2, [src1q]
+ vpermps m1, m2, [src1q+mmsize]
+%elif cpuflag(avx)
vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123
@@ -223,8 +419,8 @@ ALIGN 16
%endif
mulps m0, m0, [src0q + lenq + mmsize]
mulps m1, m1, [src0q + lenq]
- mova [dstq + lenq + mmsize], m0
- mova [dstq + lenq], m1
+ movaps [dstq + lenq + mmsize], m0
+ movaps [dstq + lenq], m1
add src1q, 2*mmsize
sub lenq, 2*mmsize
jge .loop
@@ -233,16 +429,22 @@ ALIGN 16
INIT_XMM sse
VECTOR_FMUL_REVERSE
+%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
+%endif
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VECTOR_FMUL_REVERSE
+%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
+ shl offsetd, 2
+ add v1q, offsetq
+ add v2q, offsetq
neg offsetq
- shl offsetq, 2
- sub v1q, offsetq
- sub v2q, offsetq
xorps xmm0, xmm0
.loop:
movaps xmm1, [v1q+offsetq]
@@ -266,14 +468,9 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
-%if ARCH_X86_64
- movsxd lenq, lend
-%endif
- test lenq, lenq
- jz .end
- shl lenq, 2
- lea src0q, [src0q + lenq]
- lea src1q, [src1q + lenq]
+ shl lend, 2
+ add src0q, lenq
+ add src1q, lenq
neg lenq
.loop:
mova m0, [src0q + lenq]
@@ -284,5 +481,4 @@ cglobal butterflies_float, 3,3,3, src0, src1, len
mova [src0q + lenq], m0
add lenq, mmsize
jl .loop
-.end:
REP_RET
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index dcbe3f1f3c..8826e4e2c9 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,128 +29,93 @@ void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len);
+void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1,
+ int len);
+void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1,
+ int len);
+
void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
+void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul,
+ int len);
void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
+void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul,
+ int len);
+void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul,
+ int len);
+void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul,
+ int len);
+
void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
double mul, int len);
+void ff_vector_fmul_window_3dnowext(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+void ff_vector_fmul_window_sse(float *dst, const float *src0,
+ const float *src1, const float *win, int len);
+
void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
const float *src2, int len);
+void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1,
+ const float *src2, int len);
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
const float *src1, int len);
+void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
+ const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
-void ff_butterflies_float_sse(float *restrict src0, float *restrict src1, int len);
-
-#if HAVE_6REGS && HAVE_INLINE_ASM
-static void vector_fmul_window_3dnowext(float *dst, const float *src0,
- const float *src1, const float *win,
- int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 8;
- __asm__ volatile (
- "1: \n"
- "pswapd (%5, %1), %%mm1 \n"
- "movq (%5, %0), %%mm0 \n"
- "pswapd (%4, %1), %%mm5 \n"
- "movq (%3, %0), %%mm4 \n"
- "movq %%mm0, %%mm2 \n"
- "movq %%mm1, %%mm3 \n"
- "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
- "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
- "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
- "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
- "pfadd %%mm3, %%mm2 \n"
- "pfsub %%mm0, %%mm1 \n"
- "pswapd %%mm2, %%mm2 \n"
- "movq %%mm1, (%2, %0) \n"
- "movq %%mm2, (%2, %1) \n"
- "sub $8, %1 \n"
- "add $8, %0 \n"
- "jl 1b \n"
- "femms \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-
-static void vector_fmul_window_sse(float *dst, const float *src0,
- const float *src1, const float *win, int len)
-{
- x86_reg i = -len * 4;
- x86_reg j = len * 4 - 16;
- __asm__ volatile (
- "1: \n"
- "movaps (%5, %1), %%xmm1 \n"
- "movaps (%5, %0), %%xmm0 \n"
- "movaps (%4, %1), %%xmm5 \n"
- "movaps (%3, %0), %%xmm4 \n"
- "shufps $0x1b, %%xmm1, %%xmm1 \n"
- "shufps $0x1b, %%xmm5, %%xmm5 \n"
- "movaps %%xmm0, %%xmm2 \n"
- "movaps %%xmm1, %%xmm3 \n"
- "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
- "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
- "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
- "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
- "addps %%xmm3, %%xmm2 \n"
- "subps %%xmm0, %%xmm1 \n"
- "shufps $0x1b, %%xmm2, %%xmm2 \n"
- "movaps %%xmm1, (%2, %0) \n"
- "movaps %%xmm2, (%2, %1) \n"
- "sub $16, %1 \n"
- "add $16, %0 \n"
- "jl 1b \n"
- : "+r"(i), "+r"(j)
- : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
- );
-}
-#endif /* HAVE_6REGS && HAVE_INLINE_ASM */
+void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_6REGS && HAVE_INLINE_ASM
- if (INLINE_AMD3DNOWEXT(cpu_flags)) {
- fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
+ if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
+ fdsp->vector_fmul_window = ff_vector_fmul_window_3dnowext;
}
- if (INLINE_SSE(cpu_flags)) {
- fdsp->vector_fmul_window = vector_fmul_window_sse;
- }
-#endif
if (EXTERNAL_SSE(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
+ fdsp->vector_fmul_window = ff_vector_fmul_window_sse;
fdsp->vector_fmul_add = ff_vector_fmul_add_sse;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
fdsp->butterflies_float = ff_butterflies_float_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
+ fdsp->vector_dmul = ff_vector_dmul_sse2;
+ fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
fdsp->vector_fmul = ff_vector_fmul_avx;
+ fdsp->vector_dmul = ff_vector_dmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
+ fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
}
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
+ }
+ if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
+ fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
+ fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
+ }
}
diff --git a/libavutil/x86/imgutils.asm b/libavutil/x86/imgutils.asm
index be7a47f6d0..3cca56cdca 100644
--- a/libavutil/x86/imgutils.asm
+++ b/libavutil/x86/imgutils.asm
@@ -1,20 +1,20 @@
;*****************************************************************************
;* Copyright 2016 Anton Khirnov
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavutil/x86/imgutils_init.c b/libavutil/x86/imgutils_init.c
index 20ee9abaa2..4ea398205e 100644
--- a/libavutil/x86/imgutils_init.c
+++ b/libavutil/x86/imgutils_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/intmath.h b/libavutil/x86/intmath.h
new file mode 100644
index 0000000000..40743fd13e
--- /dev/null
+++ b/libavutil/x86/intmath.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2015 James Almer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_INTMATH_H
+#define AVUTIL_X86_INTMATH_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#if HAVE_FAST_CLZ
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__INTEL_COMPILER)
+#include <immintrin.h>
+#endif
+#endif
+#include "config.h"
+
+#if HAVE_FAST_CLZ
+#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER>=1216)) || defined(_MSC_VER)
+# if defined(__INTEL_COMPILER)
+# define ff_log2(x) (_bit_scan_reverse((x)|1))
+# else
+# define ff_log2 ff_log2_x86
+static av_always_inline av_const int ff_log2_x86(unsigned int v)
+{
+ unsigned long n;
+ _BitScanReverse(&n, v|1);
+ return n;
+}
+# endif
+# define ff_log2_16bit av_log2
+
+#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700) && \
+ (defined(__BMI__) || !defined(__clang__)))
+# define ff_ctz(v) _tzcnt_u32(v)
+
+# if ARCH_X86_64
+# define ff_ctzll(v) _tzcnt_u64(v)
+# else
+# define ff_ctzll ff_ctzll_x86
+static av_always_inline av_const int ff_ctzll_x86(long long v)
+{
+ return ((uint32_t)v == 0) ? _tzcnt_u32((uint32_t)(v >> 32)) + 32 : _tzcnt_u32((uint32_t)v);
+}
+# endif
+#endif /* _MSC_VER */
+
+#endif /* __INTEL_COMPILER */
+
+#endif /* HAVE_FAST_CLZ */
+
+#if defined(__GNUC__)
+
+/* Our generic version of av_popcount is faster than GCC's built-in on
+ * CPUs that don't support the popcnt instruction.
+ */
+#if defined(__POPCNT__)
+ #define av_popcount __builtin_popcount
+#if ARCH_X86_64
+ #define av_popcount64 __builtin_popcountll
+#endif
+
+#endif /* __POPCNT__ */
+
+#if defined(__BMI2__)
+
+#if AV_GCC_VERSION_AT_LEAST(5,1)
+#define av_mod_uintp2 __builtin_ia32_bzhi_si
+#elif HAVE_INLINE_ASM
+/* GCC releases before 5.1.0 have a broken bzhi builtin, so for those we
+ * implement it using inline assembly
+ */
+#define av_mod_uintp2 av_mod_uintp2_bmi2
+static av_always_inline av_const unsigned av_mod_uintp2_bmi2(unsigned a, unsigned p)
+{
+ if (av_builtin_constant_p(p))
+ return a & ((1 << p) - 1);
+ else {
+ unsigned x;
+ __asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p));
+ return x;
+ }
+}
+#endif /* AV_GCC_VERSION_AT_LEAST */
+
+#endif /* __BMI2__ */
+
+#if defined(__SSE2__) && !defined(__INTEL_COMPILER)
+
+#define av_clipd av_clipd_sse2
+static av_always_inline av_const double av_clipd_sse2(double a, double amin, double amax)
+{
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+ if (amin > amax) abort();
+#endif
+ __asm__ ("minsd %2, %0 \n\t"
+ "maxsd %1, %0 \n\t"
+ : "+&x"(a) : "xm"(amin), "xm"(amax));
+ return a;
+}
+
+#endif /* __SSE2__ */
+
+#if defined(__SSE__) && !defined(__INTEL_COMPILER)
+
+#define av_clipf av_clipf_sse
+static av_always_inline av_const float av_clipf_sse(float a, float amin, float amax)
+{
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+ if (amin > amax) abort();
+#endif
+ __asm__ ("minss %2, %0 \n\t"
+ "maxss %1, %0 \n\t"
+ : "+&x"(a) : "xm"(amin), "xm"(amax));
+ return a;
+}
+
+#endif /* __SSE__ */
+
+#endif /* __GNUC__ */
+
+#endif /* AVUTIL_X86_INTMATH_H */
diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
index 635096e569..4061d19231 100644
--- a/libavutil/x86/intreadwrite.h
+++ b/libavutil/x86/intreadwrite.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
index eab85ed050..317fba6fca 100644
--- a/libavutil/x86/lls.asm
+++ b/libavutil/x86/lls.asm
@@ -3,20 +3,20 @@
;*
;* Copyright (c) 2013 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -125,7 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
.ret:
REP_RET
-INIT_YMM avx
+%macro UPDATE_LLS 0
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
mov countd, [ctxq + LLSModel.indep_count]
@@ -139,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
vbroadcastsd ymm6, [varq + iq*8 + 16]
vbroadcastsd ymm7, [varq + iq*8 + 24]
vextractf128 xmm3, ymm1, 1
+%if cpuflag(fma3)
+ mova ymm0, COVAR(iq ,0)
+ mova xmm2, COVAR(iq+2,2)
+ fmaddpd ymm0, ymm1, ymm4, ymm0
+ fmaddpd xmm2, xmm3, xmm6, xmm2
+ fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1)
+ fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3)
+ mova COVAR(iq ,0), ymm0
+ mova COVAR(iq ,1), ymm1
+ mova COVAR(iq+2,2), xmm2
+ mova COVAR(iq+2,3), xmm3
+%else
vmulpd ymm0, ymm1, ymm4
vmulpd ymm1, ymm1, ymm5
vmulpd xmm2, xmm3, xmm6
@@ -147,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(iq ,1), ymm1
ADDPD_MEM COVAR(iq+2,2), xmm2
ADDPD_MEM COVAR(iq+2,3), xmm3
+%endif ; cpuflag(fma3)
lea jd, [iq + 4]
cmp jd, count2d
jg .skip4x4
.loop4x4:
; Compute all 16 pairwise products of a 4x4 block
mova ymm3, [varq + jq*8]
+%if cpuflag(fma3)
+ mova ymm0, COVAR(jq, 0)
+ mova ymm1, COVAR(jq, 1)
+ mova ymm2, COVAR(jq, 2)
+ fmaddpd ymm0, ymm3, ymm4, ymm0
+ fmaddpd ymm1, ymm3, ymm5, ymm1
+ fmaddpd ymm2, ymm3, ymm6, ymm2
+ fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3)
+ mova COVAR(jq, 0), ymm0
+ mova COVAR(jq, 1), ymm1
+ mova COVAR(jq, 2), ymm2
+ mova COVAR(jq, 3), ymm3
+%else
vmulpd ymm0, ymm3, ymm4
vmulpd ymm1, ymm3, ymm5
vmulpd ymm2, ymm3, ymm6
@@ -161,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(jq,1), ymm1
ADDPD_MEM COVAR(jq,2), ymm2
ADDPD_MEM COVAR(jq,3), ymm3
+%endif ; cpuflag(fma3)
add jd, 4
cmp jd, count2d
jle .loop4x4
@@ -168,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
cmp jd, countd
jg .skip2x4
mova xmm3, [varq + jq*8]
+%if cpuflag(fma3)
+ mova xmm0, COVAR(jq, 0)
+ mova xmm1, COVAR(jq, 1)
+ mova xmm2, COVAR(jq, 2)
+ fmaddpd xmm0, xmm3, xmm4, xmm0
+ fmaddpd xmm1, xmm3, xmm5, xmm1
+ fmaddpd xmm2, xmm3, xmm6, xmm2
+ fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3)
+ mova COVAR(jq, 0), xmm0
+ mova COVAR(jq, 1), xmm1
+ mova COVAR(jq, 2), xmm2
+ mova COVAR(jq, 3), xmm3
+%else
vmulpd xmm0, xmm3, xmm4
vmulpd xmm1, xmm3, xmm5
vmulpd xmm2, xmm3, xmm6
@@ -176,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
ADDPD_MEM COVAR(jq,1), xmm1
ADDPD_MEM COVAR(jq,2), xmm2
ADDPD_MEM COVAR(jq,3), xmm3
+%endif ; cpuflag(fma3)
.skip2x4:
add id, 4
add covarq, 4*COVAR_STRIDE
@@ -186,15 +227,30 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
mov jd, id
.loop2x1:
vmovddup xmm0, [varq + iq*8]
+%if cpuflag(fma3)
+ mova xmm1, [varq + jq*8]
+ fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0)
+ mova COVAR(jq,0), xmm0
+%else
vmulpd xmm0, [varq + jq*8]
ADDPD_MEM COVAR(jq,0), xmm0
+%endif ; cpuflag(fma3)
inc id
add covarq, COVAR_STRIDE
cmp id, countd
jle .loop2x1
.ret:
REP_RET
+%endmacro ; UPDATE_LLS
+%if HAVE_AVX_EXTERNAL
+INIT_YMM avx
+UPDATE_LLS
+%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+UPDATE_LLS
+%endif
INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 80cda29139..1c5dca42dc 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -3,29 +3,30 @@
*
* Copyright (c) 2013 Loren Merritt
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/lls.h"
#include "libavutil/x86/cpu.h"
-void ff_update_lls_sse2(LLSModel *m, double *var);
-void ff_update_lls_avx(LLSModel *m, double *var);
-double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
+void ff_update_lls_sse2(LLSModel *m, const double *var);
+void ff_update_lls_avx(LLSModel *m, const double *var);
+void ff_update_lls_fma3(LLSModel *m, const double *var);
+double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order);
av_cold void ff_init_lls_x86(LLSModel *m)
{
@@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m)
if (EXTERNAL_AVX_FAST(cpu_flags)) {
m->update_lls = ff_update_lls_avx;
}
+ if (EXTERNAL_FMA3_FAST(cpu_flags)) {
+ m->update_lls = ff_update_lls_fma3;
+ }
}
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
new file mode 100644
index 0000000000..36c57c5f7f
--- /dev/null
+++ b/libavutil/x86/pixelutils.asm
@@ -0,0 +1,386 @@
+;******************************************************************************
+;* Pixel utilities SIMD
+;*
+;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (C) 2014 Clément Bœsch <u pkh me>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION .text
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmx
+cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
+ pxor m7, m7
+ pxor m6, m6
+%rep 4
+ mova m0, [src1q]
+ mova m2, [src1q + stride1q]
+ mova m1, [src2q]
+ mova m3, [src2q + stride2q]
+ psubusb m4, m0, m1
+ psubusb m5, m2, m3
+ psubusb m1, m0
+ psubusb m3, m2
+ por m1, m4
+ por m3, m5
+ punpcklbw m0, m1, m7
+ punpcklbw m2, m3, m7
+ punpckhbw m1, m7
+ punpckhbw m3, m7
+ paddw m0, m1
+ paddw m2, m3
+ paddw m0, m2
+ paddw m6, m0
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+%endrep
+ psrlq m0, m6, 32
+ paddw m6, m0
+ psrlq m0, m6, 16
+ paddw m6, m0
+ movd eax, m6
+ movzx eax, ax
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
+ pxor m2, m2
+%rep 4
+ mova m0, [src1q]
+ mova m1, [src1q + stride1q]
+ psadbw m0, [src2q]
+ psadbw m1, [src2q + stride2q]
+ paddw m2, m0
+ paddw m2, m1
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+%endrep
+ movd eax, m2
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
+ pxor m2, m2
+%rep 16
+ mova m0, [src1q]
+ mova m1, [src1q + 8]
+ psadbw m0, [src2q]
+ psadbw m1, [src2q + 8]
+ paddw m2, m0
+ paddw m2, m1
+ add src1q, stride1q
+ add src2q, stride2q
+%endrep
+ movd eax, m2
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
+ movu m4, [src1q]
+ movu m2, [src2q]
+ movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m4, m2
+ psadbw m1, m3
+ paddw m4, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ movu m0, [src1q]
+ movu m2, [src2q]
+ movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m0, m2
+ psadbw m1, m3
+ paddw m4, m0
+ paddw m4, m1
+%endrep
+ movhlps m0, m4
+ paddw m4, m0
+ movd eax, m4
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+%macro SAD_XMM_16x16 1
+INIT_XMM sse2
+cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
+ mov%1 m2, [src2q]
+ psadbw m2, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ mov%1 m0, [src2q]
+ psadbw m0, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m0
+ paddw m2, m1
+%endrep
+ movhlps m0, m2
+ paddw m2, m0
+ movd eax, m2
+ RET
+%endmacro
+
+SAD_XMM_16x16 a
+SAD_XMM_16x16 u
+
+
+%macro PROCESS_SAD_32x4_U 0
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ movu m1, [r2]
+ movu m2, [r2 + 16]
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ psadbw m1, m3
+ psadbw m2, m4
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+%endmacro
+
+%macro PROCESS_SAD_32x4 1
+ mov%1 m1, [r2]
+ mov%1 m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ mov%1 m1, [r2]
+ mov%1 m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ mov%1 m1, [r2]
+ mov%1 m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+
+ mov%1 m1, [r2]
+ mov%1 m2, [r2 + 16]
+ psadbw m1, [r0]
+ psadbw m2, [r0 + 16]
+ paddd m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
+ pxor m0, m0
+ mov r4d, 4
+.loop:
+ PROCESS_SAD_32x4_U
+ PROCESS_SAD_32x4_U
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+%macro SAD_XMM_32x32 1
+INIT_XMM sse2
+cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
+ pxor m0, m0
+ mov r4d, 4
+.loop:
+ PROCESS_SAD_32x4 %1
+ PROCESS_SAD_32x4 %1
+ dec r4d
+ jnz .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+SAD_XMM_32x32 a
+SAD_XMM_32x32 u
+
+%if HAVE_AVX2_EXTERNAL
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
+ pxor m0, m0
+ mov r4d, 32/4
+ lea r5, [stride1q * 3]
+ lea r6, [stride2q * 3]
+
+.loop:
+ movu m1, [src1q] ; row 0 of pix0
+ movu m2, [src2q] ; row 0 of pix1
+ movu m3, [src1q + stride1q] ; row 1 of pix0
+ movu m4, [src2q + stride2q] ; row 1 of pix1
+
+ psadbw m1, m2
+ psadbw m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ movu m1, [src1q + 2 * stride1q] ; row 2 of pix0
+ movu m2, [src2q + 2 * stride2q] ; row 2 of pix1
+ movu m3, [src1q + r5] ; row 3 of pix0
+ movu m4, [src2q + r6] ; row 3 of pix1
+
+ psadbw m1, m2
+ psadbw m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ lea src2q, [src2q + 4 * stride2q]
+ lea src1q, [src1q + 4 * stride1q]
+
+ dec r4d
+ jnz .loop
+
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1
+ pshufd xm1, xm0, 2
+ paddd xm0, xm1
+ movd eax, xm0
+ RET
+
+;-------------------------------------------------------------------------------
+; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
+; const uint8_t *src2, ptrdiff_t stride2);
+;-------------------------------------------------------------------------------
+%macro SAD_AVX2_32x32 1
+INIT_YMM avx2
+cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
+ pxor m0, m0
+ mov r4d, 32/4
+ lea r5, [stride1q * 3]
+ lea r6, [stride2q * 3]
+
+.loop:
+ mov%1 m1, [src2q] ; row 0 of pix1
+ psadbw m1, [src1q]
+ mov%1 m2, [src2q + stride2q] ; row 1 of pix1
+ psadbw m2, [src1q + stride1q]
+
+ paddd m0, m1
+ paddd m0, m2
+
+ mov%1 m1, [src2q + 2 * stride2q] ; row 2 of pix1
+ psadbw m1, [src1q + 2 * stride1q]
+ mov%1 m2, [src2q + r6] ; row 3 of pix1
+ psadbw m2, [src1q + r5]
+
+ paddd m0, m1
+ paddd m0, m2
+
+ lea src2q, [src2q + 4 * stride2q]
+ lea src1q, [src1q + 4 * stride1q]
+
+ dec r4d
+ jnz .loop
+
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1
+ pshufd xm1, xm0, 2
+ paddd xm0, xm1
+ movd eax, xm0
+ RET
+%endmacro
+
+SAD_AVX2_32x32 a
+SAD_AVX2_32x32 u
+%endif
diff --git a/libavutil/x86/pixelutils.h b/libavutil/x86/pixelutils.h
new file mode 100644
index 0000000000..876cf46053
--- /dev/null
+++ b/libavutil/x86/pixelutils.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_X86_PIXELUTILS_H
+#define AVUTIL_X86_PIXELUTILS_H
+
+#include "libavutil/pixelutils.h"
+
+void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned);
+
+#endif /* AVUTIL_X86_PIXELUTILS_H */
diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c
new file mode 100644
index 0000000000..184a3a4a9f
--- /dev/null
+++ b/libavutil/x86/pixelutils_init.c
@@ -0,0 +1,94 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "pixelutils.h"
+#include "cpu.h"
+
+int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
+ const uint8_t *src2, ptrdiff_t stride2);
+
+void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ sad[2] = ff_pixelutils_sad_8x8_mmx;
+ }
+
+ // The best way to use SSE2 would be to do 2 SADs in parallel,
+ // but we'd have to modify the pixelutils API to return SIMD functions.
+
+ // It's probably not faster to shuffle data around
+ // to get two lines of 8 pixels into a single 16byte register,
+ // so just use the MMX 8x8 version even when SSE2 is available.
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ sad[2] = ff_pixelutils_sad_8x8_mmxext;
+ sad[3] = ff_pixelutils_sad_16x16_mmxext;
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ switch (aligned) {
+ case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned
+ case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned
+ case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned
+ }
+ }
+
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ switch (aligned) {
+ case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned
+ case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned
+ case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned
+ }
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ switch (aligned) {
+ case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned
+ case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned
+ case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned
+ }
+ }
+}
diff --git a/libavutil/x86/timer.h b/libavutil/x86/timer.h
index bb7c341341..4d1e88def0 100644
--- a/libavutil/x86/timer.h
+++ b/libavutil/x86/timer.h
@@ -1,20 +1,20 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,6 +25,7 @@
#if HAVE_INLINE_ASM
+#define FF_TIMER_UNITS "decicycles"
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
diff --git a/libavutil/x86/w64xmmtest.h b/libavutil/x86/w64xmmtest.h
index b4ce7d3daf..a4a05b0419 100644
--- a/libavutil/x86/w64xmmtest.h
+++ b/libavutil/x86/w64xmmtest.h
@@ -2,23 +2,26 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#ifndef AVUTIL_X86_W64XMMTEST_H
+#define AVUTIL_X86_W64XMMTEST_H
+
#include <inttypes.h>
#include <stdint.h>
#include <stdlib.h>
@@ -71,3 +74,5 @@
int __real_ ## func; \
int __wrap_ ## func; \
int __wrap_ ## func
+
+#endif /* AVUTIL_X86_W64XMMTEST_H */
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index e04dbfedf3..5044ee86f0 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1,12 +1,12 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
+;* Copyright (C) 2005-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Henrik Gramner <henrik@gramner.com>
;* Anton Mitrofanov <BugMaster@narod.ru>
;* Fiona Glaser <fiona@x264.com>
-;* Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
@@ -88,6 +88,12 @@
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,aout
SECTION .text
+ %elifidn __OUTPUT_FORMAT__,coff
+ SECTION .text
+ %elifidn __OUTPUT_FORMAT__,win32
+ SECTION .rdata align=%1
+ %elif WIN64
+ SECTION .rdata align=%1
%else
SECTION .rodata align=%1
%endif
@@ -335,6 +341,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@@ -426,10 +434,10 @@ DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
@@ -448,15 +456,16 @@ DECLARE_REG 14, R15, 120
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
- %if xmm_regs_used > 8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
%assign %%i 8
- %rep xmm_regs_used-8
+ %rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@@ -465,53 +474,56 @@ DECLARE_REG 14, R15, 120
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 8
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
- %assign %%pad (xmm_regs_used-8)*16 + 32
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
WIN64_PUSH_XMM
%endmacro
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
- %if xmm_regs_used > 8
- %assign %%i xmm_regs_used
- %rep xmm_regs_used-8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
%assign %%i %%i-1
- movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+ movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
%endif
%if stack_size_padded > 0
%if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
- add %1, stack_size_padded
+ add rsp, stack_size_padded
%assign %%pad_size stack_size_padded
%endif
%endif
- %if xmm_regs_used > 7
- movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+ %if xmm_regs_used > 7 + high_mm_regs
+ movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
- %if xmm_regs_used > 6
- movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+ %if xmm_regs_used > 6 + high_mm_regs
+ movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
-%macro WIN64_RESTORE_XMM 1
- WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+ WIN64_RESTORE_XMM_INTERNAL
%assign stack_offset (stack_offset-stack_size_padded)
+ %assign stack_size_padded 0
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%macro RET 0
- WIN64_RESTORE_XMM_INTERNAL rsp
+ WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -530,14 +542,15 @@ DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
+ %assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@@ -547,7 +560,7 @@ DECLARE_REG 14, R15, 72
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -558,7 +571,7 @@ DECLARE_REG 14, R15, 72
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -603,7 +616,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -614,7 +627,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endif
POP_IF_USED 6, 5, 4, 3
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -625,7 +638,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
- %macro WIN64_RESTORE_XMM 1
+ %macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
@@ -636,7 +649,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
- %if has_epilogue
+ %if has_epilogue || cpuflag(ssse3)
RET
%else
rep ret
@@ -724,12 +737,22 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
%endmacro
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+ %if FORMAT_ELF
+ global current_function %+ %1:function hidden
+ %else
+ global current_function %+ %1
+ %endif
+ %1:
+%endmacro
+
%macro cextern 1
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
CAT_XDEFINE cglobaled_, %1, 1
@@ -787,24 +810,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
-%assign cpuflags_avx (1<<11)| cpuflags_sse42
-%assign cpuflags_xop (1<<12)| cpuflags_avx
-%assign cpuflags_fma4 (1<<13)| cpuflags_avx
-%assign cpuflags_fma3 (1<<14)| cpuflags_avx
-%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
-
-%assign cpuflags_cache32 (1<<16)
-%assign cpuflags_cache64 (1<<17)
-%assign cpuflags_slowctz (1<<18)
-%assign cpuflags_lzcnt (1<<19)
-%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom (1<<21)
-%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni (1<<12)| cpuflags_sse42
+%assign cpuflags_avx (1<<13)| cpuflags_sse42
+%assign cpuflags_xop (1<<14)| cpuflags_avx
+%assign cpuflags_fma4 (1<<15)| cpuflags_avx
+%assign cpuflags_fma3 (1<<16)| cpuflags_avx
+%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
+%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+
+%assign cpuflags_cache32 (1<<21)
+%assign cpuflags_cache64 (1<<22)
+%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<24)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -852,11 +876,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%endmacro
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@@ -866,69 +891,99 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%undef %1%2
%endmacro
+%macro DEFINE_MMREGS 1 ; mmtype
+ %assign %%prev_mmregs 0
+ %ifdef num_mmregs
+ %assign %%prev_mmregs num_mmregs
+ %endif
+
+ %assign num_mmregs 8
+ %if ARCH_X86_64 && mmsize >= 16
+ %assign num_mmregs 16
+ %if cpuflag(avx512) || mmsize == 64
+ %assign num_mmregs 32
+ %endif
+ %endif
+
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1 %+ %%i
+ CAT_XDEFINE nn%1, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %if %%prev_mmregs > num_mmregs
+ %rep %%prev_mmregs - num_mmregs
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nn %+ mmtype, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+ %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
%define mmsize 8
- %define num_mmregs 8
%define mova movq
%define movu movq
%define movh movd
%define movnta movntq
- %assign %%i 0
- %rep 8
- CAT_XDEFINE m, %%i, mm %+ %%i
- CAT_XDEFINE nnmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
- %rep 8
- CAT_UNDEF m, %%i
- CAT_UNDEF nnmm, %%i
- %assign %%i %%i+1
- %endrep
INIT_CPUFLAGS %1
+ DEFINE_MMREGS mm
%endmacro
%macro INIT_XMM 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
- %define num_mmregs 8
- %if ARCH_X86_64
- %define num_mmregs 16
- %endif
%define mova movdqa
%define movu movdqu
%define movh movq
%define movnta movntdq
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, xmm %+ %%i
- CAT_XDEFINE nnxmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
INIT_CPUFLAGS %1
+ DEFINE_MMREGS xmm
+ %if WIN64
+ AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+ %endif
%endmacro
%macro INIT_YMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32
- %define num_mmregs 8
- %if ARCH_X86_64
- %define num_mmregs 16
- %endif
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, ymm %+ %%i
- CAT_XDEFINE nnymm, %%i, %%i
- %assign %%i %%i+1
- %endrep
INIT_CPUFLAGS %1
+ DEFINE_MMREGS ymm
+ AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS zmm
+ AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
@@ -937,18 +992,26 @@ INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
%endmacro
%assign i 0
-%rep 16
+%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@@ -1036,7 +1099,11 @@ INIT_XMM
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
- call_internal %1 %+ SUFFIX, %1
+ %ifid %1
+ call_internal %1 %+ SUFFIX, %1
+ %else
+ call %1
+ %endif
%endmacro
%macro call_internal 2
%xdefine %%i %2
@@ -1079,12 +1146,17 @@ INIT_XMM
;=============================================================================
%assign i 0
-%rep 16
+%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@@ -1201,7 +1273,7 @@ INIT_XMM
%endmacro
%endmacro
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@@ -1209,12 +1281,12 @@ AVX_INSTR addsd, sse2, 1, 0, 0
AVX_INSTR addss, sse, 1, 0, 0
AVX_INSTR addsubpd, sse3, 1, 0, 0
AVX_INSTR addsubps, sse3, 1, 0, 0
-AVX_INSTR aesdec, fnord, 0, 0, 0
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
-AVX_INSTR aesenc, fnord, 0, 0, 0
-AVX_INSTR aesenclast, fnord, 0, 0, 0
-AVX_INSTR aesimc
-AVX_INSTR aeskeygenassist
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
AVX_INSTR andnpd, sse2, 1, 0, 0
AVX_INSTR andnps, sse, 1, 0, 0
AVX_INSTR andpd, sse2, 1, 0, 1
@@ -1223,10 +1295,42 @@ AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4 ; can't be emulated
AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2 1, 0, 1
+AVX_INSTR cmpordps, sse 1, 0, 1
+AVX_INSTR cmpordsd, sse2 1, 0, 0
+AVX_INSTR cmpordss, sse 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2
AVX_INSTR comiss, sse
AVX_INSTR cvtdq2pd, sse2
@@ -1537,6 +1641,52 @@ FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
+
; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
%if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 81a3ada6e6..d7cd996842 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -6,20 +6,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -29,6 +29,11 @@
%include "libavutil/x86/x86inc.asm"
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base + stride], [base + 2*stride], [base3], \
+ [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4]
+
; Interleave low src0 with low src1 and store in src0,
; interleave high src0 with high src1 and store in src1.
; %1 - types
@@ -40,7 +45,10 @@
; output: src0: x0 y0 x1 y1 x2 y2 x3 y3
; src1: z0 q0 z1 q1 z2 q2 z3 q3
%macro SBUTTERFLY 4
-%if avx_enabled == 0
+%ifidn %1, dqqq
+ vperm2i128 m%4, m%2, m%3, q0301
+ vinserti128 m%2, m%2, xm%3, 1
+%elif avx_enabled == 0
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
@@ -63,6 +71,12 @@
SWAP %1, %3, %2
%endmacro
+%macro SBUTTERFLYPD 3
+ movlhps m%3, m%1, m%2
+ movhlps m%2, m%2, m%1
+ SWAP %1, %3
+%endmacro
+
%macro TRANSPOSE4x4B 5
SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5
@@ -79,6 +93,15 @@
SWAP %2, %3
%endmacro
+%macro TRANSPOSE2x4x4B 5
+ SBUTTERFLY bw, %1, %2, %5
+ SBUTTERFLY bw, %3, %4, %5
+ SBUTTERFLY wd, %1, %3, %5
+ SBUTTERFLY wd, %2, %4, %5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+%endmacro
+
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -100,12 +123,46 @@
%macro TRANSPOSE4x4PS 5
SBUTTERFLYPS %1, %2, %5
SBUTTERFLYPS %3, %4, %5
- movlhps m%5, m%1, m%3
- movhlps m%3, m%1
- SWAP %5, %1
- movlhps m%5, m%2, m%4
- movhlps m%4, m%2
- SWAP %5, %2, %3
+ SBUTTERFLYPD %1, %3, %5
+ SBUTTERFLYPD %2, %4, %5
+ SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE8x4D 9-11
+%if ARCH_X86_64
+ SBUTTERFLY dq, %1, %2, %9
+ SBUTTERFLY dq, %3, %4, %9
+ SBUTTERFLY dq, %5, %6, %9
+ SBUTTERFLY dq, %7, %8, %9
+ SBUTTERFLY qdq, %1, %3, %9
+ SBUTTERFLY qdq, %2, %4, %9
+ SBUTTERFLY qdq, %5, %7, %9
+ SBUTTERFLY qdq, %6, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7
+; out: m0..m7, unless %11 in which case m2 is in %9
+; spills into %9 and %10
+ movdqa %9, m%7
+ SBUTTERFLY dq, %1, %2, %7
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY dq, %3, %4, %2
+ SBUTTERFLY dq, %5, %6, %2
+ SBUTTERFLY dq, %7, %8, %2
+ SBUTTERFLY qdq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY qdq, %2, %4, %3
+ SBUTTERFLY qdq, %5, %7, %3
+ SBUTTERFLY qdq, %6, %8, %3
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ movdqa m%3, %9
+%endif
+%endif
%endmacro
%macro TRANSPOSE8x8W 9-11
@@ -157,6 +214,85 @@
%endif
%endmacro
+%macro TRANSPOSE16x16W 18-19
+; in: m0..m15, unless %19 in which case m6 is in %17
+; out: m0..m15, unless %19 in which case m4 is in %18
+; spills into %17 and %18
+%if %0 < 19
+ mova %17, m%7
+%endif
+
+ SBUTTERFLY dqqq, %1, %9, %7
+ SBUTTERFLY dqqq, %2, %10, %7
+ SBUTTERFLY dqqq, %3, %11, %7
+ SBUTTERFLY dqqq, %4, %12, %7
+ SBUTTERFLY dqqq, %5, %13, %7
+ SBUTTERFLY dqqq, %6, %14, %7
+ mova %18, m%14
+ mova m%7, %17
+ SBUTTERFLY dqqq, %7, %15, %14
+ SBUTTERFLY dqqq, %8, %16, %14
+
+ SBUTTERFLY wd, %1, %2, %14
+ SBUTTERFLY wd, %3, %4, %14
+ SBUTTERFLY wd, %5, %6, %14
+ SBUTTERFLY wd, %7, %8, %14
+ SBUTTERFLY wd, %9, %10, %14
+ SBUTTERFLY wd, %11, %12, %14
+ mova %17, m%12
+ mova m%14, %18
+ SBUTTERFLY wd, %13, %14, %12
+ SBUTTERFLY wd, %15, %16, %12
+
+ SBUTTERFLY dq, %1, %3, %12
+ SBUTTERFLY dq, %2, %4, %12
+ SBUTTERFLY dq, %5, %7, %12
+ SBUTTERFLY dq, %6, %8, %12
+ SBUTTERFLY dq, %9, %11, %12
+ mova %18, m%11
+ mova m%12, %17
+ SBUTTERFLY dq, %10, %12, %11
+ SBUTTERFLY dq, %13, %15, %11
+ SBUTTERFLY dq, %14, %16, %11
+
+ SBUTTERFLY qdq, %1, %5, %11
+ SBUTTERFLY qdq, %2, %6, %11
+ SBUTTERFLY qdq, %3, %7, %11
+ SBUTTERFLY qdq, %4, %8, %11
+
+ SWAP %2, %5
+ SWAP %4, %7
+
+ SBUTTERFLY qdq, %9, %13, %11
+ SBUTTERFLY qdq, %10, %14, %11
+ mova m%11, %18
+ mova %18, m%5
+ SBUTTERFLY qdq, %11, %15, %5
+ SBUTTERFLY qdq, %12, %16, %5
+
+%if %0 < 19
+ mova m%5, %18
+%endif
+
+ SWAP %10, %13
+ SWAP %12, %15
+%endmacro
+
+%macro TRANSPOSE_8X8B 8
+ %if mmsize == 8
+ %error "This macro does not support mmsize == 8"
+ %endif
+ punpcklbw m%1, m%2
+ punpcklbw m%3, m%4
+ punpcklbw m%5, m%6
+ punpcklbw m%7, m%8
+ TRANSPOSE4x4W %1, %3, %5, %7, %2
+ MOVHL m%2, m%1
+ MOVHL m%4, m%3
+ MOVHL m%6, m%5
+ MOVHL m%8, m%7
+%endmacro
+
; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
%macro PABSW 2
%if cpuflag(ssse3)
@@ -283,6 +419,55 @@
%endif
%endmacro
+%macro HADDD 2 ; sum junk
+%if sizeof%1 == 32
+%define %2 xmm%2
+ vextracti128 %2, %1, 1
+%define %1 xmm%1
+ paddd %1, %2
+%endif
+%if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+ vphadddq %1, %1
+%endif
+ movhlps %2, %1
+ paddd %1, %2
+%endif
+%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
+ PSHUFLW %2, %1, q0032
+%else ; mmx
+ mova %2, %1
+ psrlq %2, 32
+%endif
+ paddd %1, %2
+%endif
+%undef %1
+%undef %2
+%endmacro
+
+%macro HADDW 2 ; reg, tmp
+%if cpuflag(xop) && sizeof%1 == 16
+ vphaddwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
+ pmaddwd %1, [pw_1]
+ HADDD %1, %2
+%endif
+%endmacro
+
+%macro HADDPS 3 ; dst, src, tmp
+%if cpuflag(sse3)
+ haddps %1, %1, %2
+%else
+ movaps %3, %1
+ shufps %1, %2, q2020
+ shufps %3, %2, q3131
+ addps %1, %3
+%endif
+%endmacro
+
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
@@ -312,11 +497,19 @@
%endif
%endmacro
-%macro PAVGB 2
+%macro PAVGB 2-4
%if cpuflag(mmxext)
pavgb %1, %2
%elif cpuflag(3dnow)
pavgusb %1, %2
+%elif cpuflag(mmx)
+ movu %3, %2
+ por %3, %1
+ pxor %1, %2
+ pand %1, %4
+ psrlq %1, 1
+ psubb %3, %1
+ SWAP %1, %3
%endif
%endmacro
@@ -649,12 +842,25 @@
%endif
%endmacro
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
- vbroadcastss %1, %2
-%else ; sse
- movss %1, %2
- shufps %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vbroadcastss %1, %2
+%elif cpuflag(avx)
+ %ifnum sizeof%2 ; avx1 register
+ shufps xmm%1, xmm%2, xmm%2, q0000
+ %if sizeof%1 >= 32 ; mmsize>=32
+ vinsertf128 %1, %1, xmm%1, 1
+ %endif
+ %else ; avx1 memory
+ vbroadcastss %1, %2
+ %endif
+%else
+ %ifnum sizeof%2 ; sse register
+ shufps %1, %2, %2, q0000
+ %else ; sse memory
+ movss %1, %2
+ shufps %1, %1, 0
+ %endif
%endif
%endmacro
@@ -669,6 +875,29 @@
%endif
%endmacro
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+ vpbroadcastd %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+ %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+ %ifnum sizeof%2 ; sse2 register
+ pshufd %1, %2, q0000
+ %else ; sse memory
+ movd %1, %2
+ pshufd %1, %1, 0
+ %endif
+%endif
+%endmacro
+
+%macro VBROADCASTI128 2 ; dst xmm/ymm, src : 128bits val
+%if mmsize > 16
+ vbroadcasti128 %1, %2
+%else
+ mova %1, %2
+%endif
+%endmacro
+
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
@@ -705,3 +934,95 @@
addps %1, %4
%endif
%endmacro
+
+%macro LSHIFT 2
+%if mmsize > 8
+ pslldq %1, %2
+%else
+ psllq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro RSHIFT 2
+%if mmsize > 8
+ psrldq %1, %2
+%else
+ psrlq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+ punpckhqdq %1, %2
+%elif cpuflag(avx)
+ punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro
+
+; Horizontal Sum of Packed Single precision floats
+; The resulting sum is in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+ %if sizeof%1>=32 ; avx
+ vperm2f128 %2, %1, %1, (0)*16+(1)
+ addps %1, %2
+ %endif
+ shufps %2, %1, %1, q1032
+ addps %1, %2
+ shufps %2, %1, %1, q0321
+ addps %1, %2
+%else ; this form is a bit faster than the short avx-like emulation.
+ movaps %2, %1
+ shufps %1, %1, q1032
+ addps %1, %2
+ movaps %2, %1
+ shufps %1, %1, q0321
+ addps %1, %2
+ ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; Emulate blendvps if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+ blendvps %1, %1, %2, %3
+%elif cpuflag(sse4)
+ %ifnidn %3,xmm0
+ %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+ %endif
+ blendvps %1, %2, %3
+%else
+ xorps %2, %1
+ andps %2, %3
+ xorps %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+ %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+ %error pblendb not possible with ymm on avx1, try blendvps.
+ %endif
+ pblendvb %1, %1, %2, %3
+%elif cpuflag(sse4)
+ %ifnidn %3,xmm0
+ %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3
+ %endif
+ pblendvb %1, %2, %3
+%else
+ pxor %2, %1
+ pand %2, %3
+ pxor %1, %2
+%endif
+%endmacro