diff options
Diffstat (limited to 'libavutil/x86')
-rw-r--r-- | libavutil/x86/Makefile | 16 | ||||
-rw-r--r-- | libavutil/x86/asm.h | 55 | ||||
-rw-r--r-- | libavutil/x86/bswap.h | 44 | ||||
-rw-r--r-- | libavutil/x86/cpu.c | 60 | ||||
-rw-r--r-- | libavutil/x86/cpu.h | 17 | ||||
-rw-r--r-- | libavutil/x86/cpuid.asm | 8 | ||||
-rw-r--r-- | libavutil/x86/emms.asm | 8 | ||||
-rw-r--r-- | libavutil/x86/emms.h | 20 | ||||
-rw-r--r-- | libavutil/x86/fixed_dsp.asm | 48 | ||||
-rw-r--r-- | libavutil/x86/fixed_dsp_init.c | 35 | ||||
-rw-r--r-- | libavutil/x86/float_dsp.asm | 286 | ||||
-rw-r--r-- | libavutil/x86/float_dsp_init.c | 121 | ||||
-rw-r--r-- | libavutil/x86/imgutils.asm | 8 | ||||
-rw-r--r-- | libavutil/x86/imgutils_init.c | 8 | ||||
-rw-r--r-- | libavutil/x86/intmath.h | 139 | ||||
-rw-r--r-- | libavutil/x86/intreadwrite.h | 8 | ||||
-rw-r--r-- | libavutil/x86/lls.asm | 66 | ||||
-rw-r--r-- | libavutil/x86/lls_init.c | 18 | ||||
-rw-r--r-- | libavutil/x86/pixelutils.asm | 386 | ||||
-rw-r--r-- | libavutil/x86/pixelutils.h | 26 | ||||
-rw-r--r-- | libavutil/x86/pixelutils_init.c | 94 | ||||
-rw-r--r-- | libavutil/x86/timer.h | 9 | ||||
-rw-r--r-- | libavutil/x86/w64xmmtest.h | 13 | ||||
-rw-r--r-- | libavutil/x86/x86inc.asm | 354 | ||||
-rw-r--r-- | libavutil/x86/x86util.asm | 357 |
25 files changed, 1875 insertions, 329 deletions
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index b619478610..5f5242b5bd 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -1,10 +1,18 @@ OBJS += x86/cpu.o \ + x86/fixed_dsp_init.o \ x86/float_dsp_init.o \ x86/imgutils_init.o \ x86/lls_init.o \ +OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ + +EMMS_OBJS_$(HAVE_MMX_INLINE)_$(HAVE_MMX_EXTERNAL)_$(HAVE_MM_EMPTY) = x86/emms.o + X86ASM-OBJS += x86/cpuid.o \ - x86/emms.o \ - x86/float_dsp.o \ - x86/imgutils.o \ - x86/lls.o \ + $(EMMS_OBJS__yes_) \ + x86/fixed_dsp.o \ + x86/float_dsp.o \ + x86/imgutils.o \ + x86/lls.o \ + +X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \ diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h index f005a3a38a..9bff42d628 100644 --- a/libavutil/x86/asm.h +++ b/libavutil/x86/asm.h @@ -1,20 +1,20 @@ /* * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -38,7 +38,8 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg; # define FF_PTR_SIZE "8" typedef int64_t x86_reg; -# define FF_REG_SP "rsp" +/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */ +# define FF_REG_sp "rsp" # define FF_REG_BP "rbp" # define FF_REGBP rbp # define FF_REGa rax @@ -59,7 +60,7 @@ typedef int64_t x86_reg; # define FF_PTR_SIZE "4" typedef int32_t x86_reg; -# define FF_REG_SP "esp" +# define FF_REG_sp "esp" # define FF_REG_BP "ebp" # define FF_REGBP ebp # define FF_REGa eax @@ -108,6 +109,46 @@ typedef int x86_reg; # define LOCAL_MANGLE(a) #a #endif -#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a) +#if HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS +# define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a) +# define NAMED_CONSTRAINTS_ADD(...) +# define NAMED_CONSTRAINTS(...) +# define NAMED_CONSTRAINTS_ARRAY_ADD(...) +# define NAMED_CONSTRAINTS_ARRAY(...) +#else + /* When direct symbol references are used in code passed to a compiler that does not support them + * then these references need to be converted to named asm constraints instead. + * Instead of returning a direct symbol MANGLE now returns a named constraint for that specific symbol. + * In order for this to work there must also be a corresponding entry in the asm-interface. To add this + * entry use the macro NAMED_CONSTRAINTS() and pass in a list of each symbol reference used in the + * corresponding block of code. (e.g. NAMED_CONSTRAINTS(var1,var2,var3) where var1 is the first symbol etc. ). + * If there are already existing constraints then use NAMED_CONSTRAINTS_ADD to add to the existing constraint list. + */ +# define MANGLE(a) "%["#a"]" + // Intel/MSVC does not correctly expand va-args so we need a rather ugly hack in order to get it to work +# define FE_0(P,X) P(X) +# define FE_1(P,X,X1) P(X), FE_0(P,X1) +# define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2) +# define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3) +# define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4) +# define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5) +# define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6) +# define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7) +# define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8) +# define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9) +# define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME +# define GET_FE(A) GET_FE_IMPL A +# define GET_FE_GLUE(x, y) x y +# define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0)), (P,__VA_ARGS__)) +# define NAME_CONSTRAINT(x) [x] "m"(x) + // Parameters are a list of each symbol reference required +# define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__) + // Same but without comma for when there are no previously defined constraints +# define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__) + // Same as above NAMED_CONSTRAINTS except used for passing arrays/pointers instead of normal variables +# define NAME_CONSTRAINT_ARRAY(x) [x] "m"(*x) +# define NAMED_CONSTRAINTS_ARRAY_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__) +# define NAMED_CONSTRAINTS_ARRAY(...) FOR_EACH_VA(NAME_CONSTRAINT_ARRAY,__VA_ARGS__) +#endif #endif /* AVUTIL_X86_ASM_H */ diff --git a/libavutil/x86/bswap.h b/libavutil/x86/bswap.h index c73be9af81..ffa59e4c82 100644 --- a/libavutil/x86/bswap.h +++ b/libavutil/x86/bswap.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,21 +25,47 @@ #define AVUTIL_X86_BSWAP_H #include <stdint.h> +#if defined(_MSC_VER) +#include <intrin.h> +#endif #include "config.h" #include "libavutil/attributes.h" -#if HAVE_INLINE_ASM +#if defined(_MSC_VER) + +#define av_bswap16 av_bswap16 +static av_always_inline av_const uint16_t av_bswap16(uint16_t x) +{ + return _rotr16(x, 8); +} + +#define av_bswap32 av_bswap32 +static av_always_inline av_const uint32_t av_bswap32(uint32_t x) +{ + return _byteswap_ulong(x); +} + +#if ARCH_X86_64 +#define av_bswap64 av_bswap64 +static inline uint64_t av_const av_bswap64(uint64_t x) +{ + return _byteswap_uint64(x); +} +#endif + + +#elif HAVE_INLINE_ASM -#if !AV_GCC_VERSION_AT_LEAST(4,1) +#if AV_GCC_VERSION_AT_MOST(4,0) #define av_bswap16 av_bswap16 static av_always_inline av_const unsigned av_bswap16(unsigned x) { __asm__("rorw $8, %w0" : "+r"(x)); return x; } -#endif /* !AV_GCC_VERSION_AT_LEAST(4,1) */ +#endif /* AV_GCC_VERSION_AT_MOST(4,0) */ -#if !AV_GCC_VERSION_AT_LEAST(4,5) +#if AV_GCC_VERSION_AT_MOST(4,4) || defined(__INTEL_COMPILER) #define av_bswap32 av_bswap32 static av_always_inline av_const uint32_t av_bswap32(uint32_t x) { @@ -55,7 +81,7 @@ static inline uint64_t av_const av_bswap64(uint64_t x) return x; } #endif -#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */ +#endif /* AV_GCC_VERSION_AT_MOST(4,4) */ #endif /* HAVE_INLINE_ASM */ #endif /* AVUTIL_X86_BSWAP_H */ diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index fa50eeb9dd..bcd41a50a2 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -3,20 +3,20 @@ * (c)1997-99 by H. Dietz and R. Fisher * Converted to C and improved by Fabrice Bellard. * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -42,10 +42,10 @@ #define cpuid(index, eax, ebx, ecx, edx) \ __asm__ volatile ( \ "mov %%"FF_REG_b", %%"FF_REG_S" \n\t" \ - "cpuid \n\t" \ + "cpuid \n\t" \ "xchg %%"FF_REG_b", %%"FF_REG_S \ : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \ - : "0" (index)) + : "0" (index), "2"(0)) #define xgetbv(index, eax, edx) \ __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index)) @@ -97,6 +97,7 @@ int ff_get_cpu_flags_x86(void) int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0; int family = 0, model = 0; union { int i[3]; char c[12]; } vendor; + int xcr0_lo = 0, xcr0_hi = 0; if (!cpuid_test()) return 0; /* CPUID not supported */ @@ -126,12 +127,14 @@ int ff_get_cpu_flags_x86(void) rval |= AV_CPU_FLAG_SSE4; if (ecx & 0x00100000 ) rval |= AV_CPU_FLAG_SSE42; + if (ecx & 0x02000000 ) + rval |= AV_CPU_FLAG_AESNI; #if HAVE_AVX /* Check OXSAVE and AVX bits */ if ((ecx & 0x18000000) == 0x18000000) { /* Check for OS support */ - xgetbv(0, eax, edx); - if ((eax & 0x6) == 0x6) { + xgetbv(0, xcr0_lo, xcr0_hi); + if ((xcr0_lo & 0x6) == 0x6) { rval |= AV_CPU_FLAG_AVX; if (ecx & 0x00001000) rval |= AV_CPU_FLAG_FMA3; @@ -143,8 +146,15 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 - if (ebx & 0x00000020) + if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020)) rval |= AV_CPU_FLAG_AVX2; +#if HAVE_AVX512 /* F, CD, BW, DQ, VL */ + if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ + if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000) + rval |= AV_CPU_FLAG_AVX512; + + } +#endif /* HAVE_AVX512 */ #endif /* HAVE_AVX2 */ /* BMI1/2 don't need OS support */ if (ebx & 0x00000008) { @@ -180,13 +190,11 @@ int ff_get_cpu_flags_x86(void) /* Similar to the above but for AVX functions on AMD processors. This is necessary only for functions using YMM registers on Bulldozer - based CPUs as they lack 256-bit execution units. SSE/AVX functions - using XMM registers are always faster on them. + and Jaguar based CPUs as they lack 256-bit execution units. SSE/AVX + functions using XMM registers are always faster on them. AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is - used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. - TODO: Confirm if Excavator is affected or not by this once it's - released, and update the check if necessary. Same for btver2. */ - if (family == 0x15 && (rval & AV_CPU_FLAG_AVX)) + used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */ + if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) rval |= AV_CPU_FLAG_AVXSLOW; } @@ -238,9 +246,27 @@ size_t ff_get_cpu_max_align_x86(void) { int flags = av_get_cpu_flags(); - if (flags & AV_CPU_FLAG_AVX) + if (flags & AV_CPU_FLAG_AVX512) + return 64; + if (flags & (AV_CPU_FLAG_AVX2 | + AV_CPU_FLAG_AVX | + AV_CPU_FLAG_XOP | + AV_CPU_FLAG_FMA4 | + AV_CPU_FLAG_FMA3 | + AV_CPU_FLAG_AVXSLOW)) return 32; - if (flags & AV_CPU_FLAG_SSE) + if (flags & (AV_CPU_FLAG_AESNI | + AV_CPU_FLAG_SSE42 | + AV_CPU_FLAG_SSE4 | + AV_CPU_FLAG_SSSE3 | + AV_CPU_FLAG_SSE3 | + AV_CPU_FLAG_SSE2 | + AV_CPU_FLAG_SSE | + AV_CPU_FLAG_ATOM | + AV_CPU_FLAG_SSSE3SLOW | + AV_CPU_FLAG_SSE3SLOW | + AV_CPU_FLAG_SSE2SLOW)) return 16; + return 8; } diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h index 6373e57db2..937c697fa0 100644 --- a/libavutil/x86/cpu.h +++ b/libavutil/x86/cpu.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -48,6 +48,8 @@ #define X86_FMA3(flags) CPUEXT(flags, FMA3) #define X86_FMA4(flags) CPUEXT(flags, FMA4) #define X86_AVX2(flags) CPUEXT(flags, AVX2) +#define X86_AESNI(flags) CPUEXT(flags, AESNI) +#define X86_AVX512(flags) CPUEXT(flags, AVX512) #define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW) #define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT) @@ -70,8 +72,14 @@ #define EXTERNAL_AVX_SLOW(flags) CPUEXT_SUFFIX_SLOW(flags, _EXTERNAL, AVX) #define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP) #define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3) +#define EXTERNAL_FMA3_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, FMA3, AVX) +#define EXTERNAL_FMA3_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, FMA3, AVX) #define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) #define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) +#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX) +#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX) +#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI) +#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512) #define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW) #define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT) @@ -96,6 +104,7 @@ #define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3) #define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4) #define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2) +#define INLINE_AESNI(flags) CPUEXT_SUFFIX(flags, _INLINE, AESNI) void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx); void ff_cpu_xgetbv(int op, int *eax, int *edx); diff --git a/libavutil/x86/cpuid.asm b/libavutil/x86/cpuid.asm index 1cb8e94ea3..c3f7866ec7 100644 --- a/libavutil/x86/cpuid.asm +++ b/libavutil/x86/cpuid.asm @@ -4,20 +4,20 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavutil/x86/emms.asm b/libavutil/x86/emms.asm index 05557de5da..8611762d73 100644 --- a/libavutil/x86/emms.asm +++ b/libavutil/x86/emms.asm @@ -1,20 +1,20 @@ ;***************************************************************************** ;* Copyright (C) 2013 Martin Storsjo ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavutil/x86/emms.h b/libavutil/x86/emms.h index 4b3ebbe7a4..c21e34b451 100644 --- a/libavutil/x86/emms.h +++ b/libavutil/x86/emms.h @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -21,6 +21,7 @@ #include "config.h" #include "libavutil/attributes.h" +#include "libavutil/cpu.h" void avpriv_emms_asm(void); @@ -30,10 +31,19 @@ void avpriv_emms_asm(void); * Empty mmx state. * this must be called between any dsp function and float/double code. * for example sin(); dsp->idct_put(); emms_c(); cos() + * Note, *alloc() and *free() also use float code in some libc implementations + * thus this also applies to them or any function using them. */ static av_always_inline void emms_c(void) { - __asm__ volatile ("emms" ::: "memory"); +/* Some inlined functions may also use mmx instructions regardless of + * runtime cpuflags. With that in mind, we unconditionally empty the + * mmx state if the target cpu chosen at configure time supports it. + */ +#if !defined(__MMX__) + if(av_get_cpu_flags() & AV_CPU_FLAG_MMX) +#endif + __asm__ volatile ("emms" ::: "memory"); } #elif HAVE_MMX && HAVE_MM_EMPTY # include <mmintrin.h> diff --git a/libavutil/x86/fixed_dsp.asm b/libavutil/x86/fixed_dsp.asm new file mode 100644 index 0000000000..979dd5c334 --- /dev/null +++ b/libavutil/x86/fixed_dsp.asm @@ -0,0 +1,48 @@ +;***************************************************************************** +;* x86-optimized Float DSP functions +;* +;* Copyright 2016 James Almer +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; void ff_butterflies_fixed(float *src0, float *src1, int len); +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal butterflies_fixed, 3,3,3, src0, src1, len + shl lend, 2 + add src0q, lenq + add src1q, lenq + neg lenq + +align 16 +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + mova m2, m0 + paddd m0, m1 + psubd m2, m1 + mova [src0q + lenq], m0 + mova [src1q + lenq], m2 + add lenq, mmsize + jl .loop + RET diff --git a/libavutil/x86/fixed_dsp_init.c b/libavutil/x86/fixed_dsp_init.c new file mode 100644 index 0000000000..303a2eb922 --- /dev/null +++ b/libavutil/x86/fixed_dsp_init.c @@ -0,0 +1,35 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/fixed_dsp.h" +#include "cpu.h" + +void ff_butterflies_fixed_sse2(int *src0, int *src1, int len); + +av_cold void ff_fixed_dsp_init_x86(AVFixedDSPContext *fdsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + fdsp->butterflies_fixed = ff_butterflies_fixed_sse2; + } +} diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index d96249978a..517fd63638 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -1,25 +1,30 @@ ;***************************************************************************** ;* x86-optimized Float DSP functions ;* -;* This file is part of Libav. +;* Copyright 2006 Loren Merritt ;* -;* Libav is free software; you can redistribute it and/or +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** %include "x86util.asm" +SECTION_RODATA 32 +pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 + SECTION .text ;----------------------------------------------------------------------------- @@ -48,8 +53,43 @@ ALIGN 16 INIT_XMM sse VECTOR_FMUL +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMUL +%endif + +;----------------------------------------------------------------------------- +; void vector_dmul(double *dst, const double *src0, const double *src1, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_DMUL 0 +cglobal vector_dmul, 4,4,4, dst, src0, src1, len + lea lend, [lenq*8 - mmsize*4] +ALIGN 16 +.loop: + movaps m0, [src0q + lenq + 0*mmsize] + movaps m1, [src0q + lenq + 1*mmsize] + movaps m2, [src0q + lenq + 2*mmsize] + movaps m3, [src0q + lenq + 3*mmsize] + mulpd m0, m0, [src1q + lenq + 0*mmsize] + mulpd m1, m1, [src1q + lenq + 1*mmsize] + mulpd m2, m2, [src1q + lenq + 2*mmsize] + mulpd m3, m3, [src1q + lenq + 3*mmsize] + movaps [dstq + lenq + 0*mmsize], m0 + movaps [dstq + lenq + 1*mmsize], m1 + movaps [dstq + lenq + 2*mmsize], m2 + movaps [dstq + lenq + 3*mmsize], m3 + + sub lenq, mmsize*4 + jge .loop + RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMUL +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMUL +%endif ;------------------------------------------------------------------------------ ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len) @@ -57,33 +97,48 @@ VECTOR_FMUL %macro VECTOR_FMAC_SCALAR 0 %if UNIX64 -cglobal vector_fmac_scalar, 3,3,3, dst, src, len +cglobal vector_fmac_scalar, 3,3,5, dst, src, len %else -cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len +cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len %endif %if ARCH_X86_32 VBROADCASTSS m0, mulm %else %if WIN64 - mova xmm0, xmm2 + SWAP 0, 2 %endif - shufps xmm0, xmm0, 0 + shufps xm0, xm0, 0 %if cpuflag(avx) - vinsertf128 m0, m0, xmm0, 1 + vinsertf128 m0, m0, xm0, 1 %endif %endif lea lenq, [lend*4-64] .loop: -%assign a 0 -%rep 32/mmsize - mulps m1, m0, [srcq+lenq+(a+0)*mmsize] - mulps m2, m0, [srcq+lenq+(a+1)*mmsize] - addps m1, m1, [dstq+lenq+(a+0)*mmsize] - addps m2, m2, [dstq+lenq+(a+1)*mmsize] - mova [dstq+lenq+(a+0)*mmsize], m1 - mova [dstq+lenq+(a+1)*mmsize], m2 -%assign a a+2 -%endrep +%if cpuflag(fma3) + mova m1, [dstq+lenq] + mova m2, [dstq+lenq+1*mmsize] + fmaddps m1, m0, [srcq+lenq], m1 + fmaddps m2, m0, [srcq+lenq+1*mmsize], m2 +%else ; cpuflag + mulps m1, m0, [srcq+lenq] + mulps m2, m0, [srcq+lenq+1*mmsize] +%if mmsize < 32 + mulps m3, m0, [srcq+lenq+2*mmsize] + mulps m4, m0, [srcq+lenq+3*mmsize] +%endif ; mmsize + addps m1, m1, [dstq+lenq] + addps m2, m2, [dstq+lenq+1*mmsize] +%if mmsize < 32 + addps m3, m3, [dstq+lenq+2*mmsize] + addps m4, m4, [dstq+lenq+3*mmsize] +%endif ; mmsize +%endif ; cpuflag + mova [dstq+lenq], m1 + mova [dstq+lenq+1*mmsize], m2 +%if mmsize < 32 + mova [dstq+lenq+2*mmsize], m3 + mova [dstq+lenq+3*mmsize], m4 +%endif ; mmsize sub lenq, 64 jge .loop REP_RET @@ -91,8 +146,14 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len INIT_XMM sse VECTOR_FMAC_SCALAR +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMAC_SCALAR +%endif ;------------------------------------------------------------------------------ ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len) @@ -124,6 +185,69 @@ INIT_XMM sse VECTOR_FMUL_SCALAR ;------------------------------------------------------------------------------ +; void ff_vector_dmac_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMAC_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr + mov lenq, lenaddrm + VBROADCASTSD m0, mulm +%else +%if UNIX64 +cglobal vector_dmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-mmsize*4] +.loop: +%if cpuflag(fma3) + movaps m1, [dstq+lenq] + movaps m2, [dstq+lenq+1*mmsize] + movaps m3, [dstq+lenq+2*mmsize] + movaps m4, [dstq+lenq+3*mmsize] + fmaddpd m1, m0, [srcq+lenq], m1 + fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2 + fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3 + fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4 +%else ; cpuflag + mulpd m1, m0, [srcq+lenq] + mulpd m2, m0, [srcq+lenq+1*mmsize] + mulpd m3, m0, [srcq+lenq+2*mmsize] + mulpd m4, m0, [srcq+lenq+3*mmsize] + addpd m1, m1, [dstq+lenq] + addpd m2, m2, [dstq+lenq+1*mmsize] + addpd m3, m3, [dstq+lenq+2*mmsize] + addpd m4, m4, [dstq+lenq+3*mmsize] +%endif ; cpuflag + movaps [dstq+lenq], m1 + movaps [dstq+lenq+1*mmsize], m2 + movaps [dstq+lenq+2*mmsize], m3 + movaps [dstq+lenq+3*mmsize], m4 + sub lenq, mmsize*4 + jge .loop + REP_RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_DMAC_SCALAR +%endif + +;------------------------------------------------------------------------------ ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, ; int len) ;------------------------------------------------------------------------------ @@ -141,24 +265,19 @@ cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len VBROADCASTSD m0, mulm %else %if WIN64 - movlhps xmm2, xmm2 -%if cpuflag(avx) - vinsertf128 ymm2, ymm2, xmm2, 1 -%endif SWAP 0, 2 -%else - movlhps xmm0, xmm0 -%if cpuflag(avx) - vinsertf128 ymm0, ymm0, xmm0, 1 %endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 ym0, ym0, xm0, 1 %endif %endif lea lenq, [lend*8-2*mmsize] .loop: mulpd m1, m0, [srcq+lenq ] mulpd m2, m0, [srcq+lenq+mmsize] - mova [dstq+lenq ], m1 - mova [dstq+lenq+mmsize], m2 + movaps [dstq+lenq ], m1 + movaps [dstq+lenq+mmsize], m2 sub lenq, 2*mmsize jge .loop REP_RET @@ -172,20 +291,85 @@ VECTOR_DMUL_SCALAR %endif ;----------------------------------------------------------------------------- +; vector_fmul_window(float *dst, const float *src0, +; const float *src1, const float *win, int len); +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_WINDOW 0 +cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1 + shl lend, 2 + lea len1q, [lenq - mmsize] + add src0q, lenq + add dstq, lenq + add winq, lenq + neg lenq +.loop: + mova m0, [winq + lenq] + mova m4, [src0q + lenq] +%if cpuflag(sse) + mova m1, [winq + len1q] + mova m5, [src1q + len1q] + shufps m1, m1, 0x1b + shufps m5, m5, 0x1b + mova m2, m0 + mova m3, m1 + mulps m2, m4 + mulps m3, m5 + mulps m1, m4 + mulps m0, m5 + addps m2, m3 + subps m1, m0 + shufps m2, m2, 0x1b +%else + pswapd m1, [winq + len1q] + pswapd m5, [src1q + len1q] + mova m2, m0 + mova m3, m1 + pfmul m2, m4 + pfmul m3, m5 + pfmul m1, m4 + pfmul m0, m5 + pfadd m2, m3 + pfsub m1, m0 + pswapd m2, m2 +%endif + mova [dstq + lenq], m1 + mova [dstq + len1q], m2 + sub len1q, mmsize + add lenq, mmsize + jl .loop +%if mmsize == 8 + femms +%endif + REP_RET +%endmacro + +INIT_MMX 3dnowext +VECTOR_FMUL_WINDOW +INIT_XMM sse +VECTOR_FMUL_WINDOW + +;----------------------------------------------------------------------------- ; vector_fmul_add(float *dst, const float *src0, const float *src1, ; const float *src2, int len) ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_ADD 0 -cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len +cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: mova m0, [src0q + lenq] mova m1, [src0q + lenq + mmsize] +%if cpuflag(fma3) + mova m2, [src2q + lenq] + mova m3, [src2q + lenq + mmsize] + fmaddps m0, m0, [src1q + lenq], m2 + fmaddps m1, m1, [src1q + lenq + mmsize], m3 +%else mulps m0, m0, [src1q + lenq] mulps m1, m1, [src1q + lenq + mmsize] addps m0, m0, [src2q + lenq] addps m1, m1, [src2q + lenq + mmsize] +%endif mova [dstq + lenq], m0 mova [dstq + lenq + mmsize], m1 @@ -196,8 +380,14 @@ ALIGN 16 INIT_XMM sse VECTOR_FMUL_ADD +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMUL_ADD +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_FMUL_ADD +%endif ;----------------------------------------------------------------------------- ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, @@ -205,10 +395,16 @@ VECTOR_FMUL_ADD ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_REVERSE 0 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len +%if cpuflag(avx2) + movaps m2, [pd_reverse] +%endif lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: -%if cpuflag(avx) +%if cpuflag(avx2) + vpermps m0, m2, [src1q] + vpermps m1, m2, [src1q+mmsize] +%elif cpuflag(avx) vmovaps xmm0, [src1q + 16] vinsertf128 m0, m0, [src1q], 1 vshufps m0, m0, m0, q0123 @@ -223,8 +419,8 @@ ALIGN 16 %endif mulps m0, m0, [src0q + lenq + mmsize] mulps m1, m1, [src0q + lenq] - mova [dstq + lenq + mmsize], m0 - mova [dstq + lenq], m1 + movaps [dstq + lenq + mmsize], m0 + movaps [dstq + lenq], m1 add src1q, 2*mmsize sub lenq, 2*mmsize jge .loop @@ -233,16 +429,22 @@ ALIGN 16 INIT_XMM sse VECTOR_FMUL_REVERSE +%if HAVE_AVX_EXTERNAL INIT_YMM avx VECTOR_FMUL_REVERSE +%endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VECTOR_FMUL_REVERSE +%endif ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) INIT_XMM sse cglobal scalarproduct_float, 3,3,2, v1, v2, offset + shl offsetd, 2 + add v1q, offsetq + add v2q, offsetq neg offsetq - shl offsetq, 2 - sub v1q, offsetq - sub v2q, offsetq xorps xmm0, xmm0 .loop: movaps xmm1, [v1q+offsetq] @@ -266,14 +468,9 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset ;----------------------------------------------------------------------------- INIT_XMM sse cglobal butterflies_float, 3,3,3, src0, src1, len -%if ARCH_X86_64 - movsxd lenq, lend -%endif - test lenq, lenq - jz .end - shl lenq, 2 - lea src0q, [src0q + lenq] - lea src1q, [src1q + lenq] + shl lend, 2 + add src0q, lenq + add src1q, lenq neg lenq .loop: mova m0, [src0q + lenq] @@ -284,5 +481,4 @@ cglobal butterflies_float, 3,3,3, src0, src1, len mova [src0q + lenq], m0 add lenq, mmsize jl .loop -.end: REP_RET diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index dcbe3f1f3c..8826e4e2c9 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -29,128 +29,93 @@ void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1, void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1, int len); +void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1, + int len); +void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1, + int len); + void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul, int len); void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul, int len); +void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul, + int len); void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul, int len); +void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul, + int len); + void ff_vector_dmul_scalar_sse2(double *dst, const double *src, double mul, int len); void ff_vector_dmul_scalar_avx(double *dst, const double *src, double mul, int len); +void ff_vector_fmul_window_3dnowext(float *dst, const float *src0, + const float *src1, const float *win, int len); +void ff_vector_fmul_window_sse(float *dst, const float *src0, + const float *src1, const float *win, int len); + void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1, const float *src2, int len); void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_add_fma3(float *dst, const float *src0, const float *src1, + const float *src2, int len); void ff_vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_reverse_avx(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, + const float *src1, int len); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); -void ff_butterflies_float_sse(float *restrict src0, float *restrict src1, int len); - -#if HAVE_6REGS && HAVE_INLINE_ASM -static void vector_fmul_window_3dnowext(float *dst, const float *src0, - const float *src1, const float *win, - int len) -{ - x86_reg i = -len * 4; - x86_reg j = len * 4 - 8; - __asm__ volatile ( - "1: \n" - "pswapd (%5, %1), %%mm1 \n" - "movq (%5, %0), %%mm0 \n" - "pswapd (%4, %1), %%mm5 \n" - "movq (%3, %0), %%mm4 \n" - "movq %%mm0, %%mm2 \n" - "movq %%mm1, %%mm3 \n" - "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i] - "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j] - "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j] - "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i] - "pfadd %%mm3, %%mm2 \n" - "pfsub %%mm0, %%mm1 \n" - "pswapd %%mm2, %%mm2 \n" - "movq %%mm1, (%2, %0) \n" - "movq %%mm2, (%2, %1) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - "femms \n" - : "+r"(i), "+r"(j) - : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) - ); -} - -static void vector_fmul_window_sse(float *dst, const float *src0, - const float *src1, const float *win, int len) -{ - x86_reg i = -len * 4; - x86_reg j = len * 4 - 16; - __asm__ volatile ( - "1: \n" - "movaps (%5, %1), %%xmm1 \n" - "movaps (%5, %0), %%xmm0 \n" - "movaps (%4, %1), %%xmm5 \n" - "movaps (%3, %0), %%xmm4 \n" - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "shufps $0x1b, %%xmm5, %%xmm5 \n" - "movaps %%xmm0, %%xmm2 \n" - "movaps %%xmm1, %%xmm3 \n" - "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i] - "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j] - "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j] - "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i] - "addps %%xmm3, %%xmm2 \n" - "subps %%xmm0, %%xmm1 \n" - "shufps $0x1b, %%xmm2, %%xmm2 \n" - "movaps %%xmm1, (%2, %0) \n" - "movaps %%xmm2, (%2, %1) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - : "+r"(i), "+r"(j) - : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) - ); -} -#endif /* HAVE_6REGS && HAVE_INLINE_ASM */ +void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len); av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) { int cpu_flags = av_get_cpu_flags(); -#if HAVE_6REGS && HAVE_INLINE_ASM - if (INLINE_AMD3DNOWEXT(cpu_flags)) { - fdsp->vector_fmul_window = vector_fmul_window_3dnowext; + if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { + fdsp->vector_fmul_window = ff_vector_fmul_window_3dnowext; } - if (INLINE_SSE(cpu_flags)) { - fdsp->vector_fmul_window = vector_fmul_window_sse; - } -#endif if (EXTERNAL_SSE(cpu_flags)) { fdsp->vector_fmul = ff_vector_fmul_sse; fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse; + fdsp->vector_fmul_window = ff_vector_fmul_window_sse; fdsp->vector_fmul_add = ff_vector_fmul_add_sse; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse; fdsp->scalarproduct_float = ff_scalarproduct_float_sse; fdsp->butterflies_float = ff_butterflies_float_sse; } if (EXTERNAL_SSE2(cpu_flags)) { + fdsp->vector_dmul = ff_vector_dmul_sse2; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; } if (EXTERNAL_AVX_FAST(cpu_flags)) { fdsp->vector_fmul = ff_vector_fmul_avx; + fdsp->vector_dmul = ff_vector_dmul_avx; fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx; fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2; + } + if (EXTERNAL_FMA3_FAST(cpu_flags)) { + fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; + fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3; + } } diff --git a/libavutil/x86/imgutils.asm b/libavutil/x86/imgutils.asm index be7a47f6d0..3cca56cdca 100644 --- a/libavutil/x86/imgutils.asm +++ b/libavutil/x86/imgutils.asm @@ -1,20 +1,20 @@ ;***************************************************************************** ;* Copyright 2016 Anton Khirnov ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavutil/x86/imgutils_init.c b/libavutil/x86/imgutils_init.c index 20ee9abaa2..4ea398205e 100644 --- a/libavutil/x86/imgutils_init.c +++ b/libavutil/x86/imgutils_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavutil/x86/intmath.h b/libavutil/x86/intmath.h new file mode 100644 index 0000000000..40743fd13e --- /dev/null +++ b/libavutil/x86/intmath.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2015 James Almer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_INTMATH_H +#define AVUTIL_X86_INTMATH_H + +#include <stdint.h> +#include <stdlib.h> +#if HAVE_FAST_CLZ +#if defined(_MSC_VER) +#include <intrin.h> +#elif defined(__INTEL_COMPILER) +#include <immintrin.h> +#endif +#endif +#include "config.h" + +#if HAVE_FAST_CLZ +#if (defined(__INTEL_COMPILER) && (__INTEL_COMPILER>=1216)) || defined(_MSC_VER) +# if defined(__INTEL_COMPILER) +# define ff_log2(x) (_bit_scan_reverse((x)|1)) +# else +# define ff_log2 ff_log2_x86 +static av_always_inline av_const int ff_log2_x86(unsigned int v) +{ + unsigned long n; + _BitScanReverse(&n, v|1); + return n; +} +# endif +# define ff_log2_16bit av_log2 + +#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700) && \ + (defined(__BMI__) || !defined(__clang__))) +# define ff_ctz(v) _tzcnt_u32(v) + +# if ARCH_X86_64 +# define ff_ctzll(v) _tzcnt_u64(v) +# else +# define ff_ctzll ff_ctzll_x86 +static av_always_inline av_const int ff_ctzll_x86(long long v) +{ + return ((uint32_t)v == 0) ? _tzcnt_u32((uint32_t)(v >> 32)) + 32 : _tzcnt_u32((uint32_t)v); +} +# endif +#endif /* _MSC_VER */ + +#endif /* __INTEL_COMPILER */ + +#endif /* HAVE_FAST_CLZ */ + +#if defined(__GNUC__) + +/* Our generic version of av_popcount is faster than GCC's built-in on + * CPUs that don't support the popcnt instruction. + */ +#if defined(__POPCNT__) + #define av_popcount __builtin_popcount +#if ARCH_X86_64 + #define av_popcount64 __builtin_popcountll +#endif + +#endif /* __POPCNT__ */ + +#if defined(__BMI2__) + +#if AV_GCC_VERSION_AT_LEAST(5,1) +#define av_mod_uintp2 __builtin_ia32_bzhi_si +#elif HAVE_INLINE_ASM +/* GCC releases before 5.1.0 have a broken bzhi builtin, so for those we + * implement it using inline assembly + */ +#define av_mod_uintp2 av_mod_uintp2_bmi2 +static av_always_inline av_const unsigned av_mod_uintp2_bmi2(unsigned a, unsigned p) +{ + if (av_builtin_constant_p(p)) + return a & ((1 << p) - 1); + else { + unsigned x; + __asm__ ("bzhi %2, %1, %0 \n\t" : "=r"(x) : "rm"(a), "r"(p)); + return x; + } +} +#endif /* AV_GCC_VERSION_AT_LEAST */ + +#endif /* __BMI2__ */ + +#if defined(__SSE2__) && !defined(__INTEL_COMPILER) + +#define av_clipd av_clipd_sse2 +static av_always_inline av_const double av_clipd_sse2(double a, double amin, double amax) +{ +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2 + if (amin > amax) abort(); +#endif + __asm__ ("minsd %2, %0 \n\t" + "maxsd %1, %0 \n\t" + : "+&x"(a) : "xm"(amin), "xm"(amax)); + return a; +} + +#endif /* __SSE2__ */ + +#if defined(__SSE__) && !defined(__INTEL_COMPILER) + +#define av_clipf av_clipf_sse +static av_always_inline av_const float av_clipf_sse(float a, float amin, float amax) +{ +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2 + if (amin > amax) abort(); +#endif + __asm__ ("minss %2, %0 \n\t" + "maxss %1, %0 \n\t" + : "+&x"(a) : "xm"(amin), "xm"(amax)); + return a; +} + +#endif /* __SSE__ */ + +#endif /* __GNUC__ */ + +#endif /* AVUTIL_X86_INTMATH_H */ diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h index 635096e569..4061d19231 100644 --- a/libavutil/x86/intreadwrite.h +++ b/libavutil/x86/intreadwrite.h @@ -1,20 +1,20 @@ /* * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm index eab85ed050..317fba6fca 100644 --- a/libavutil/x86/lls.asm +++ b/libavutil/x86/lls.asm @@ -3,20 +3,20 @@ ;* ;* Copyright (c) 2013 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -125,7 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 .ret: REP_RET -INIT_YMM avx +%macro UPDATE_LLS 0 cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 %define covarq ctxq mov countd, [ctxq + LLSModel.indep_count] @@ -139,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 vbroadcastsd ymm6, [varq + iq*8 + 16] vbroadcastsd ymm7, [varq + iq*8 + 24] vextractf128 xmm3, ymm1, 1 +%if cpuflag(fma3) + mova ymm0, COVAR(iq ,0) + mova xmm2, COVAR(iq+2,2) + fmaddpd ymm0, ymm1, ymm4, ymm0 + fmaddpd xmm2, xmm3, xmm6, xmm2 + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) + mova COVAR(iq ,0), ymm0 + mova COVAR(iq ,1), ymm1 + mova COVAR(iq+2,2), xmm2 + mova COVAR(iq+2,3), xmm3 +%else vmulpd ymm0, ymm1, ymm4 vmulpd ymm1, ymm1, ymm5 vmulpd xmm2, xmm3, xmm6 @@ -147,12 +159,26 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 ADDPD_MEM COVAR(iq ,1), ymm1 ADDPD_MEM COVAR(iq+2,2), xmm2 ADDPD_MEM COVAR(iq+2,3), xmm3 +%endif ; cpuflag(fma3) lea jd, [iq + 4] cmp jd, count2d jg .skip4x4 .loop4x4: ; Compute all 16 pairwise products of a 4x4 block mova ymm3, [varq + jq*8] +%if cpuflag(fma3) + mova ymm0, COVAR(jq, 0) + mova ymm1, COVAR(jq, 1) + mova ymm2, COVAR(jq, 2) + fmaddpd ymm0, ymm3, ymm4, ymm0 + fmaddpd ymm1, ymm3, ymm5, ymm1 + fmaddpd ymm2, ymm3, ymm6, ymm2 + fmaddpd ymm3, ymm7, ymm3, COVAR(jq,3) + mova COVAR(jq, 0), ymm0 + mova COVAR(jq, 1), ymm1 + mova COVAR(jq, 2), ymm2 + mova COVAR(jq, 3), ymm3 +%else vmulpd ymm0, ymm3, ymm4 vmulpd ymm1, ymm3, ymm5 vmulpd ymm2, ymm3, ymm6 @@ -161,6 +187,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 ADDPD_MEM COVAR(jq,1), ymm1 ADDPD_MEM COVAR(jq,2), ymm2 ADDPD_MEM COVAR(jq,3), ymm3 +%endif ; cpuflag(fma3) add jd, 4 cmp jd, count2d jle .loop4x4 @@ -168,6 +195,19 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 cmp jd, countd jg .skip2x4 mova xmm3, [varq + jq*8] +%if cpuflag(fma3) + mova xmm0, COVAR(jq, 0) + mova xmm1, COVAR(jq, 1) + mova xmm2, COVAR(jq, 2) + fmaddpd xmm0, xmm3, xmm4, xmm0 + fmaddpd xmm1, xmm3, xmm5, xmm1 + fmaddpd xmm2, xmm3, xmm6, xmm2 + fmaddpd xmm3, xmm7, xmm3, COVAR(jq,3) + mova COVAR(jq, 0), xmm0 + mova COVAR(jq, 1), xmm1 + mova COVAR(jq, 2), xmm2 + mova COVAR(jq, 3), xmm3 +%else vmulpd xmm0, xmm3, xmm4 vmulpd xmm1, xmm3, xmm5 vmulpd xmm2, xmm3, xmm6 @@ -176,6 +216,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 ADDPD_MEM COVAR(jq,1), xmm1 ADDPD_MEM COVAR(jq,2), xmm2 ADDPD_MEM COVAR(jq,3), xmm3 +%endif ; cpuflag(fma3) .skip2x4: add id, 4 add covarq, 4*COVAR_STRIDE @@ -186,15 +227,30 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 mov jd, id .loop2x1: vmovddup xmm0, [varq + iq*8] +%if cpuflag(fma3) + mova xmm1, [varq + jq*8] + fmaddpd xmm0, xmm1, xmm0, COVAR(jq,0) + mova COVAR(jq,0), xmm0 +%else vmulpd xmm0, [varq + jq*8] ADDPD_MEM COVAR(jq,0), xmm0 +%endif ; cpuflag(fma3) inc id add covarq, COVAR_STRIDE cmp id, countd jle .loop2x1 .ret: REP_RET +%endmacro ; UPDATE_LLS +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +UPDATE_LLS +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +UPDATE_LLS +%endif INIT_XMM sse2 cglobal evaluate_lls, 3,4,2, ctx, var, order, i diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c index 80cda29139..1c5dca42dc 100644 --- a/libavutil/x86/lls_init.c +++ b/libavutil/x86/lls_init.c @@ -3,29 +3,30 @@ * * Copyright (c) 2013 Loren Merritt * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/lls.h" #include "libavutil/x86/cpu.h" -void ff_update_lls_sse2(LLSModel *m, double *var); -void ff_update_lls_avx(LLSModel *m, double *var); -double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order); +void ff_update_lls_sse2(LLSModel *m, const double *var); +void ff_update_lls_avx(LLSModel *m, const double *var); +void ff_update_lls_fma3(LLSModel *m, const double *var); +double ff_evaluate_lls_sse2(LLSModel *m, const double *var, int order); av_cold void ff_init_lls_x86(LLSModel *m) { @@ -38,4 +39,7 @@ av_cold void ff_init_lls_x86(LLSModel *m) if (EXTERNAL_AVX_FAST(cpu_flags)) { m->update_lls = ff_update_lls_avx; } + if (EXTERNAL_FMA3_FAST(cpu_flags)) { + m->update_lls = ff_update_lls_fma3; + } } diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm new file mode 100644 index 0000000000..36c57c5f7f --- /dev/null +++ b/libavutil/x86/pixelutils.asm @@ -0,0 +1,386 @@ +;****************************************************************************** +;* Pixel utilities SIMD +;* +;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (C) 2014 Clément Bœsch <u pkh me> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86util.asm" + +SECTION .text + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_MMX mmx +cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2 + pxor m7, m7 + pxor m6, m6 +%rep 4 + mova m0, [src1q] + mova m2, [src1q + stride1q] + mova m1, [src2q] + mova m3, [src2q + stride2q] + psubusb m4, m0, m1 + psubusb m5, m2, m3 + psubusb m1, m0 + psubusb m3, m2 + por m1, m4 + por m3, m5 + punpcklbw m0, m1, m7 + punpcklbw m2, m3, m7 + punpckhbw m1, m7 + punpckhbw m3, m7 + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + paddw m6, m0 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] +%endrep + psrlq m0, m6, 32 + paddw m6, m0 + psrlq m0, m6, 16 + paddw m6, m0 + movd eax, m6 + movzx eax, ax + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2 + pxor m2, m2 +%rep 4 + mova m0, [src1q] + mova m1, [src1q + stride1q] + psadbw m0, [src2q] + psadbw m1, [src2q + stride2q] + paddw m2, m0 + paddw m2, m1 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] +%endrep + movd eax, m2 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_MMX mmxext +cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2 + pxor m2, m2 +%rep 16 + mova m0, [src1q] + mova m1, [src1q + 8] + psadbw m0, [src2q] + psadbw m1, [src2q + 8] + paddw m2, m0 + paddw m2, m1 + add src1q, stride1q + add src2q, stride2q +%endrep + movd eax, m2 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2 + movu m4, [src1q] + movu m2, [src2q] + movu m1, [src1q + stride1q] + movu m3, [src2q + stride2q] + psadbw m4, m2 + psadbw m1, m3 + paddw m4, m1 +%rep 7 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] + movu m0, [src1q] + movu m2, [src2q] + movu m1, [src1q + stride1q] + movu m3, [src2q + stride2q] + psadbw m0, m2 + psadbw m1, m3 + paddw m4, m0 + paddw m4, m1 +%endrep + movhlps m0, m4 + paddw m4, m0 + movd eax, m4 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +%macro SAD_XMM_16x16 1 +INIT_XMM sse2 +cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2 + mov%1 m2, [src2q] + psadbw m2, [src1q] + mov%1 m1, [src2q + stride2q] + psadbw m1, [src1q + stride1q] + paddw m2, m1 +%rep 7 + lea src1q, [src1q + 2*stride1q] + lea src2q, [src2q + 2*stride2q] + mov%1 m0, [src2q] + psadbw m0, [src1q] + mov%1 m1, [src2q + stride2q] + psadbw m1, [src1q + stride1q] + paddw m2, m0 + paddw m2, m1 +%endrep + movhlps m0, m2 + paddw m2, m0 + movd eax, m2 + RET +%endmacro + +SAD_XMM_16x16 a +SAD_XMM_16x16 u + + +%macro PROCESS_SAD_32x4_U 0 + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r0] + movu m4, [r0 + 16] + psadbw m1, m3 + psadbw m2, m4 + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endmacro + +%macro PROCESS_SAD_32x4 1 + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + + mov%1 m1, [r2] + mov%1 m2, [r2 + 16] + psadbw m1, [r0] + psadbw m2, [r0 + 16] + paddd m1, m2 + paddd m0, m1 + lea r2, [r2 + r3] + lea r0, [r0 + r1] +%endmacro + +;----------------------------------------------------------------------------- +; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_32x4_U + PROCESS_SAD_32x4_U + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +%macro SAD_XMM_32x32 1 +INIT_XMM sse2 +cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 4 +.loop: + PROCESS_SAD_32x4 %1 + PROCESS_SAD_32x4 %1 + dec r4d + jnz .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +SAD_XMM_32x32 a +SAD_XMM_32x32 u + +%if HAVE_AVX2_EXTERNAL +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 32/4 + lea r5, [stride1q * 3] + lea r6, [stride2q * 3] + +.loop: + movu m1, [src1q] ; row 0 of pix0 + movu m2, [src2q] ; row 0 of pix1 + movu m3, [src1q + stride1q] ; row 1 of pix0 + movu m4, [src2q + stride2q] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [src1q + 2 * stride1q] ; row 2 of pix0 + movu m2, [src2q + 2 * stride2q] ; row 2 of pix1 + movu m3, [src1q + r5] ; row 3 of pix0 + movu m4, [src2q + r6] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + lea src2q, [src2q + 4 * stride2q] + lea src1q, [src1q + 4 * stride1q] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0, xm1 + movd eax, xm0 + RET + +;------------------------------------------------------------------------------- +; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, +; const uint8_t *src2, ptrdiff_t stride2); +;------------------------------------------------------------------------------- +%macro SAD_AVX2_32x32 1 +INIT_YMM avx2 +cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2 + pxor m0, m0 + mov r4d, 32/4 + lea r5, [stride1q * 3] + lea r6, [stride2q * 3] + +.loop: + mov%1 m1, [src2q] ; row 0 of pix1 + psadbw m1, [src1q] + mov%1 m2, [src2q + stride2q] ; row 1 of pix1 + psadbw m2, [src1q + stride1q] + + paddd m0, m1 + paddd m0, m2 + + mov%1 m1, [src2q + 2 * stride2q] ; row 2 of pix1 + psadbw m1, [src1q + 2 * stride1q] + mov%1 m2, [src2q + r6] ; row 3 of pix1 + psadbw m2, [src1q + r5] + + paddd m0, m1 + paddd m0, m2 + + lea src2q, [src2q + 4 * stride2q] + lea src1q, [src1q + 4 * stride1q] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0, xm1 + movd eax, xm0 + RET +%endmacro + +SAD_AVX2_32x32 a +SAD_AVX2_32x32 u +%endif diff --git a/libavutil/x86/pixelutils.h b/libavutil/x86/pixelutils.h new file mode 100644 index 0000000000..876cf46053 --- /dev/null +++ b/libavutil/x86/pixelutils.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_X86_PIXELUTILS_H +#define AVUTIL_X86_PIXELUTILS_H + +#include "libavutil/pixelutils.h" + +void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned); + +#endif /* AVUTIL_X86_PIXELUTILS_H */ diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c new file mode 100644 index 0000000000..184a3a4a9f --- /dev/null +++ b/libavutil/x86/pixelutils_init.c @@ -0,0 +1,94 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "pixelutils.h" +#include "cpu.h" + +int ff_pixelutils_sad_8x8_mmx(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_16x16_mmxext(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + sad[2] = ff_pixelutils_sad_8x8_mmx; + } + + // The best way to use SSE2 would be to do 2 SADs in parallel, + // but we'd have to modify the pixelutils API to return SIMD functions. + + // It's probably not faster to shuffle data around + // to get two lines of 8 pixels into a single 16byte register, + // so just use the MMX 8x8 version even when SSE2 is available. + if (EXTERNAL_MMXEXT(cpu_flags)) { + sad[2] = ff_pixelutils_sad_8x8_mmxext; + sad[3] = ff_pixelutils_sad_16x16_mmxext; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (aligned) { + case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned + case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned + case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned + } + } + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (aligned) { + case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned + case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned + case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned + } + } + + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + switch (aligned) { + case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned + case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned + case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned + } + } +} diff --git a/libavutil/x86/timer.h b/libavutil/x86/timer.h index bb7c341341..4d1e88def0 100644 --- a/libavutil/x86/timer.h +++ b/libavutil/x86/timer.h @@ -1,20 +1,20 @@ /* * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,6 +25,7 @@ #if HAVE_INLINE_ASM +#define FF_TIMER_UNITS "decicycles" #define AV_READ_TIME read_time static inline uint64_t read_time(void) diff --git a/libavutil/x86/w64xmmtest.h b/libavutil/x86/w64xmmtest.h index b4ce7d3daf..a4a05b0419 100644 --- a/libavutil/x86/w64xmmtest.h +++ b/libavutil/x86/w64xmmtest.h @@ -2,23 +2,26 @@ * check XMM registers for clobbers on Win64 * Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef AVUTIL_X86_W64XMMTEST_H +#define AVUTIL_X86_W64XMMTEST_H + #include <inttypes.h> #include <stdint.h> #include <stdlib.h> @@ -71,3 +74,5 @@ int __real_ ## func; \ int __wrap_ ## func; \ int __wrap_ ## func + +#endif /* AVUTIL_X86_W64XMMTEST_H */ diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index e04dbfedf3..5044ee86f0 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -1,12 +1,12 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2016 x264 project +;* Copyright (C) 2005-2018 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Henrik Gramner <henrik@gramner.com> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Fiona Glaser <fiona@x264.com> -;* Henrik Gramner <henrik@gramner.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -88,6 +88,12 @@ %macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,aout SECTION .text + %elifidn __OUTPUT_FORMAT__,coff + SECTION .text + %elifidn __OUTPUT_FORMAT__,win32 + SECTION .rdata align=%1 + %elif WIN64 + SECTION .rdata align=%1 %else SECTION .rodata align=%1 %endif @@ -335,6 +341,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 @@ -426,10 +434,10 @@ DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 @@ -448,15 +456,16 @@ DECLARE_REG 14, R15, 120 %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 + %if xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif - %if xmm_regs_used > 7 + %if xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif - %if xmm_regs_used > 8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 %assign %%i 8 - %rep xmm_regs_used-8 + %rep %%xmm_regs_on_stack movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep @@ -465,53 +474,56 @@ DECLARE_REG 14, R15, 120 %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. - %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro -%macro WIN64_RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else - add %1, stack_size_padded + add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif - %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif - %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -530,14 +542,15 @@ DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 + %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 @@ -547,7 +560,7 @@ DECLARE_REG 14, R15, 72 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -558,7 +571,7 @@ DECLARE_REG 14, R15, 72 %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -603,7 +616,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -614,7 +627,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif %endif POP_IF_USED 6, 5, 4, 3 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -625,7 +638,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %endmacro - %macro WIN64_RESTORE_XMM 1 + %macro WIN64_RESTORE_XMM 0 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro @@ -636,7 +649,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 - %if has_epilogue + %if has_epilogue || cpuflag(ssse3) RET %else rep ret @@ -724,12 +737,22 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif %endmacro +; Create a global symbol from a local label with the correct name mangling and type +%macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function hidden + %else + global current_function %+ %1 + %endif + %1: +%endmacro + %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 @@ -787,24 +810,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_avx (1<<13)| cpuflags_sse42 +%assign cpuflags_xop (1<<14)| cpuflags_avx +%assign cpuflags_fma4 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<16)| cpuflags_avx +%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL + +%assign cpuflags_cache32 (1<<21) +%assign cpuflags_cache64 (1<<22) +%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<24) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -852,11 +876,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %endmacro -; Merge mmx and sse* +; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 @@ -866,69 +891,99 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %undef %1%2 %endmacro +%macro DEFINE_MMREGS 1 ; mmtype + %assign %%prev_mmregs 0 + %ifdef num_mmregs + %assign %%prev_mmregs num_mmregs + %endif + + %assign num_mmregs 8 + %if ARCH_X86_64 && mmsize >= 16 + %assign num_mmregs 16 + %if cpuflag(avx512) || mmsize == 64 + %assign num_mmregs 32 + %endif + %endif + + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1 %+ %%i + CAT_XDEFINE nn%1, %%i, %%i + %assign %%i %%i+1 + %endrep + %if %%prev_mmregs > num_mmregs + %rep %%prev_mmregs - num_mmregs + CAT_UNDEF m, %%i + CAT_UNDEF nn %+ mmtype, %%i + %assign %%i %%i+1 + %endrep + %endif + %xdefine mmtype %1 +%endmacro + +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 - %define num_mmregs 8 %define mova movq %define movu movq %define movh movd %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nnmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nnmm, %%i - %assign %%i %%i+1 - %endrep INIT_CPUFLAGS %1 + DEFINE_MMREGS mm %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nnxmm, %%i, %%i - %assign %%i %%i+1 - %endrep INIT_CPUFLAGS %1 + DEFINE_MMREGS xmm + %if WIN64 + AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers + %endif %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nnymm, %%i, %%i - %assign %%i %%i+1 - %endrep INIT_CPUFLAGS %1 + DEFINE_MMREGS ymm + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS zmm + AVX512_MM_PERMUTATION %endmacro INIT_XMM @@ -937,18 +992,26 @@ INIT_XMM %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 %endmacro %assign i 0 -%rep 16 +%rep 32 DECLARE_MMCAST i %assign i i+1 %endrep @@ -1036,7 +1099,11 @@ INIT_XMM ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 - call_internal %1 %+ SUFFIX, %1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif %endmacro %macro call_internal 2 %xdefine %%i %2 @@ -1079,12 +1146,17 @@ INIT_XMM ;============================================================================= %assign i 0 -%rep 16 +%rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i @@ -1201,7 +1273,7 @@ INIT_XMM %endmacro %endmacro -; Instructions with both VEX and non-VEX encodings +; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 @@ -1209,12 +1281,12 @@ AVX_INSTR addsd, sse2, 1, 0, 0 AVX_INSTR addss, sse, 1, 0, 0 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 -AVX_INSTR aesdec, fnord, 0, 0, 0 -AVX_INSTR aesdeclast, fnord, 0, 0, 0 -AVX_INSTR aesenc, fnord, 0, 0, 0 -AVX_INSTR aesenclast, fnord, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist +AVX_INSTR aesdec, aesni, 0, 0, 0 +AVX_INSTR aesdeclast, aesni, 0, 0, 0 +AVX_INSTR aesenc, aesni, 0, 0, 0 +AVX_INSTR aesenclast, aesni, 0, 0, 0 +AVX_INSTR aesimc, aesni +AVX_INSTR aeskeygenassist, aesni AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 @@ -1223,10 +1295,42 @@ AVX_INSTR blendpd, sse4, 1, 1, 0 AVX_INSTR blendps, sse4, 1, 1, 0 AVX_INSTR blendvpd, sse4 ; can't be emulated AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmpeqpd, sse2, 1, 0, 1 +AVX_INSTR cmpeqps, sse, 1, 0, 1 +AVX_INSTR cmpeqsd, sse2, 1, 0, 0 +AVX_INSTR cmpeqss, sse, 1, 0, 0 +AVX_INSTR cmplepd, sse2, 1, 0, 0 +AVX_INSTR cmpleps, sse, 1, 0, 0 +AVX_INSTR cmplesd, sse2, 1, 0, 0 +AVX_INSTR cmpless, sse, 1, 0, 0 +AVX_INSTR cmpltpd, sse2, 1, 0, 0 +AVX_INSTR cmpltps, sse, 1, 0, 0 +AVX_INSTR cmpltsd, sse2, 1, 0, 0 +AVX_INSTR cmpltss, sse, 1, 0, 0 +AVX_INSTR cmpneqpd, sse2, 1, 0, 1 +AVX_INSTR cmpneqps, sse, 1, 0, 1 +AVX_INSTR cmpneqsd, sse2, 1, 0, 0 +AVX_INSTR cmpneqss, sse, 1, 0, 0 +AVX_INSTR cmpnlepd, sse2, 1, 0, 0 +AVX_INSTR cmpnleps, sse, 1, 0, 0 +AVX_INSTR cmpnlesd, sse2, 1, 0, 0 +AVX_INSTR cmpnless, sse, 1, 0, 0 +AVX_INSTR cmpnltpd, sse2, 1, 0, 0 +AVX_INSTR cmpnltps, sse, 1, 0, 0 +AVX_INSTR cmpnltsd, sse2, 1, 0, 0 +AVX_INSTR cmpnltss, sse, 1, 0, 0 +AVX_INSTR cmpordpd, sse2 1, 0, 1 +AVX_INSTR cmpordps, sse 1, 0, 1 +AVX_INSTR cmpordsd, sse2 1, 0, 0 +AVX_INSTR cmpordss, sse 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR cmpunordpd, sse2, 1, 0, 1 +AVX_INSTR cmpunordps, sse, 1, 0, 1 +AVX_INSTR cmpunordsd, sse2, 1, 0, 0 +AVX_INSTR cmpunordss, sse, 1, 0, 0 AVX_INSTR comisd, sse2 AVX_INSTR comiss, sse AVX_INSTR cvtdq2pd, sse2 @@ -1537,6 +1641,52 @@ FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss +; Macros for converting VEX instructions to equivalent EVEX ones. +%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 + ; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) %ifdef __YASM_VER__ %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index 81a3ada6e6..d7cd996842 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -6,20 +6,20 @@ ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -29,6 +29,11 @@ %include "libavutil/x86/x86inc.asm" +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base + stride], [base + 2*stride], [base3], \ + [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4] + ; Interleave low src0 with low src1 and store in src0, ; interleave high src0 with high src1 and store in src1. ; %1 - types @@ -40,7 +45,10 @@ ; output: src0: x0 y0 x1 y1 x2 y2 x3 y3 ; src1: z0 q0 z1 q1 z2 q2 z3 q3 %macro SBUTTERFLY 4 -%if avx_enabled == 0 +%ifidn %1, dqqq + vperm2i128 m%4, m%2, m%3, q0301 + vinserti128 m%2, m%2, xm%3, 1 +%elif avx_enabled == 0 mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 @@ -63,6 +71,12 @@ SWAP %1, %3, %2 %endmacro +%macro SBUTTERFLYPD 3 + movlhps m%3, m%1, m%2 + movhlps m%2, m%2, m%1 + SWAP %1, %3 +%endmacro + %macro TRANSPOSE4x4B 5 SBUTTERFLY bw, %1, %2, %5 SBUTTERFLY bw, %3, %4, %5 @@ -79,6 +93,15 @@ SWAP %2, %3 %endmacro +%macro TRANSPOSE2x4x4B 5 + SBUTTERFLY bw, %1, %2, %5 + SBUTTERFLY bw, %3, %4, %5 + SBUTTERFLY wd, %1, %3, %5 + SBUTTERFLY wd, %2, %4, %5 + SBUTTERFLY dq, %1, %2, %5 + SBUTTERFLY dq, %3, %4, %5 +%endmacro + %macro TRANSPOSE2x4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 @@ -100,12 +123,46 @@ %macro TRANSPOSE4x4PS 5 SBUTTERFLYPS %1, %2, %5 SBUTTERFLYPS %3, %4, %5 - movlhps m%5, m%1, m%3 - movhlps m%3, m%1 - SWAP %5, %1 - movlhps m%5, m%2, m%4 - movhlps m%4, m%2 - SWAP %5, %2, %3 + SBUTTERFLYPD %1, %3, %5 + SBUTTERFLYPD %2, %4, %5 + SWAP %2, %3 +%endmacro + +%macro TRANSPOSE8x4D 9-11 +%if ARCH_X86_64 + SBUTTERFLY dq, %1, %2, %9 + SBUTTERFLY dq, %3, %4, %9 + SBUTTERFLY dq, %5, %6, %9 + SBUTTERFLY dq, %7, %8, %9 + SBUTTERFLY qdq, %1, %3, %9 + SBUTTERFLY qdq, %2, %4, %9 + SBUTTERFLY qdq, %5, %7, %9 + SBUTTERFLY qdq, %6, %8, %9 + SWAP %2, %5 + SWAP %4, %7 +%else +; in: m0..m7 +; out: m0..m7, unless %11 in which case m2 is in %9 +; spills into %9 and %10 + movdqa %9, m%7 + SBUTTERFLY dq, %1, %2, %7 + movdqa %10, m%2 + movdqa m%7, %9 + SBUTTERFLY dq, %3, %4, %2 + SBUTTERFLY dq, %5, %6, %2 + SBUTTERFLY dq, %7, %8, %2 + SBUTTERFLY qdq, %1, %3, %2 + movdqa %9, m%3 + movdqa m%2, %10 + SBUTTERFLY qdq, %2, %4, %3 + SBUTTERFLY qdq, %5, %7, %3 + SBUTTERFLY qdq, %6, %8, %3 + SWAP %2, %5 + SWAP %4, %7 +%if %0<11 + movdqa m%3, %9 +%endif +%endif %endmacro %macro TRANSPOSE8x8W 9-11 @@ -157,6 +214,85 @@ %endif %endmacro +%macro TRANSPOSE16x16W 18-19 +; in: m0..m15, unless %19 in which case m6 is in %17 +; out: m0..m15, unless %19 in which case m4 is in %18 +; spills into %17 and %18 +%if %0 < 19 + mova %17, m%7 +%endif + + SBUTTERFLY dqqq, %1, %9, %7 + SBUTTERFLY dqqq, %2, %10, %7 + SBUTTERFLY dqqq, %3, %11, %7 + SBUTTERFLY dqqq, %4, %12, %7 + SBUTTERFLY dqqq, %5, %13, %7 + SBUTTERFLY dqqq, %6, %14, %7 + mova %18, m%14 + mova m%7, %17 + SBUTTERFLY dqqq, %7, %15, %14 + SBUTTERFLY dqqq, %8, %16, %14 + + SBUTTERFLY wd, %1, %2, %14 + SBUTTERFLY wd, %3, %4, %14 + SBUTTERFLY wd, %5, %6, %14 + SBUTTERFLY wd, %7, %8, %14 + SBUTTERFLY wd, %9, %10, %14 + SBUTTERFLY wd, %11, %12, %14 + mova %17, m%12 + mova m%14, %18 + SBUTTERFLY wd, %13, %14, %12 + SBUTTERFLY wd, %15, %16, %12 + + SBUTTERFLY dq, %1, %3, %12 + SBUTTERFLY dq, %2, %4, %12 + SBUTTERFLY dq, %5, %7, %12 + SBUTTERFLY dq, %6, %8, %12 + SBUTTERFLY dq, %9, %11, %12 + mova %18, m%11 + mova m%12, %17 + SBUTTERFLY dq, %10, %12, %11 + SBUTTERFLY dq, %13, %15, %11 + SBUTTERFLY dq, %14, %16, %11 + + SBUTTERFLY qdq, %1, %5, %11 + SBUTTERFLY qdq, %2, %6, %11 + SBUTTERFLY qdq, %3, %7, %11 + SBUTTERFLY qdq, %4, %8, %11 + + SWAP %2, %5 + SWAP %4, %7 + + SBUTTERFLY qdq, %9, %13, %11 + SBUTTERFLY qdq, %10, %14, %11 + mova m%11, %18 + mova %18, m%5 + SBUTTERFLY qdq, %11, %15, %5 + SBUTTERFLY qdq, %12, %16, %5 + +%if %0 < 19 + mova m%5, %18 +%endif + + SWAP %10, %13 + SWAP %12, %15 +%endmacro + +%macro TRANSPOSE_8X8B 8 + %if mmsize == 8 + %error "This macro does not support mmsize == 8" + %endif + punpcklbw m%1, m%2 + punpcklbw m%3, m%4 + punpcklbw m%5, m%6 + punpcklbw m%7, m%8 + TRANSPOSE4x4W %1, %3, %5, %7, %2 + MOVHL m%2, m%1 + MOVHL m%4, m%3 + MOVHL m%6, m%5 + MOVHL m%8, m%7 +%endmacro + ; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place %macro PABSW 2 %if cpuflag(ssse3) @@ -283,6 +419,55 @@ %endif %endmacro +%macro HADDD 2 ; sum junk +%if sizeof%1 == 32 +%define %2 xmm%2 + vextracti128 %2, %1, 1 +%define %1 xmm%1 + paddd %1, %2 +%endif +%if mmsize >= 16 +%if cpuflag(xop) && sizeof%1 == 16 + vphadddq %1, %1 +%endif + movhlps %2, %1 + paddd %1, %2 +%endif +%if notcpuflag(xop) || sizeof%1 != 16 +%if cpuflag(mmxext) + PSHUFLW %2, %1, q0032 +%else ; mmx + mova %2, %1 + psrlq %2, 32 +%endif + paddd %1, %2 +%endif +%undef %1 +%undef %2 +%endmacro + +%macro HADDW 2 ; reg, tmp +%if cpuflag(xop) && sizeof%1 == 16 + vphaddwq %1, %1 + movhlps %2, %1 + paddd %1, %2 +%else + pmaddwd %1, [pw_1] + HADDD %1, %2 +%endif +%endmacro + +%macro HADDPS 3 ; dst, src, tmp +%if cpuflag(sse3) + haddps %1, %1, %2 +%else + movaps %3, %1 + shufps %1, %2, q2020 + shufps %3, %2, q3131 + addps %1, %3 +%endif +%endmacro + %macro PALIGNR 4-5 %if cpuflag(ssse3) %if %0==5 @@ -312,11 +497,19 @@ %endif %endmacro -%macro PAVGB 2 +%macro PAVGB 2-4 %if cpuflag(mmxext) pavgb %1, %2 %elif cpuflag(3dnow) pavgusb %1, %2 +%elif cpuflag(mmx) + movu %3, %2 + por %3, %1 + pxor %1, %2 + pand %1, %4 + psrlq %1, 1 + psubb %3, %1 + SWAP %1, %3 %endif %endmacro @@ -649,12 +842,25 @@ %endif %endmacro -%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 -%if cpuflag(avx) - vbroadcastss %1, %2 -%else ; sse - movss %1, %2 - shufps %1, %1, 0 +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vbroadcastss %1, %2 +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + shufps xmm%1, xmm%2, xmm%2, q0000 + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1, xmm%1, 1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 + %endif +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 + shufps %1, %1, 0 + %endif %endif %endmacro @@ -669,6 +875,29 @@ %endif %endmacro +%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && sizeof%1 >= 32 + %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 + pshufd %1, %1, 0 + %endif +%endif +%endmacro + +%macro VBROADCASTI128 2 ; dst xmm/ymm, src : 128bits val +%if mmsize > 16 + vbroadcasti128 %1, %2 +%else + mova %1, %2 +%endif +%endmacro + %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 @@ -705,3 +934,95 @@ addps %1, %4 %endif %endmacro + +%macro LSHIFT 2 +%if mmsize > 8 + pslldq %1, %2 +%else + psllq %1, 8*(%2) +%endif +%endmacro + +%macro RSHIFT 2 +%if mmsize > 8 + psrldq %1, %2 +%else + psrlq %1, 8*(%2) +%endif +%endmacro + +%macro MOVHL 2 ; dst, src +%ifidn %1, %2 + punpckhqdq %1, %2 +%elif cpuflag(avx) + punpckhqdq %1, %2, %2 +%elif cpuflag(sse4) + pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones +%else + movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst +%endif +%endmacro + +; Horizontal Sum of Packed Single precision floats +; The resulting sum is in all elements. +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) + addps %1, %2 + %endif + shufps %2, %1, %1, q1032 + addps %1, %2 + shufps %2, %1, %1, q0321 + addps %1, %2 +%else ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 + shufps %1, %1, q1032 + addps %1, %2 + movaps %2, %1 + shufps %1, %1, q0321 + addps %1, %2 + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; Emulate blendvps if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro BLENDVPS 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + blendvps %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + blendvps %1, %2, %3 +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; Emulate pblendvb if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro PBLENDVB 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error pblendb not possible with ymm on avx1, try blendvps. + %endif + pblendvb %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3 + %endif + pblendvb %1, %2, %3 +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro |