diff options
-rw-r--r-- | src/vectors-4-SSE.h | 28 |
-rw-r--r-- | src/vectors-8-AVX.h | 6 |
-rw-r--r-- | src/vectors-8-SSE2.h | 14 |
3 files changed, 41 insertions, 7 deletions
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h index 927d54e..8fea7ba 100644 --- a/src/vectors-4-SSE.h +++ b/src/vectors-4-SSE.h @@ -8,7 +8,18 @@ #include <assert.h> #include <math.h> -#include <x86intrin.h> +#include <xmmintrin.h> +#ifdef __SSE4_1__ +// Intel's SSE 4.1 +# include <smmintrin.h> +#endif +#ifdef __SSE4A__ +// AMD's SSE 4a +# include <ammintrin.h> +#endif +#ifdef __FMA4__ +# include <fma4intrin.h> +#endif @@ -265,10 +276,17 @@ static const union { #define k4div(x,y) (_mm_div_ps(x,y)) // Fused multiply-add, defined as [+-]x*y[+-]z -#define k4madd(x,y,z) (k4add(k4mul(x,y),z)) -#define k4msub(x,y,z) (k4sub(k4mul(x,y),z)) -#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y))) -#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) +#ifdef __FMA4__ +# define k4madd(x,y,z) (_mm_macc_ps(x,y,z)) +# define k4msub(x,y,z) (_mm_msub_ps(x,y,z)) +# define k4nmadd(x,y,z) (_mm_nmsub_ps(x,y,z)) +# define k4nmsub(x,y,z) (_mm_nmacc_ps(x,y,z)) +#else +# define k4madd(x,y,z) (k4add(k4mul(x,y),z)) +# define k4msub(x,y,z) (k4sub(k4mul(x,y),z)) +# define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y))) +# define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) +#endif // Cheap functions #define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x)) diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index fc3e4d5..f96bd01 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -5,9 +5,13 @@ -#include <x86intrin.h> #if VECTORISE_EMULATE_AVX # include "avxintrin_emu.h" +#else +# include <immintrin.h> +#endif +#ifdef __FMA4__ +# include <fma4intrin.h> #endif diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index bce093c..46a855e 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -8,7 +8,14 @@ #include <assert.h> #include <math.h> -#include <x86intrin.h> +#include <emmintrin.h> +#ifdef __SSE4_1__ +// Intel's SSE 4.1 +# include <smmintrin.h> +#endif +#ifdef __SSE4A__ +// AMD's SSE 4a +# include <ammintrin.h> // Intel compilers don't support SSE 4a. 
Here is how we can implement // these instructions in assembler instead: @@ -19,6 +26,11 @@ // asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x)); // } +#endif +#ifdef __FMA4__ +# include <fma4intrin.h> +#endif + #ifdef __SSE4_1__ |