From 52064a4e928175bfa1be3f7e8d262f584ca025e9 Mon Sep 17 00:00:00 2001
From: eschnett
Date: Wed, 14 Dec 2011 16:25:28 +0000
Subject: Support FMA4 instructions (AMD's fused multiply-add)

git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@39 105869f7-3296-0410-a4ea-f4349344b45a
---
 src/vectors-4-SSE.h  | 27 ++++++++++++++-------------
 src/vectors-8-AVX.h  | 18 ++++++++++++------
 src/vectors-8-SSE2.h | 43 +++++++++++++++++++++++++------------------
 3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index 2bde97e..927d54e 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -8,25 +8,26 @@
 #include <assert.h>
 #include <math.h>
 
-#include <xmmintrin.h>
-#ifdef __SSE4_1__
-// Intel's SSE 4.1
-# include <smmintrin.h>
-#endif
-#ifdef __SSE4A__
-// AMD's SSE 4a
-# include <ammintrin.h>
-#endif
+#include <x86intrin.h>
 
 
 
 #ifdef __SSE4_1__
-# define vec4_architecture "SSE4.1 (32-bit precision)"
-#elif defined(__SSE4A__)
-# define vec4_architecture "SSE4A (32-bit precision)"
+# define vec4_architecture_SSE4_1 "+SSE4.1"
+#else
+# define vec4_architecture_SSE4_1 ""
+#endif
+#ifdef __SSE4A__
+# define vec4_architecture_SSE4a "+SSE4A"
+#else
+# define vec4_architecture_SSE4a ""
+#endif
+#ifdef __FMA4__
+# define vec4_architecture_FMA4 "+FMA4"
 #else
-# define vec4_architecture "SSE (32-bit precision)"
+# define vec4_architecture_FMA4 ""
 #endif
+#define vec4_architecture "SSE" vec4_architecture_SSE4_1 vec4_architecture_SSE4a vec4_architecture_FMA4 " (32-bit precision)"
 
 // Vector type corresponding to CCTK_REAL
 #define CCTK_REAL4_VEC __m128
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index f7d00d9..fc3e4d5 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -5,10 +5,9 @@
 
 
 
+#include <x86intrin.h>
 #if VECTORISE_EMULATE_AVX
 # include "avxintrin_emu.h"
-#else
-# include <immintrin.h>
 #endif
 
 
@@ -162,10 +161,17 @@ static const k8const_t k8abs_mask_union =
 #define k8div(x,y) (_mm256_div_pd(x,y))
 
 // Fused multiply-add, defined as [+-]x*y[+-]z
-#define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-#define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm256_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm256_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm256_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm256_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
 
 // Cheap functions
 #define k8fabs(x) (_mm256_and_pd(x,k8abs_mask_union.vd))
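
Note: the FMA4 intrinsics introduced above keep the documented [+-]x*y[+-]z sign conventions of the portable mul/add fallbacks (e.g. k8nmadd(x,y,z) = -x*y-z maps to _mm256_nmsub_pd). The following minimal standalone check of that mapping is not part of this patch; it assumes GCC or Clang with -mavx -mfma4 (or -march=bdver1) on an FMA4-capable CPU such as AMD's Bulldozer family:

#include <stdio.h>
#include <x86intrin.h>

int main(void)
{
  __m256d x = _mm256_set1_pd(2.0);
  __m256d y = _mm256_set1_pd(3.0);
  __m256d z = _mm256_set1_pd(5.0);

  double macc[4], nmsub[4];
  _mm256_storeu_pd(macc,  _mm256_macc_pd(x, y, z));   /* k8madd:   x*y + z */
  _mm256_storeu_pd(nmsub, _mm256_nmsub_pd(x, y, z));  /* k8nmadd: -x*y - z */

  printf("macc  = %g (expect  11)\n", macc[0]);
  printf("nmsub = %g (expect -11)\n", nmsub[0]);
  return 0;
}
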
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index fbdcd4b..bce093c 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -8,14 +8,7 @@
 #include <assert.h>
 #include <math.h>
 
-#include <emmintrin.h>
-#ifdef __SSE4_1__
-// Intel's SSE 4.1
-# include <smmintrin.h>
-#endif
-#ifdef __SSE4A__
-// AMD's SSE 4a
-# include <ammintrin.h>
+#include <x86intrin.h>
 
 // Intel compilers don't support SSE 4a. Here is how we can implement
 // these instructions in assembler instead:
@@ -26,17 +19,24 @@
 // asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x));
 // }
 
-#endif
-
 
 
 #ifdef __SSE4_1__
-# define vec8_architecture "SSE4.1 (64-bit precision)"
-#elif defined(__SSE4A__)
-# define vec8_architecture "SSE4A (64-bit precision)"
+# define vec8_architecture_SSE4_1 "+SSE4.1"
+#else
+# define vec8_architecture_SSE4_1 ""
+#endif
+#ifdef __SSE4A__
+# define vec8_architecture_SSE4a "+SSE4A"
+#else
+# define vec8_architecture_SSE4a ""
+#endif
+#ifdef __FMA4__
+# define vec8_architecture_FMA4 "+FMA4"
 #else
-# define vec8_architecture "SSE2 (64-bit precision)"
+# define vec8_architecture_FMA4 ""
 #endif
+#define vec8_architecture "SSE2" vec8_architecture_SSE4_1 vec8_architecture_SSE4a vec8_architecture_FMA4 " (64-bit precision)"
 
 // Vector type corresponding to CCTK_REAL
 #define CCTK_REAL8_VEC __m128d
@@ -204,10 +204,17 @@ static const union {
 #define k8div(x,y) (_mm_div_pd(x,y))
 
 // Fused multiply-add, defined as [+-]x*y[+-]z
-#define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-#define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
 
 // Cheap functions
 #define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x))
-- 
cgit v1.2.3
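
Note: the new vec4_architecture and vec8_architecture strings are assembled by C's adjacent string literal concatenation; each feature macro expands to either "+FEATURE" or an empty string, and the preprocessor pastes the pieces into a single literal at compile time. A small illustration of the same pattern (standalone sketch, not part of the thorn itself):

#include <stdio.h>

/* Stand-ins for the feature tests; in the headers above these are
   selected by #ifdef __SSE4_1__ / __SSE4A__ / __FMA4__. */
#define vec8_architecture_SSE4_1 "+SSE4.1"
#define vec8_architecture_SSE4a  ""        /* feature absent: empty string */
#define vec8_architecture_FMA4   "+FMA4"

#define vec8_architecture "SSE2" vec8_architecture_SSE4_1 vec8_architecture_SSE4a vec8_architecture_FMA4 " (64-bit precision)"

int main(void)
{
  /* Prints: SSE2+SSE4.1+FMA4 (64-bit precision) */
  puts(vec8_architecture);
  return 0;
}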