author	eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>	2011-12-14 16:25:28 +0000
committer	eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>	2011-12-14 16:25:28 +0000
commit	52064a4e928175bfa1be3f7e8d262f584ca025e9 (patch)
tree	2b34ff76c182d7447d1149698f9efe48e2cf2ad3
parent	1d1684b6537454f25703f295be2220add0c348a5 (diff)
Support FMA4 instructions (AMD's fused multiply-add)
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@39 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--	src/vectors-4-SSE.h	27
-rw-r--r--	src/vectors-8-AVX.h	18
-rw-r--r--	src/vectors-8-SSE2.h	43
3 files changed, 51 insertions(+), 37 deletions(-)
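
FMA4 is AMD's four-operand fused multiply-add extension (introduced with the Bulldozer family): it computes x*y+z in one instruction with a single rounding step instead of two. A minimal sketch of the two code paths this patch introduces, assuming GCC or Clang with -mfma4; the function names here are illustrative, not part of the patch:

    #include <x86intrin.h>

    // Fallback path: multiply, round, add, round (two rounding steps).
    static __m128d madd_plain(__m128d x, __m128d y, __m128d z) {
      return _mm_add_pd(_mm_mul_pd(x, y), z);
    }

    #ifdef __FMA4__
    // FMA4 path: a single vfmaddpd instruction, one rounding step.
    static __m128d madd_fma4(__m128d x, __m128d y, __m128d z) {
      return _mm_macc_pd(x, y, z);
    }
    #endif
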
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index 2bde97e..927d54e 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -8,25 +8,26 @@
#include <assert.h>
#include <math.h>
-#include <xmmintrin.h>
-#ifdef __SSE4_1__
-// Intel's SSE 4.1
-# include <smmintrin.h>
-#endif
-#ifdef __SSE4A__
-// AMD's SSE 4a
-# include <ammintrin.h>
-#endif
+#include <x86intrin.h>
#ifdef __SSE4_1__
-# define vec4_architecture "SSE4.1 (32-bit precision)"
-#elif defined(__SSE4A__)
-# define vec4_architecture "SSE4A (32-bit precision)"
+# define vec4_architecture_SSE4_1 "+SSE4.1"
+#else
+# define vec4_architecture_SSE4_1 ""
+#endif
+#ifdef __SSE4A__
+# define vec4_architecture_SSE4a "+SSE4A"
+#else
+# define vec4_architecture_SSE4a ""
+#endif
+#ifdef __FMA4__
+# define vec4_architecture_FMA4 "+FMA4"
#else
-# define vec4_architecture "SSE (32-bit precision)"
+# define vec4_architecture_FMA4 ""
#endif
+#define vec4_architecture "SSE" vec4_architecture_SSE4_1 vec4_architecture_SSE4a vec4_architecture_FMA4 " (32-bit precision)"
// Vector type corresponding to CCTK_REAL
#define CCTK_REAL4_VEC __m128
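
The architecture string is now assembled by compile-time string-literal concatenation. For example, building with SSE4.1 and FMA4 enabled but not SSE4a (so __SSE4_1__ and __FMA4__ are defined), the pieces expand to:

    // vec4_architecture_SSE4_1 = "+SSE4.1"
    // vec4_architecture_SSE4a  = ""
    // vec4_architecture_FMA4   = "+FMA4"
    // "SSE" "+SSE4.1" "" "+FMA4" " (32-bit precision)"
    //   -> "SSE+SSE4.1+FMA4 (32-bit precision)"
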
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index f7d00d9..fc3e4d5 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -5,10 +5,9 @@
+#include <x86intrin.h>
#if VECTORISE_EMULATE_AVX
# include "avxintrin_emu.h"
-#else
-# include <immintrin.h>
#endif
@@ -162,10 +161,17 @@ static const k8const_t k8abs_mask_union =
#define k8div(x,y) (_mm256_div_pd(x,y))
// Fused multiply-add, defined as [+-]x*y[+-]z
-#define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-#define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm256_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm256_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm256_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm256_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
// Cheap functions
#define k8fabs(x) (_mm256_and_pd(x,k8abs_mask_union.vd))
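
Note the crossed mapping in the FMA4 branch: AMD's intrinsic names put the sign on the accumulate operand, so _mm256_nmacc_pd computes -x*y+z (matching k8nmsub) and _mm256_nmsub_pd computes -x*y-z (matching k8nmadd), consistent with the non-FMA fallbacks below them. An illustrative self-check of the four sign conventions, assuming AVX and FMA4 are available (not part of the patch):

    #include <x86intrin.h>
    #include <assert.h>

    #ifdef __FMA4__
    static void check_fma4_signs(void) {
      const __m256d x = _mm256_set1_pd(2.0);
      const __m256d y = _mm256_set1_pd(3.0);
      const __m256d z = _mm256_set1_pd(1.0);
      double r[4];
      _mm256_storeu_pd(r, _mm256_macc_pd(x, y, z));   //  x*y+z =  7
      assert(r[0] == 7.0);
      _mm256_storeu_pd(r, _mm256_msub_pd(x, y, z));   //  x*y-z =  5
      assert(r[0] == 5.0);
      _mm256_storeu_pd(r, _mm256_nmacc_pd(x, y, z));  // -x*y+z = -5
      assert(r[0] == -5.0);
      _mm256_storeu_pd(r, _mm256_nmsub_pd(x, y, z));  // -x*y-z = -7
      assert(r[0] == -7.0);
    }
    #endif
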
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index fbdcd4b..bce093c 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -8,14 +8,7 @@
#include <assert.h>
#include <math.h>
-#include <emmintrin.h>
-#ifdef __SSE4_1__
-// Intel's SSE 4.1
-# include <smmintrin.h>
-#endif
-#ifdef __SSE4A__
-// AMD's SSE 4a
-# include <ammintrin.h>
+#include <x86intrin.h>
// Intel compilers don't support SSE 4a. Here is how we can implement
// these instructions in assembler instead:
@@ -26,17 +19,24 @@
// asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x));
// }
-#endif
-
#ifdef __SSE4_1__
-# define vec8_architecture "SSE4.1 (64-bit precision)"
-#elif defined(__SSE4A__)
-# define vec8_architecture "SSE4A (64-bit precision)"
+# define vec8_architecture_SSE4_1 "+SSE4.1"
+#else
+# define vec8_architecture_SSE4_1 ""
+#endif
+#ifdef __SSE4A__
+# define vec8_architecture_SSE4a "+SSE4A"
+#else
+# define vec8_architecture_SSE4a ""
+#endif
+#ifdef __FMA4__
+# define vec8_architecture_FMA4 "+FMA4"
#else
-# define vec8_architecture "SSE2 (64-bit precision)"
+# define vec8_architecture_FMA4 ""
#endif
+#define vec8_architecture "SSE2" vec8_architecture_SSE4_1 vec8_architecture_SSE4a vec8_architecture_FMA4 " (64-bit precision)"
// Vector type corresponding to CCTK_REAL
#define CCTK_REAL8_VEC __m128d
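
For compilers without SSE4a intrinsics, the commented-out assembler above could be completed along these lines; a sketch assuming GNU-style inline asm and an SSE4a-capable CPU (the function name and signature are reconstructed for illustration, and are equivalent to SSE4a's _mm_stream_sd):

    #include <emmintrin.h>

    // Non-temporal (cache-bypassing) store of the low double of x to *p
    // via SSE4a's movntsd instruction.
    static inline void vec8_store_nta_sketch(double *p, __m128d x) {
      __asm__ ("movntsd %[x],%[p]" : [p] "=m" (*p) : [x] "x" (x));
    }
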
@@ -204,10 +204,17 @@ static const union {
#define k8div(x,y) (_mm_div_pd(x,y))
// Fused multiply-add, defined as [+-]x*y[+-]z
-#define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-#define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
// Cheap functions
#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x))
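
A caveat worth noting: the FMA4 and fallback paths are not bit-for-bit identical, since the fused form rounds once while mul-then-add rounds twice. A scalar illustration using C99 fma(), assuming a correctly rounded libm, which mirrors the single-rounding behaviour:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double x = 1.0 + 0x1p-30;  // 1 + 2^-30
      double y = 1.0 - 0x1p-30;  // 1 - 2^-30
      double z = -1.0;
      // x*y = 1 - 2^-60 rounds to 1.0 in double, so the sum cancels to 0:
      printf("mul+add: %a\n", x * y + z);    // prints 0x0p+0
      // fma keeps the exact product until the single final rounding:
      printf("fma:     %a\n", fma(x, y, z)); // prints -0x1p-60
      return 0;
    }
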