author    eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>  2011-12-15 15:30:14 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>  2011-12-15 15:30:14 +0000
commit    9305624ffce90bca91be92db1a718e428a1cdf4c (patch)
tree      a7b2ae42dc1ee25981d74c257e16d8aa4002d1fd
parent    52064a4e928175bfa1be3f7e8d262f584ca025e9 (diff)
Don't use <x86intrin.h>; this does not exist everywhere
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@40 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--  src/vectors-4-SSE.h   28
-rw-r--r--  src/vectors-8-AVX.h    6
-rw-r--r--  src/vectors-8-SSE2.h  14
3 files changed, 41 insertions(+), 7 deletions(-)
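The replacement headers are selected through compiler-predefined feature macros, so each translation unit pulls in only the intrinsic headers that exist for the target, instead of the catch-all <x86intrin.h>. A minimal standalone sketch of that detection mechanism (file name and messages are illustrative, not part of the patch):

/* probe.c -- sketch: report which x86 feature macros the compiler
   predefines.  These are the same macros the patch tests before
   including <smmintrin.h>, <ammintrin.h>, or <fma4intrin.h>.
   Build with e.g. "gcc -msse4.1 probe.c"; the matching branches fire. */
#include <stdio.h>

int main(void)
{
#ifdef __SSE4_1__
  puts("__SSE4_1__ defined: <smmintrin.h> (Intel's SSE 4.1) should be available");
#endif
#ifdef __SSE4A__
  puts("__SSE4A__ defined: <ammintrin.h> (AMD's SSE 4a) should be available");
#endif
#ifdef __FMA4__
  puts("__FMA4__ defined: <fma4intrin.h> (AMD's FMA4) should be available");
#endif
  return 0;
}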
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index 927d54e..8fea7ba 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -8,7 +8,18 @@
 #include <assert.h>
 #include <math.h>
-#include <x86intrin.h>
+#include <xmmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
+#endif
@@ -265,10 +276,17 @@ static const union {
 #define k4div(x,y) (_mm_div_ps(x,y))
 // Fused multiply-add, defined as [+-]x*y[+-]z
-#define k4madd(x,y,z) (k4add(k4mul(x,y),z))
-#define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
-#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
-#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+#ifdef __FMA4__
+# define k4madd(x,y,z) (_mm_macc_ps(x,y,z))
+# define k4msub(x,y,z) (_mm_msub_ps(x,y,z))
+# define k4nmadd(x,y,z) (_mm_nmsub_ps(x,y,z))
+# define k4nmsub(x,y,z) (_mm_nmacc_ps(x,y,z))
+#else
+# define k4madd(x,y,z) (k4add(k4mul(x,y),z))
+# define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
+# define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
+# define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+#endif
 // Cheap functions
 #define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x))
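The fallback branch above expresses the four fused operations with plain SSE arithmetic, following the sign convention [+-]x*y[+-]z from the comment; k4madd computes x*y+z, k4msub x*y-z, k4nmadd -x*y-z, and k4nmsub -x*y+z, matching the FMA4 intrinsics chosen in the other branch. A standalone sanity-check sketch, assuming the usual sign-mask definition of k4neg (which this hunk does not show):

/* Sketch: scalar check of the non-FMA4 fallback semantics. */
#include <stdio.h>
#include <xmmintrin.h>

#define k4add(x,y) (_mm_add_ps(x,y))
#define k4sub(x,y) (_mm_sub_ps(x,y))
#define k4mul(x,y) (_mm_mul_ps(x,y))
/* assumed: negate by flipping the sign bit, as k4sign_mask does */
#define k4neg(x)   (_mm_xor_ps(_mm_set1_ps(-0.0f),x))

#define k4madd(x,y,z)  (k4add(k4mul(x,y),z))
#define k4msub(x,y,z)  (k4sub(k4mul(x,y),z))
#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))

int main(void)
{
  __m128 x = _mm_set1_ps(2.0f), y = _mm_set1_ps(3.0f), z = _mm_set1_ps(5.0f);
  float r[4];
  _mm_storeu_ps(r, k4madd(x,y,z));  printf("madd:  %g\n", r[0]); /* 2*3+5  = 11  */
  _mm_storeu_ps(r, k4msub(x,y,z));  printf("msub:  %g\n", r[0]); /* 2*3-5  = 1   */
  _mm_storeu_ps(r, k4nmadd(x,y,z)); printf("nmadd: %g\n", r[0]); /* -2*3-5 = -11 */
  _mm_storeu_ps(r, k4nmsub(x,y,z)); printf("nmsub: %g\n", r[0]); /* -2*3+5 = -1  */
  return 0;
}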
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index fc3e4d5..f96bd01 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -5,9 +5,13 @@
-#include <x86intrin.h>
 #if VECTORISE_EMULATE_AVX
 # include "avxintrin_emu.h"
+#else
+# include <immintrin.h>
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
 #endif
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index bce093c..46a855e 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -8,7 +8,14 @@
 #include <assert.h>
 #include <math.h>
-#include <x86intrin.h>
+#include <emmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
 // Intel compilers don't support SSE 4a. Here is how we can implement
 // these instructions in assembler instead:
@@ -19,6 +26,11 @@
 // asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x));
 // }
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
+#endif
+
 #ifdef __SSE4_1__
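The commented-out assembler in vectors-8-SSE2.h hints at how the SSE 4a streaming stores can still be emitted on compilers that lack the <ammintrin.h> intrinsics. A hedged sketch spelling out both wrappers (function names are illustrative, not part of the patch; the inline assembler bypasses the compiler's intrinsic support entirely):

/* Sketch: SSE 4a non-temporal stores via inline assembler, for
   compilers without <ammintrin.h> (the Intel case noted above). */
#include <emmintrin.h>

static inline void streaming_store_float(float *p, __m128 x)
{
  /* movntss: store the low single of x to *p, bypassing the cache */
  __asm__ ("movntss %[x],%[p]" : [p] "=m" (*p) : [x] "x" (x));
}

static inline void streaming_store_double(double *p, __m128d x)
{
  /* movntsd: store the low double of x to *p, bypassing the cache */
  __asm__ ("movntsd %[x],%[p]" : [p] "=m" (*p) : [x] "x" (x));
}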