about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  src/vectors-4-SSE.h   6
-rw-r--r--  src/vectors-8-SSE2.h  16
2 files changed, 16 insertions, 6 deletions
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index 2f55183..dbf0cce 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -21,11 +21,11 @@
#ifdef __SSE4_1__
-#define vec4_architecture "SSE4.1 (32-bit precision)"
+# define vec4_architecture "SSE4.1 (32-bit precision)"
#elif defined(__SSE4A__)
-#define vec4_architecture "SSE4A (32-bit precision)"
+# define vec4_architecture "SSE4A (32-bit precision)"
#else
-#define vec4_architecture "SSE (32-bit precision)"
+# define vec4_architecture "SSE (32-bit precision)"
#endif
// Vector type corresponding to CCTK_REAL
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 56c614d..3b11990 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -16,16 +16,26 @@
#ifdef __SSE4A__
// AMD's SSE 4a
# include <ammintrin.h>
+
+// Intel compilers don't support SSE 4a. Here is how we can implement
+// these instructions in assembler instead:
+
+// inline void __attribute__((__always_inline__))
+// _mm_stream_sd (double *p, __m128d x)
+// {
+// asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x));
+// }
+
#endif
#ifdef __SSE4_1__
-#define vec8_architecture "SSE4.1 (64-bit precision)"
+# define vec8_architecture "SSE4.1 (64-bit precision)"
#elif defined(__SSE4A__)
-#define vec8_architecture "SSE4A (64-bit precision)"
+# define vec8_architecture "SSE4A (64-bit precision)"
#else
-#define vec8_architecture "SSE2 (64-bit precision)"
+# define vec8_architecture "SSE2 (64-bit precision)"
#endif
// Vector type corresponding to CCTK_REAL