aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2010-12-24 00:43:09 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2010-12-24 00:43:09 +0000
commit 49084a03a0685df85894e22821a7ef63b2d8cf1c (patch)
tree 335360232b73c656fa35c5e9f8c0f0eabf0faf49
parent d728013d0b8c0eec323cee76522f77ff70ec8bab (diff)
Make vectorisation work with PGI compilers
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@6 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r-- src/vectors-intel-4.h 15
-rw-r--r-- src/vectors-intel-8.h 35
2 files changed, 45 insertions, 5 deletions
diff --git a/src/vectors-intel-4.h b/src/vectors-intel-4.h
index 73c90be..bc50e68 100644
--- a/src/vectors-intel-4.h
+++ b/src/vectors-intel-4.h
@@ -48,7 +48,19 @@
CCTK_REAL4_VEC const xelt3=(x); \
vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,2,1,0))); \
})
-#define vec4_elt(x,d) \
+#if defined(__PGI) && defined (__amd64__)
+// PGI/amd64 variant of vec4_elt: return element d (0..3) of vector x,
+// selected with an if/else chain over vec4_elt0..vec4_elt3.
+// d is parenthesised in every comparison so that an expression
+// argument (e.g. i&3) is compared as a whole instead of being
+// re-associated by operator precedence.
+// Caveats: d may be evaluated up to four times, so it must not have
+// side effects; for d outside 0..3 no branch assigns aelt and the
+// result is indeterminate.
+# define vec4_elt(x,d) \
+({ \
+ CCTK_REAL4_VEC const xelt=(x); \
+ CCTK_REAL4 aelt; \
+ if ((d)==0) aelt=vec4_elt0(xelt); \
+ else if ((d)==1) aelt=vec4_elt1(xelt); \
+ else if ((d)==2) aelt=vec4_elt2(xelt); \
+ else if ((d)==3) aelt=vec4_elt3(xelt); \
+ aelt; \
+})
+#else
+# define vec4_elt(x,d) \
({ \
CCTK_REAL4_VEC const xelt=(x); \
CCTK_REAL4 aelt; \
@@ -60,6 +72,7 @@
} \
aelt; \
})
+#endif
diff --git a/src/vectors-intel-8.h b/src/vectors-intel-8.h
index 35dffa6..34aa24f 100644
--- a/src/vectors-intel-8.h
+++ b/src/vectors-intel-8.h
@@ -22,13 +22,33 @@
// Build a vector with both elements equal to a
#define vec8_set1(a) (_mm_set1_pd(a))
// Build a vector with element 0 = a and element 1 = b
#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
-#define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI 9 compilers; use an empty asm whose "0" input ties x to the "=x" output aelt0 instead
+# define vec8_elt0(x) \
+({ \
+ CCTK_REAL8 aelt0; \
+ asm ("" : "=x" (aelt0) : "0" (x)); /* empty template: the asm adds no instructions of its own */ \
+ aelt0; \
+})
+#else // compilers that provide the intrinsic
+# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
+#endif // vec8_elt0
// Return element 1 (the upper half) of x: _mm_unpackhi_pd(x,x) copies
// the upper element into both halves, and vec8_elt0 then reads it.
// NOTE(review): the local is named xelt1 rather than xelt, presumably
// so that it cannot shadow the xelt that vec8_elt passes in.
#define vec8_elt1(x) \
({ \
CCTK_REAL8_VEC const xelt1=(x); \
vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \
})
-#define vec8_elt(x,d) \
+#if defined(__PGI) && defined (__amd64__)
+// PGI/amd64 variant of vec8_elt: return element d (0 or 1) of vector
+// x, selected with an if/else chain over vec8_elt0 and vec8_elt1.
+// d is parenthesised in every comparison so that an expression
+// argument (e.g. i&1) is compared as a whole instead of being
+// re-associated by operator precedence.
+// Caveats: d may be evaluated twice, so it must not have side
+// effects; for d other than 0 or 1 no branch assigns aelt and the
+// result is indeterminate.
+# define vec8_elt(x,d) \
+({ \
+ CCTK_REAL8_VEC const xelt=(x); \
+ CCTK_REAL8 aelt; \
+ if ((d)==0) aelt=vec8_elt0(xelt); \
+ else if ((d)==1) aelt=vec8_elt1(xelt); \
+ aelt; \
+})
+#else
+# define vec8_elt(x,d) \
({ \
CCTK_REAL8_VEC const xelt=(x); \
CCTK_REAL8 aelt; \
@@ -38,6 +58,7 @@
} \
aelt; \
})
+#endif
@@ -62,8 +83,14 @@
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
-#define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
-#define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
+#if 1 // always taken: a single store of the low (storel) or high (storeh) element; n is unused
+# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
+#else
+// This is slower; we would need a non-temporal read. Disabled alternative: fill in the other element from memory, then store the whole vector with vec8_store_nta
+# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1)))
+# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p))))
+#endif // partial stores