Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r--  src/vectors-8-SSE2.h  216
1 file changed, 153 insertions(+), 63 deletions(-)
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 34aa24f..4a3f4e2 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -6,6 +6,14 @@
#include <emmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
@@ -22,43 +30,17 @@
#define vec8_set1(a) (_mm_set1_pd(a))
#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
-#if defined(__PGI) && defined (__amd64__)
-// _mm_cvtsd_f64 does not exist on PGI 9 compilers
-# define vec8_elt0(x) \
-({ \
- CCTK_REAL8 aelt0; \
- asm ("" : "=x" (aelt0) : "0" (x)); \
- aelt0; \
-})
-#else
-# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
-#endif
-#define vec8_elt1(x) \
-({ \
- CCTK_REAL8_VEC const xelt1=(x); \
- vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \
-})
-#if defined(__PGI) && defined (__amd64__)
-# define vec8_elt(x,d) \
-({ \
- CCTK_REAL8_VEC const xelt=(x); \
- CCTK_REAL8 aelt; \
- if (d==0) aelt=vec8_elt0(xelt); \
- else if (d==1) aelt=vec8_elt1(xelt); \
- aelt; \
-})
-#else
-# define vec8_elt(x,d) \
-({ \
- CCTK_REAL8_VEC const xelt=(x); \
- CCTK_REAL8 aelt; \
- switch (d) { \
- case 0: aelt=vec8_elt0(xelt); break; \
- case 1: aelt=vec8_elt1(xelt); break; \
- } \
- aelt; \
-})
-#endif
+// original order is 01
+#define vec8_swap10(x_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \
+ })
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
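
A minimal usage sketch of the new element-access macros (illustrative only, not part of the patch); it assumes CCTK_REAL8 is double and CCTK_REAL8_VEC is __m128d, as set up earlier in this header:

static inline void example8_elts(void)
{
  CCTK_REAL8_VEC const v = vec8_set(1.0, 2.0); // element 0 = 1.0, element 1 = 2.0
  CCTK_REAL8_VEC const w = vec8_swap10(v);     // element 0 = 2.0, element 1 = 1.0
  CCTK_REAL8 const a = vec8_elt0(v);           // 1.0
  CCTK_REAL8 const b = vec8_elt(w, 1);         // 1.0
  (void)a; (void)b;
}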
@@ -68,29 +50,96 @@
// a reference to a scalar
#define vec8_load(p) (_mm_load_pd(&(p)))
#define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# define vec8_load_off1(p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ _mm_shuffle_pd(vec8_load((&p)[-1]), \
+ vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \
+ })
+#endif
// Load a vector from memory that may or may not be aligned, as
// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+# define vec8_loadu_maybe(off,p) vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+ vec8_load(p) : \
+ vec8_load_off1(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume that array sizes in the x direction are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) \
+ vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+ vec8_loadu(p) : \
+ vec8_loadu_maybe(off1,p); \
+ })
+# endif
+#endif
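
A hedged sketch of the intended call pattern (not part of the patch): stencil code passes its per-direction offsets so that alignment can be decided without an unconditional unaligned load. The grid array u, the strides ni and nj, and the linear index ijk are hypothetical names:

// hypothetical stencil access at offset (di,dj,dk) from the point with linear index ijk
#define EXAMPLE8_GFOFFSET(u, ijk, ni,nj, di,dj,dk) \
  vec8_loadu_maybe3(di, dj, dk, \
                    (u)[(ijk) + (di) + (ni)*((dj) + (nj)*(dk))])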
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
-#define vec8_store(p,x) (_mm_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
-#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#define vec8_store(p,x) (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) vec8_store(p,x)
+#else
+# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#endif
-// Store a lower or higher partial vector (aligned and non-temporal);
-// the non-temporal hint is probably ignored
-#if 1
+// Store a lower or higher partial vector (aligned and non-temporal)
+#if ! VECTORISE_STREAMING_STORES
# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
#else
+# if defined(__SSE4A__)
+# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+ (_mm_stream_sd(&(p)+1, vec8_swap10(x)))
+# else
+// TODO: use clflush once a whole cache line has been written (cache
+// lines are usually larger than the CPU vector size)
+# define vec8_store_nta_partial_lo(p_,x,n) \
+ ({ \
+ CCTK_REAL8& pp=(p_); \
+ CCTK_REAL8& p=pp; \
+ _mm_storel_pd(&p,x); \
+ /* _mm_clflush(&p); */ \
+ })
+# define vec8_store_nta_partial_hi(p_,x,n) \
+ ({ \
+ CCTK_REAL8& pp=(p_); \
+ CCTK_REAL8& p=pp; \
+ _mm_storeh_pd(&p+1,x); \
+ /* _mm_clflush(&p+1); */ \
+ })
+# endif
+#endif
+#if 0
// This is slower; we would need a non-temporal read
-# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1)))
-# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p))))
+#define vec8_store_nta_partial_lo(p,x,n) \
+ vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1))
+#define vec8_store_nta_partial_hi(p,x,n) \
+ vec8_store_nta(p, _mm_loadl_pd(x,&(p)))
#endif
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
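
A hedged sketch of how the partial stores are meant to be used at a loop tail (not part of the patch); the destination array a and the remaining element count n are hypothetical:

static inline void example8_store_tail(CCTK_REAL8* const a,
                                       CCTK_REAL8_VEC const x,
                                       int const n)
{
  if (n >= CCTK_REAL8_VEC_SIZE) {
    vec8_store_nta(a[0], x);               // a full vector still fits
  } else {
    vec8_store_nta_partial_lo(a[0], x, n); // only the lowest n (here 1) elements
  }
}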
@@ -107,6 +156,43 @@ static const union {
} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};
#define k8abs_mask (k8abs_mask_union.v)
+// Choice [sign(x)>0 ? y : z]
+#ifdef __SSE4_1__
+# define k8ifthen(x,y,z) (_mm_blendv_pd(y,z,x))
+#elif 0
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const yy=(y_); \
+ CCTK_REAL8_VEC const y=yy; \
+ CCTK_REAL8_VEC const zz=(z_); \
+ CCTK_REAL8_VEC const z=zz; \
+ int const m = _mm_movemask_pd(x); \
+ CCTK_REAL8_VEC r; \
+ switch (m) { \
+ case 0: r = y; break; \
+ case 1: r = _mm_move_sd(y,z); break; \
+ case 2: r = _mm_move_sd(z,y); break; \
+ case 3: r = z; break; \
+ } \
+ r; \
+ })
+#else
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const yy=(y_); \
+ CCTK_REAL8_VEC const y=yy; \
+ CCTK_REAL8_VEC const zz=(z_); \
+ CCTK_REAL8_VEC const z=zz; \
+ /* select on the sign bit of each element; testing the masked */ \
+ /* value against zero would treat -0.0 as false */ \
+ int const m = _mm_movemask_pd(x); \
+ vec8_set(m & 1 ? vec8_elt0(z) : vec8_elt0(y), \
+ m & 2 ? vec8_elt1(z) : vec8_elt1(y)); \
+ })
+#endif
+
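As a quick illustration of the k8ifthen semantics (a sketch, not part of the patch): selecting on the sign bit yields, for example, an absolute value, equivalent to masking with k8abs_mask:

static inline CCTK_REAL8_VEC example8_fabs(CCTK_REAL8_VEC const a)
{
  // sign(a)>0 ? a : -a; the xor flips only the sign bit
  return k8ifthen(a, a, _mm_xor_pd(a, k8sign_mask));
}
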
// Operators
#define k8pos(x) (x)
#define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
@@ -130,19 +216,23 @@ static const union {
#define k8sqrt(x) (_mm_sqrt_pd(x))
// Expensive functions
-#define k8exp(x) \
-({ \
- CCTK_REAL8_VEC const xexp=(x); \
- vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
- CCTK_REAL8_VEC const xlog=(x); \
- vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
- CCTK_REAL8_VEC const xpow=(x); \
- CCTK_REAL8 const apow=(a); \
- vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
+#define K8REPL(f,x_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x))); \
+ })
+#define K8REPL2(f,x_,a_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8 const aa=(a_); \
+ CCTK_REAL8 const a=aa; \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a)); \
+ })
+
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
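
The same pattern extends to other scalar libm calls; a sketch (not part of the patch) with hypothetical wrapper names:

#define k8sin(x)     K8REPL(sin,x)
#define k8cos(x)     K8REPL(cos,x)
#define k8atan2(x,a) K8REPL2(atan2,x,a)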