Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r-- | src/vectors-8-SSE2.h | 216
1 file changed, 153 insertions, 63 deletions
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 34aa24f..4a3f4e2 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -6,6 +6,14 @@
 #include <emmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
@@ -22,43 +30,17 @@
 #define vec8_set1(a) (_mm_set1_pd(a))
 #define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
-#if defined(__PGI) && defined (__amd64__)
-// _mm_cvtsd_f64 does not exist on PGI 9 compilers
-# define vec8_elt0(x) \
-({ \
-  CCTK_REAL8 aelt0; \
-  asm ("" : "=x" (aelt0) : "0" (x)); \
-  aelt0; \
-})
-#else
-# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
-#endif
-#define vec8_elt1(x) \
-({ \
-  CCTK_REAL8_VEC const xelt1=(x); \
-  vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \
-})
-#if defined(__PGI) && defined (__amd64__)
-# define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  if (d==0) aelt=vec8_elt0(xelt); \
-  else if (d==1) aelt=vec8_elt1(xelt); \
-  aelt; \
-})
-#else
-# define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  switch (d) { \
-  case 0: aelt=vec8_elt0(xelt); break; \
-  case 1: aelt=vec8_elt1(xelt); break; \
-  } \
-  aelt; \
-})
-#endif
+// original order is 01
+#define vec8_swap10(x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \
+  })
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
@@ -68,29 +50,96 @@
 // a reference to a scalar
 #define vec8_load(p) (_mm_load_pd(&(p)))
 #define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# define vec8_load_off1(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    _mm_shuffle_pd(vec8_load((&p)[-1]), \
+                   vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \
+  })
+#endif
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
 // Implementation: Always use unaligned load
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+# define vec8_loadu_maybe(off,p) vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+      vec8_load(p) : \
+      vec8_load_off1(p); \
+  })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) \
+  vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+     (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+      vec8_loadu(p) : \
+      vec8_loadu_maybe(off1,p); \
+  })
+# endif
+#endif
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
-#define vec8_store(p,x) (_mm_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
-#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#define vec8_store(p,x)  (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) vec8_store(p,x)
+#else
+# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#endif
-// Store a lower or higher partial vector (aligned and non-temporal);
-// the non-temporal hint is probably ignored
-#if 1
+// Store a lower or higher partial vector (aligned and non-temporal)
+#if ! VECTORISE_STREAMING_STORES
 # define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
 # define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
 #else
+# if defined(__SSE4A__)
+# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+  (_mm_stream_sd(&(p)+1, vec8_swap10(x)))
+# else
+// TODO: use clflush once a whole cache line has been written (cache
+// lines are usually larger than the CPU vector size)
+# define vec8_store_nta_partial_lo(p_,x,n) \
+  ({ \
+    CCTK_REAL8& pp=(p_); \
+    CCTK_REAL8& p=pp; \
+    _mm_storel_pd(&p,x); \
+    /* _mm_clflush(&p); */ \
+  })
+# define vec8_store_nta_partial_hi(p_,x,n) \
+  ({ \
+    CCTK_REAL8& pp=(p_); \
+    CCTK_REAL8& p=pp; \
+    _mm_storeh_pd(&p+1,x); \
+    /* _mm_clflush(&p+1); */ \
+  })
+# endif
+#endif
+#if 0
 // This is slower; we would need a non-temporal read
-# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1)))
-# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p))))
+#define vec8_store_nta_partial_lo(p,x,n) \
+  vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1))
+#define vec8_store_nta_partial_hi(p,x,n) \
+  vec8_store_nta(p, _mm_loadl_pd(x,&(p)))
 #endif
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
@@ -107,6 +156,43 @@ static const union {
 } k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};
 #define k8abs_mask (k8sign_mask_union.v)
+// Choice [sign(x)>0 ? y : z]
+#ifdef __SSE4_1__
+# define k8ifthen(x,y,z) (_mm_blendv_pd(y,z,x))
+#elif 0
+# define k8ifthen(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    int const m = _mm_movemask_pd(x); \
+    CCTK_REAL8_VEC r; \
+    switch (m) { \
+    case 0: r = y; break; \
+    case 1: r = _mm_move_sd(y,z); break; \
+    case 2: r = _mm_move_sd(z,y); break; \
+    case 3: r = z; break; \
+    } \
+    r; \
+  })
+#else
+# define k8ifthen(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    CCTK_REAL8_VEC const c = _mm_and_pd(x,k8sign_mask); \
+    vec8_set(not vec8_elt0(c) ? vec8_elt0(y) : vec8_elt0(z), \
+             not vec8_elt1(c) ? vec8_elt1(y) : vec8_elt1(z)); \
+  })
+#endif
+
 // Operators
 #define k8pos(x) (x)
 #define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
@@ -130,19 +216,23 @@ static const union {
 #define k8sqrt(x) (_mm_sqrt_pd(x))
 // Expensive functions
-#define k8exp(x) \
-({ \
-  CCTK_REAL8_VEC const xexp=(x); \
-  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
-  CCTK_REAL8_VEC const xlog=(x); \
-  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
-  CCTK_REAL8_VEC const xpow=(x); \
-  CCTK_REAL8 const apow=(a); \
-  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
+#define K8REPL(f,x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    vec8_set(f(vec8_elt0(x)), \
+             f(vec8_elt1(x))); \
+  })
+#define K8REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8 const aa=(a_); \
+    CCTK_REAL8 const a=aa; \
+    vec8_set(f(vec8_elt0(x),a), \
+             f(vec8_elt1(x),a)); \
+  })
+
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
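For readers unfamiliar with the macros this commit touches, the following standalone sketch (not part of the commit, and independent of the Cactus CCTK_REAL8_VEC types) illustrates the SSE2 idiom the vec8_*/k8* macros wrap: load two doubles, select per lane with a comparison mask (what k8ifthen expresses; with SSE 4.1 the header uses _mm_blendv_pd instead, and its plain-SSE2 fallback extracts the lanes as scalars), and write the result with a streaming store as vec8_store_nta does when VECTORISE_STREAMING_STORES is enabled. All array and variable names here are illustrative only; compile e.g. with gcc -msse2 -O2.

// Standalone illustration (hypothetical), mirroring the macros above:
// clamp negative entries of x[] to zero, two doubles at a time.
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
  // 16-byte alignment so the aligned load and the streaming store are legal
  double x[4] __attribute__((aligned(16))) = { -1.0, 2.0, -3.0, 4.0 };
  double y[4] __attribute__((aligned(16)));
  __m128d const zero = _mm_set1_pd(0.0);            // cf. vec8_set1(0.0)

  for (int i = 0; i < 4; i += 2) {
    __m128d v    = _mm_load_pd(&x[i]);              // cf. vec8_load(x[i])
    __m128d mask = _mm_cmplt_pd(v, zero);           // all-ones lanes where v < 0
    // Per-lane select without SSE 4.1 _mm_blendv_pd:
    // (mask & zero) | (~mask & v); the header's own SSE2 fallback
    // instead extracts the lanes as scalars and rebuilds with vec8_set.
    __m128d r = _mm_or_pd(_mm_and_pd(mask, zero),
                          _mm_andnot_pd(mask, v));
    _mm_stream_pd(&y[i], r);                        // cf. vec8_store_nta(y[i], r)
  }
  _mm_sfence();                                     // order the streaming stores

  printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);  // expected: 0 2 0 4
  return 0;
}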