Introduce Cactus options for vectorisation

Introduce configuration-time options for vectorisation, including options to allow architecture-specific choices that may influence performance. Introduce "middle" masked stores for large vector sizes and small loops. Clean up and simplify some of the implementation code. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@10 105869f7-3296-0410-a4ea-f4349344b45a
author: eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2011-06-06 10:11:44 +0000
committer: eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2011-06-06 10:11:44 +0000
commit: 2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch)
tree: 6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5 /src/vectors-8-DoubleHummer.h
parent: 5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff)
1 files changed, 139 insertions, 41 deletions
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h
index 9311f62..16ed8a0 100644
--- a/src/vectors-8-DoubleHummer.h
+++ b/src/vectors-8-DoubleHummer.h
@@ -24,16 +24,17 @@
 
 #define vec8_elt0(x) (__creal(x))
 #define vec8_elt1(x) (__cimag(x))
-#define vec8_elt(x,d)                           \
-({                                              \
-  CCTK_REAL8_VEC const xelt=(x);                \
-  CCTK_REAL8 aelt;                              \
-  switch (d) {                                  \
-  case 0: aelt=vec8_elt0(xelt); break;          \
-  case 1: aelt=vec8_elt1(xelt); break;          \
-  }                                             \
-  aelt;                                         \
-})
+#define vec8_elt(x_,d)                          \
+  ({                                            \
+    CCTK_REAL8_VEC const xx=(x_);               \
+    CCTK_REAL8_VEC const x=xx;                  \
+    CCTK_REAL8 a;                               \
+    switch (d) {                                \
+    case 0: a=vec8_elt0(x); break;              \
+    case 1: a=vec8_elt1(x); break;              \
+    }                                           \
+    a;                                          \
+  })
 
 
 
@@ -41,13 +42,72 @@
 
 // Load a vector from memory (aligned and unaligned); this loads from
 // a reference to a scalar
-#define vec8_load(p)  (__lfpd((double *)&(p)))
-#define vec8_loadu(p) (__lfpd((double *)&(p))) // this may not work
+#define vec8_load(p)  (__lfpd((CCTK_REAL8 *)&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+#  define vec8_load_off1(p_)                    \
+  ({                                            \
+    CCTK_REAL8 const& pp=(p_);                  \
+    CCTK_REAL8 const& p=pp;                     \
+    vec8_set((&p)[0],(&p)[1]);                  \
+  })
+#else
+#if 0
+#  define vec8_load_off1(p_)                                    \
+  ({                                                            \
+    CCTK_REAL8 const& pp=(p_);                                  \
+    CCTK_REAL8 const& p=pp;                                     \
+    CCTK_REAL8_VEC const lo = __lfxd((CCTK_REAL8 *)(&p-1));     \
+    CCTK_REAL8_VEC const hi = __lfxd((CCTK_REAL8 *)(&p+1));     \
+    __fpsel(vec8_set(-1.0,+1.0),lo,hi);                         \
+  })
+#endif
+#  define vec8_load_off1(p_)                            \
+  ({                                                    \
+    CCTK_REAL8 const& pp=(p_);                          \
+    CCTK_REAL8 const& p=pp;                             \
+    CCTK_REAL8_VEC const lo = vec8_load((&p)[-1]);      \
+    CCTK_REAL8_VEC const hi = vec8_load((&p)[+1]);      \
+    __fxmr(__fpsel(vec8_set(+1.0,-1.0),lo,hi));         \
+  })
+#endif
+#define vec8_loadu(p_)                          \
+  ({                                            \
+    CCTK_REAL8 const& pp=(p_);                  \
+    CCTK_REAL8 const& p=pp;                     \
+    int const off = (ptrdiff_t)&p & 0xf;        \
+    off==0 ? vec8_load(p) : vec8_load_off1(p);  \
+  })
 
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset and the vector size
-#define vec8_loadu_maybe(off,p)             (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+#  define vec8_loadu_maybe(off,p)             vec8_loadu(p)
+#  define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+#  define vec8_loadu_maybe(off,p_)              \
+  ({                                            \
+    CCTK_REAL8 const& pp=(p_);                  \
+    CCTK_REAL8 const& p=pp;                     \
+    (off) % CCTK_REAL8_VEC_SIZE == 0 ?          \
+      vec8_load(p) :                            \
+      vec8_load_off1(p);                        \
+  })
+#  if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+#    define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p)
+#  else
+#    define vec8_loadu_maybe3(off1,off2,off3,p_)        \
+  ({                                                    \
+    CCTK_REAL8 const& pp=(p_);                          \
+    CCTK_REAL8 const& p=pp;                             \
+    ((off2) % CCTK_REAL8_VEC_SIZE != 0 ||               \
+     (off3) % CCTK_REAL8_VEC_SIZE != 0) ?               \
+      vec8_loadu(p) :                                   \
+      vec8_loadu_maybe(off1,p);                         \
+  })
+#  endif
+#endif
 
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
@@ -59,6 +119,7 @@
 // the non-temporal hint is probably ignored
 #define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=vec8_elt0(x))
 #define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec8_elt1(x))
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
 
 
 
@@ -71,7 +132,29 @@
 #define k8add(x,y) (__fpadd(x,y))
 #define k8sub(x,y) (__fpsub(x,y))
 #define k8mul(x,y) (__fpmul(x,y))
-#define k8div(x,y) (__fpmul(x,__fpre(y)))
+// Estimate for reciprocal
+#define k8inv_init(x) (__fpre(x))
+// One Newton iteration for reciprocal
+#define k8inv_iter(x_,r_)                               \
+  ({                                                    \
+    CCTK_REAL8_VEC const xx=(x_);                       \
+    CCTK_REAL8_VEC const x=xx;                          \
+    CCTK_REAL8_VEC const rr=(r_);                       \
+    CCTK_REAL8_VEC const r=rr;                          \
+    /* r + r * (vec8_set1(1.0) - x*r); */               \
+    k8madd(r, k8nmsub(x, r, vec8_set1(1.0)), r);        \
+  })
+// Reciprocal: First estimate, then apply two Newton iterations
+#define k8inv(x_)                               \
+  ({                                            \
+    CCTK_REAL8_VEC const xx=(x_);               \
+    CCTK_REAL8_VEC const x=xx;                  \
+    CCTK_REAL8_VEC const r0 = k8inv_init(x);    \
+    CCTK_REAL8_VEC const r1 = k8inv_iter(x,r0); \
+    CCTK_REAL8_VEC const r2 = k8inv_iter(x,r1); \
+    r2;                                         \
+  })
+#define k8div(x,y) (__fpmul(x,k8inv(y)))
 
 // Fused multiply-add, defined as [+-]x*y[+-]z
 #define k8madd(x,y,z)  (__fpmadd(z,x,y))
@@ -80,29 +163,44 @@
 #define k8nmsub(x,y,z) (__fpnmsub(z,x,y))
 
 // Cheap functions
-#define k8fabs(x)   (__fpabs(x))
-#define k8fmax(x,y) (__fpsel(__fpsub(y,x),x,y))
-#define k8fmin(x,y) (__fpsel(__fpsub(x,y),x,y))
-#define k8fnabs(x)  (__fpnabs(x))
-
-#define k8exp(x)                                        \
-({                                                      \
-  CCTK_REAL8_VEC const xexp=(x);                        \
-  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x)                                        \
-({                                                      \
-  CCTK_REAL8_VEC const xlog=(x);                        \
-  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a)                                                      \
-({                                                                      \
-  CCTK_REAL8_VEC const xpow=(x);                                        \
-  CCTK_REAL8 const apow=(a);                                            \
-  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow));       \
-})
-#define k8sqrt(x)                                               \
-({                                                              \
-  CCTK_REAL8_VEC const xsqrt=(x);                               \
-  vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt)));     \
-})
+#define k8fabs(x) (__fpabs(x))
+#define k8fmax(x_,y_)                           \
+  ({                                            \
+    CCTK_REAL8_VEC const xx=(x_);               \
+    CCTK_REAL8_VEC const x=xx;                  \
+    CCTK_REAL8_VEC const yy=(y_);               \
+    CCTK_REAL8_VEC const y=yy;                  \
+    __fpsel(k8sub(y,x),x,y);                    \
+  })
+#define k8fmin(x_,y_)                           \
+  ({                                            \
+    CCTK_REAL8_VEC const xx=(x_);               \
+    CCTK_REAL8_VEC const x=xx;                  \
+    CCTK_REAL8_VEC const yy=(y_);               \
+    CCTK_REAL8_VEC const y=yy;                  \
+    __fpsel(k8sub(x,y),x,y);                    \
+  })
+#define k8fnabs(x) (__fpnabs(x))
+
+// Expensive functions
+#define K8REPL(f,x_)                            \
+  ({                                            \
+    CCTK_REAL8_VEC const xx=(x_);               \
+    CCTK_REAL8_VEC const x=xx;                  \
+    vec8_set(f(vec8_elt0(x)),                   \
+             f(vec8_elt1(x)));                  \
+  })
+#define K8REPL2(f,x_,a_)                        \
+  ({                                            \
+    CCTK_REAL8_VEC const xx=(x_);               \
+    CCTK_REAL8_VEC const x=xx;                  \
+    CCTK_REAL8     const aa=(a_);               \
+    CCTK_REAL8     const a=aa;                  \
+    vec8_set(f(vec8_elt0(x),a),                 \
+             f(vec8_elt1(x),a));                \
+  })
+
+#define k8exp(x)   K8REPL(exp,x)
+#define k8log(x)   K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
+#define k8sqrt(x)  K8REPL(sqrt,x)
author	eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>	2011-06-06 10:11:44 +0000
committer	eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>	2011-06-06 10:11:44 +0000
commit	2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch)
tree	6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5 /src/vectors-8-DoubleHummer.h
parent	5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff)