aboutsummaryrefslogtreecommitdiff
path: root/src/vectors-8-DoubleHummer.h
diff options
context:
space:
mode:
authoreschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>2011-06-06 10:11:44 +0000
committereschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>2011-06-06 10:11:44 +0000
commit2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch)
tree6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5 /src/vectors-8-DoubleHummer.h
parent5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff)
Introduce Cactus options for vectorisation
Introduce configuration-time options for vectorisation, including options to allow architecture-specific choices that may influence performance. Introduce "middle" masked stores for large vector sizes and small loops. Clean up and simplify some of the implementation code. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@10 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-8-DoubleHummer.h')
-rw-r--r--src/vectors-8-DoubleHummer.h180
1 files changed, 139 insertions, 41 deletions
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h
index 9311f62..16ed8a0 100644
--- a/src/vectors-8-DoubleHummer.h
+++ b/src/vectors-8-DoubleHummer.h
@@ -24,16 +24,17 @@
#define vec8_elt0(x) (__creal(x))
#define vec8_elt1(x) (__cimag(x))
-#define vec8_elt(x,d) \
-({ \
- CCTK_REAL8_VEC const xelt=(x); \
- CCTK_REAL8 aelt; \
- switch (d) { \
- case 0: aelt=vec8_elt0(xelt); break; \
- case 1: aelt=vec8_elt1(xelt); break; \
- } \
- aelt; \
-})
+#define vec8_elt(x_,d) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8 a; \
+ switch (d) { \
+ case 0: a=vec8_elt0(x); break; \
+ case 1: a=vec8_elt1(x); break; \
+ } \
+ a; \
+ })
@@ -41,13 +42,72 @@
// Load a vector from memory (aligned and unaligned); this loads from
// a reference to a scalar
-#define vec8_load(p) (__lfpd((double *)&(p)))
-#define vec8_loadu(p) (__lfpd((double *)&(p))) // this may not work
+#define vec8_load(p) (__lfpd((CCTK_REAL8 *)&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ vec8_set((&p)[0],(&p)[1]); \
+ })
+#else
+#if 0
+# define vec8_load_off1(p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ CCTK_REAL8_VEC const lo = __lfxd((CCTK_REAL8 *)(&p-1)); \
+ CCTK_REAL8_VEC const hi = __lfxd((CCTK_REAL8 *)(&p+1)); \
+ __fpsel(vec8_set(-1.0,+1.0),lo,hi); \
+ })
+#endif
+# define vec8_load_off1(p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ CCTK_REAL8_VEC const lo = vec8_load((&p)[-1]); \
+ CCTK_REAL8_VEC const hi = vec8_load((&p)[+1]); \
+ __fxmr(__fpsel(vec8_set(+1.0,-1.0),lo,hi)); \
+ })
+#endif
+#define vec8_loadu(p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ int const off = (ptrdiff_t)&p & 0xf; \
+ off==0 ? vec8_load(p) : vec8_load_off1(p); \
+ })
// Load a vector from memory that may or may not be aligned, as
// decided by the offset and the vector size
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec8_loadu_maybe(off,p) vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+ vec8_load(p) : \
+ vec8_load_off1(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+ ({ \
+ CCTK_REAL8 const& pp=(p_); \
+ CCTK_REAL8 const& p=pp; \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 || \
+ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+ vec8_loadu(p) : \
+ vec8_loadu_maybe(off1,p); \
+ })
+# endif
+#endif
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
@@ -59,6 +119,7 @@
// the non-temporal hint is probably ignored
#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=vec8_elt0(x))
#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec8_elt1(x))
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
@@ -71,7 +132,29 @@
#define k8add(x,y) (__fpadd(x,y))
#define k8sub(x,y) (__fpsub(x,y))
#define k8mul(x,y) (__fpmul(x,y))
-#define k8div(x,y) (__fpmul(x,__fpre(y)))
+// Estimate for reciprocal
+#define k8inv_init(x) (__fpre(x))
+// One Newton iteration for reciprocal
+#define k8inv_iter(x_,r_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const rr=(r_); \
+ CCTK_REAL8_VEC const r=rr; \
+ /* r + r * (vec8_set1(1.0) - x*r); */ \
+ k8madd(r, k8nmsub(x, r, vec8_set1(1.0)), r); \
+ })
+// Reciprocal: First estimate, then apply two Newton iterations
+#define k8inv(x_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const r0 = k8inv_init(x); \
+ CCTK_REAL8_VEC const r1 = k8inv_iter(x,r0); \
+ CCTK_REAL8_VEC const r2 = k8inv_iter(x,r1); \
+ r2; \
+ })
+#define k8div(x,y) (__fpmul(x,k8inv(y)))
// Fused multiply-add, defined as [+-]x*y[+-]z
#define k8madd(x,y,z) (__fpmadd(z,x,y))
@@ -80,29 +163,44 @@
#define k8nmsub(x,y,z) (__fpnmsub(z,x,y))
// Cheap functions
-#define k8fabs(x) (__fpabs(x))
-#define k8fmax(x,y) (__fpsel(__fpsub(y,x),x,y))
-#define k8fmin(x,y) (__fpsel(__fpsub(x,y),x,y))
-#define k8fnabs(x) (__fpnabs(x))
-
-#define k8exp(x) \
-({ \
- CCTK_REAL8_VEC const xexp=(x); \
- vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
- CCTK_REAL8_VEC const xlog=(x); \
- vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
- CCTK_REAL8_VEC const xpow=(x); \
- CCTK_REAL8 const apow=(a); \
- vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
-#define k8sqrt(x) \
-({ \
- CCTK_REAL8_VEC const xsqrt=(x); \
- vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \
-})
+#define k8fabs(x) (__fpabs(x))
+#define k8fmax(x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const yy=(y_); \
+ CCTK_REAL8_VEC const y=yy; \
+ __fpsel(k8sub(y,x),x,y); \
+ })
+#define k8fmin(x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const yy=(y_); \
+ CCTK_REAL8_VEC const y=yy; \
+ __fpsel(k8sub(x,y),x,y); \
+ })
+#define k8fnabs(x) (__fpnabs(x))
+
+// Expensive functions
+#define K8REPL(f,x_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x))); \
+ })
+#define K8REPL2(f,x_,a_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8 const aa=(a_); \
+ CCTK_REAL8 const a=aa; \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a)); \
+ })
+
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
+#define k8sqrt(x) K8REPL(sqrt,x)