diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-06-06 10:11:44 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-06-06 10:11:44 +0000 |
commit | 2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch) | |
tree | 6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5 /src/vectors-8-DoubleHummer.h | |
parent | 5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff) |
Introduce Cactus options for vectorisation
Introduce configuration-time options for vectorisation, including
options to allow architecture-specific choices that may influence
performance.
Introduce "middle" masked stores for large vector sizes and small
loops.
Clean up and simplify some of the implementation code.
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@10 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-8-DoubleHummer.h')
-rw-r--r-- | src/vectors-8-DoubleHummer.h | 180 |
1 files changed, 139 insertions, 41 deletions
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h index 9311f62..16ed8a0 100644 --- a/src/vectors-8-DoubleHummer.h +++ b/src/vectors-8-DoubleHummer.h @@ -24,16 +24,17 @@ #define vec8_elt0(x) (__creal(x)) #define vec8_elt1(x) (__cimag(x)) -#define vec8_elt(x,d) \ -({ \ - CCTK_REAL8_VEC const xelt=(x); \ - CCTK_REAL8 aelt; \ - switch (d) { \ - case 0: aelt=vec8_elt0(xelt); break; \ - case 1: aelt=vec8_elt1(xelt); break; \ - } \ - aelt; \ -}) +#define vec8_elt(x_,d) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8 a; \ + switch (d) { \ + case 0: a=vec8_elt0(x); break; \ + case 1: a=vec8_elt1(x); break; \ + } \ + a; \ + }) @@ -41,13 +42,72 @@ // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar -#define vec8_load(p) (__lfpd((double *)&(p))) -#define vec8_loadu(p) (__lfpd((double *)&(p))) // this may not work +#define vec8_load(p) (__lfpd((CCTK_REAL8 *)&(p))) +#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# define vec8_load_off1(p_) \ + ({ \ + CCTK_REAL8 const& pp=(p_); \ + CCTK_REAL8 const& p=pp; \ + vec8_set((&p)[0],(&p)[1]); \ + }) +#else +#if 0 +# define vec8_load_off1(p_) \ + ({ \ + CCTK_REAL8 const& pp=(p_); \ + CCTK_REAL8 const& p=pp; \ + CCTK_REAL8_VEC const lo = __lfxd((CCTK_REAL8 *)(&p-1)); \ + CCTK_REAL8_VEC const hi = __lfxd((CCTK_REAL8 *)(&p+1)); \ + __fpsel(vec8_set(-1.0,+1.0),lo,hi); \ + }) +#endif +# define vec8_load_off1(p_) \ + ({ \ + CCTK_REAL8 const& pp=(p_); \ + CCTK_REAL8 const& p=pp; \ + CCTK_REAL8_VEC const lo = vec8_load((&p)[-1]); \ + CCTK_REAL8_VEC const hi = vec8_load((&p)[+1]); \ + __fxmr(__fpsel(vec8_set(+1.0,-1.0),lo,hi)); \ + }) +#endif +#define vec8_loadu(p_) \ + ({ \ + CCTK_REAL8 const& pp=(p_); \ + CCTK_REAL8 const& p=pp; \ + int const off = (ptrdiff_t)&p & 0xf; \ + off==0 ? vec8_load(p) : vec8_load_off1(p); \ + }) // Load a vector from memory that may or may not be aligned, as // decided by the offset and the vector size -#define vec8_loadu_maybe(off,p) (vec8_loadu(p)) -#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p)) +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +# define vec8_loadu_maybe(off,p) vec8_loadu(p) +# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) +#else +# define vec8_loadu_maybe(off,p_) \ + ({ \ + CCTK_REAL8 const& pp=(p_); \ + CCTK_REAL8 const& p=pp; \ + (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ + vec8_load(p) : \ + vec8_load_off1(p); \ + }) +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p) +# else +# define vec8_loadu_maybe3(off1,off2,off3,p_) \ + ({ \ + CCTK_REAL8 const& pp=(p_); \ + CCTK_REAL8 const& p=pp; \ + ((off2) % CCTK_REAL8_VEC_SIZE != 0 || \ + (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ + vec8_loadu(p) : \ + vec8_loadu_maybe(off1,p); \ + }) +# endif +#endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar @@ -59,6 +119,7 @@ // the non-temporal hint is probably ignored #define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=vec8_elt0(x)) #define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec8_elt1(x)) +#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0) @@ -71,7 +132,29 @@ #define k8add(x,y) (__fpadd(x,y)) #define k8sub(x,y) (__fpsub(x,y)) #define k8mul(x,y) (__fpmul(x,y)) -#define k8div(x,y) (__fpmul(x,__fpre(y))) +// Estimate for reciprocal +#define k8inv_init(x) (__fpre(x)) +// One Newton iteration for reciprocal +#define k8inv_iter(x_,r_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8_VEC const rr=(r_); \ + CCTK_REAL8_VEC const r=rr; \ + /* r + r * (vec8_set1(1.0) - x*r); */ \ + k8madd(r, k8nmsub(x, r, vec8_set1(1.0)), r); \ + }) +// Reciprocal: First estimate, then apply two Newton iterations +#define k8inv(x_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8_VEC const r0 = k8inv_init(x); \ + CCTK_REAL8_VEC const r1 = k8inv_iter(x,r0); \ + CCTK_REAL8_VEC const r2 = k8inv_iter(x,r1); \ + r2; \ + }) +#define k8div(x,y) (__fpmul(x,k8inv(y))) // Fused multiply-add, defined as [+-]x*y[+-]z #define k8madd(x,y,z) (__fpmadd(z,x,y)) @@ -80,29 +163,44 @@ #define k8nmsub(x,y,z) (__fpnmsub(z,x,y)) // Cheap functions -#define k8fabs(x) (__fpabs(x)) -#define k8fmax(x,y) (__fpsel(__fpsub(y,x),x,y)) -#define k8fmin(x,y) (__fpsel(__fpsub(x,y),x,y)) -#define k8fnabs(x) (__fpnabs(x)) - -#define k8exp(x) \ -({ \ - CCTK_REAL8_VEC const xexp=(x); \ - vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \ -}) -#define k8log(x) \ -({ \ - CCTK_REAL8_VEC const xlog=(x); \ - vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \ -}) -#define k8pow(x,a) \ -({ \ - CCTK_REAL8_VEC const xpow=(x); \ - CCTK_REAL8 const apow=(a); \ - vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \ -}) -#define k8sqrt(x) \ -({ \ - CCTK_REAL8_VEC const xsqrt=(x); \ - vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \ -}) +#define k8fabs(x) (__fpabs(x)) +#define k8fmax(x_,y_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8_VEC const yy=(y_); \ + CCTK_REAL8_VEC const y=yy; \ + __fpsel(k8sub(y,x),x,y); \ + }) +#define k8fmin(x_,y_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8_VEC const yy=(y_); \ + CCTK_REAL8_VEC const y=yy; \ + __fpsel(k8sub(x,y),x,y); \ + }) +#define k8fnabs(x) (__fpnabs(x)) + +// Expensive functions +#define K8REPL(f,x_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + vec8_set(f(vec8_elt0(x)), \ + f(vec8_elt1(x))); \ + }) +#define K8REPL2(f,x_,a_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8 const aa=(a_); \ + CCTK_REAL8 const a=aa; \ + vec8_set(f(vec8_elt0(x),a), \ + f(vec8_elt1(x),a)); \ + }) + +#define k8exp(x) K8REPL(exp,x) +#define k8log(x) K8REPL(log,x) +#define k8pow(x,a) K8REPL2(pow,x,a) +#define k8sqrt(x) K8REPL(sqrt,x) |