author     eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>   2012-12-31 14:31:30 +0000
committer  eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>   2012-12-31 14:31:30 +0000
commit     33e3cc3261b81b28f83a6e4d41d1ae97944dfa49 (patch)
tree       48fead22cea9d108d57f5ca8f375cd9de8a6dbd1
parent     19379d2ae9da591f4f7e7641a679461062b953c9 (diff)
Many corrections
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@76 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--   src/vectors-8-QPX.h   189
1 file changed, 109 insertions(+), 80 deletions(-)
diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h
index 40f6c18..631c974 100644
--- a/src/vectors-8-QPX.h
+++ b/src/vectors-8-QPX.h
@@ -35,9 +35,9 @@ struct CCTK_REAL8_VEC {
#define CCTK_REAL8_VEC_SIZE 4
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
+//#define CCTK_INTEGER8 CCTK_REAL8
#define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
@@ -47,6 +47,10 @@ struct CCTK_REAL8_VEC {
#define vec8_set1(a) (vec_splats(a))
#define vec8_set(a,b,c,d) ((vector4double){a,b,c,d})
+#define vec8_b2r(b) ((b)?+1.0:-1.0)
+#define vec8b_set(a,b,c,d) \
+ ((vector4double){vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d)})
+
#define vec8_elt0(x) (vec_extract(x,0))
#define vec8_elt1(x) (vec_extract(x,1))
#define vec8_elt2(x) (vec_extract(x,2))
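
The new vec8_b2r/vec8b_set helpers encode booleans as signed reals, matching the convention stated near the end of the file: canonical true is +1.0, canonical false is -1.0. A minimal scalar sketch of that mapping; r2b_ref is a hypothetical inverse added only for illustration.

/* Sketch only: the boolean <-> real mapping assumed by vec8_b2r/vec8b_set. */
static inline double b2r_ref(int b)    { return b ? +1.0 : -1.0; }
static inline int    r2b_ref(double r) { return r >= 0.0; }  /* >=0 is true, -0 is true, nan is false */
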
@@ -65,6 +69,7 @@ struct CCTK_REAL8_VEC {
CCTK_REAL8 const& p__=(p_); \
CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \
vector4double v1, v2, vp; \
+ /* code taken from IBM's compiler documentation */ \
v1 = vec_ld(0,&p); /* load the left part of the vector */ \
v2 = vec_ld(32,&p); /* load the right part of the vector */ \
vp = vec_lvsl(0,&p); /* generate control value */ \
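
For readers without IBM's documentation at hand: the unaligned load combines the two aligned 32-byte blocks that straddle the address, using a permute control derived from that address. A sketch of the same sequence as a standalone function, using only the intrinsics that appear in the macro (assumed QPX signatures for vec_ld, vec_lvsl, and vec_perm):

/* Sketch, not the macro itself: permute-based unaligned load of 4 doubles. */
static inline vector4double loadu_sketch(double *p)
{
  vector4double v1 = vec_ld(0, p);     /* aligned block holding the first elements */
  vector4double v2 = vec_ld(32, p);    /* next aligned block holding the rest */
  vector4double vp = vec_lvsl(0, p);   /* permute control from the address offset */
  return vec_perm(v1, v2, vp);         /* shift the wanted doubles into one vector */
}
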
@@ -105,28 +110,29 @@ struct CCTK_REAL8_VEC {
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
#define vec8_store(p,x) (vec_sta(x,0,&(p)))
-#define vec8_storeu(p_,x_) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8_VEC x__=(x_); \
- CCTK_REAL8& p=p__; \
- CCTK_REAL8_VEC x=x__; \
- CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \
- /* generate insert masks */ \
- vp = vec_lvsr(0,&p); \
- m1 = k8lfalse; \
- m2 = k8ltrue; \
- m3 = vec_perm(m1,m2,vp); \
- /* get existing data */ \
- v1 = vec_ld(0,&p); \
- v2 = vec_ld(32,&p); \
- /* permute and insert */ \
- v3 = vec_perm(x,x,vp); \
- v1 = vec_sel(v1,v3,m3); \
- v2 = vec_sel(v3,v2,m3); \
- /* store data back */ \
- vec_st(0,&p,v1); \
- vec_st(32,&p,v2); \
+#define vec8_storeu(p_,x_) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \
+ /* code taken from IBM's compiler documentation */ \
+ /* generate insert masks */ \
+ vp = vec_lvsr(0,&p); \
+ m1 = k8lfalse; \
+ m2 = k8ltrue; \
+ m3 = vec_perm(m1,m2,vp); \
+ /* get existing data */ \
+ v1 = vec_ld(0,&p); \
+ v2 = vec_ld(32,&p); \
+ /* permute and insert */ \
+ v3 = vec_perm(x,x,vp); \
+ v1 = vec_sel(v1,v3,m3); \
+ v2 = vec_sel(v3,v2,m3); \
+ /* store data back */ \
+ vec_st(0,&p,v1); \
+ vec_st(32,&p,v2); \
})
#define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache
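
The rewritten vec8_storeu is a read-modify-write: it loads the two aligned blocks covering the destination, rotates x into position, and uses a mask built from k8lfalse/k8ltrue to insert only the lanes that belong to each block. A scalar model of the intended effect, assuming p sits k doubles past a 32-byte boundary (a sketch, not the intrinsic sequence):

#include <stddef.h>
#include <stdint.h>

/* Sketch: what the masked unaligned store is meant to do, lane by lane. */
static void storeu_model(double *p, const double x[4])
{
  size_t k = ((uintptr_t)p / sizeof(double)) % 4;  /* offset of p within a vector */
  double *lo = p - k;                              /* first aligned block touched */
  double *hi = lo + 4;                             /* second aligned block touched */
  for (size_t i = 0; i < 4; ++i) {
    if (i >= k) lo[i] = x[i - k];                  /* lanes of x landing in the low block */
    else        hi[i] = x[4 - k + i];              /* lanes of x landing in the high block */
  }
}
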
@@ -140,19 +146,34 @@ struct CCTK_REAL8_VEC {
ptrdiff_t const imin=imin__; \
ptrdiff_t const imax=imax__; \
\
- v8stp_all = i-imin>=0 and i-imax<=-CCTK_REAL8_VEC_SIZE; \
+ v8stp_all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1<imax; \
\
if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
CCTK_REAL8_VEC vp_lo, vp_hi, mask_lo, mask_hi; \
- vp_lo = vec_lvsl(i-imin, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \
- mask_lo = (i-imin>=0 ? \
+ /* this is correct but slow */ \
+ /* \
+ mask_lo = vec8b_set(i+0>=imin, i+1>=imin, i+2>=imin, i+3>=imin); \
+ mask_hi = vec8b_set(i+0<imax, i+1<imax, i+2<imax, i+3<imax); \
+ */ \
+ /* Note: vec_lvsl(i,p) = &p[i] / 8 % 4 \
+ Note: vec_lvsr(i,p) = -&p[i] / 8 % 4 \
+ /8: 8 bytes per double \
+ %4: 4 doubles per vector \
+ */ \
+ /* We assume p[i] is aligned */ \
+ /* Ensure at least one vector element is inside the active region */ \
+ assert(i-imin>=-(CCTK_REAL8_VEC_SIZE-1)); \
+ vp_lo = vec_lvsl(8 * (i-imin), (CCTK_REAL*)0); \
+ mask_lo = (i-imin >= 0 ? \
k8ltrue : \
vec_perm(k8lfalse, k8ltrue, vp_lo)); \
- vp_hi = vec_lvsl(i-imax, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \
- mask_hi = (i-imax<=-CCTK_REAL8_VEC_SIZE ? \
+ /* Ensure at least one vector element is inside the active region */ \
+ assert(i<imax); \
+ vp_hi = vec_lvsl(8 * (i-imax), (CCTK_REAL*)0); \
+ mask_hi = (i-imax < -(CCTK_REAL8_VEC_SIZE-1) ? \
k8ltrue : \
vec_perm(k8ltrue, k8lfalse, vp_hi)); \
- v8stp_mask = vec_and(mask_lo, mask_hi); \
+ v8stp_mask = k8land(mask_lo, mask_hi); \
} \
})
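
The Note above compresses the addressing rule used to build the masks. As the macro applies it, the first argument of vec_lvsl/vec_lvsr is a byte offset added to the address of the second, and the mask construction only depends on that effective address divided by 8, modulo 4. A small sketch of that quantity (a reading of the macro's usage, not a restatement of IBM's documentation):

#include <stddef.h>
#include <stdint.h>

/* Sketch: element offset that the partial-store masks are derived from. */
static inline int qpx_elt_offset(ptrdiff_t byte_offset, const double *p)
{
  uintptr_t addr = (uintptr_t)p + (uintptr_t)byte_offset;  /* effective address */
  return (int)((addr / 8) % 4);  /* /8: bytes per double; %4: doubles per vector */
}

/* With p == 0 and byte_offset == 8*(i-imin) this is (i-imin) mod 4, i.e. the
   number of leading lanes that fall below imin and must be masked out. */
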
#define vec8_store_nta_partial(p_,x_) \
@@ -164,49 +185,57 @@ struct CCTK_REAL8_VEC {
if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
vec8_store(p, x); \
} else { \
- vec8_store(p, vec_sel(vec8_load(p), x, v8stp_mask)); \
+ vec8_store(p, k8ifthen(v8stp_mask, x, vec8_load(p))); \
} \
})
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
-#define vec8_store_nta_partial_lo(p_,x_,n) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8_VEC x__=(x_); \
- CCTK_REAL8& p=p__; \
- CCTK_REAL8_VEC x=x__; \
- CCTK_REAL8_VEC vp_hi, mask_hi; \
- vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-n, (CCTK_REAL*)0); \
- mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \
- vec8_store(p, vec_sel(vec8_load(p), x, mask_hi)); \
+#define vec8_store_nta_partial_lo(p_,x_,n) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC vp, mask; \
+ /* Ensure at least one but not all vector elements are active */ \
+ assert(n>0 and n<CCTK_REAL8_VEC_SIZE-1); \
+ vp = vec_lvsl(-8 * n, (CCTK_REAL*)0); \
+ mask = vec_perm(k8ltrue, k8lfalse, vp); \
+ vec8_store(p, k8ifthen(mask, x, vec8_load(p))); \
})
-#define vec8_store_nta_partial_hi(p_,x_,n) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8_VEC x__=(x_); \
- CCTK_REAL8& p=p__; \
- CCTK_REAL8_VEC x=x__; \
- CCTK_REAL8_VEC vp_lo, mask_lo; \
- vp_lo = vec_lvsl(n, (CCTK_REAL*)0); \
- mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \
- vec8_store(p, vec_sel(vec8_load(p), x, mask_lo)); \
+#define vec8_store_nta_partial_hi(p_,x_,n) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC vp, mask; \
+ /* Ensure at least one but not all vector elements are active */ \
+ assert(n>0 and n<CCTK_REAL8_VEC_SIZE-1); \
+ vp = vec_lvsl(8 * n, (CCTK_REAL*)0); \
+ mask = vec_perm(k8lfalse, k8ltrue, vp); \
+ vec8_store(p, k8ifthen(mask, x, vec8_load(p))); \
})
-#define vec8_store_nta_partial_mid(p_,x_,nlo,nhi) \
- ({ \
- CCTK_REAL8& p__=(p_); \
- CCTK_REAL8_VEC x__=(x_); \
- CCTK_REAL8 p=p__; \
- CCTK_REAL8_VEC x=x__; \
- CCTK_REAL8_VEC vp_lo, mask_lo; \
- vp_lo = vec_lvsl(nhi, (CCTK_REAL*)0); \
- mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \
- CCTK_REAL8_VEC vp_hi, mask_hi; \
- vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-nlo, (CCTK_REAL*)0); \
- mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \
- CCTK_REAL8_VEC mask; \
- mask = vec_and(mask_lo, mask_hi); \
- vec8_store(p, vec_sel(vec8_load(p), x, mask)); \
+#define vec8_store_nta_partial_mid(p_,x_,nlo,nhi) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8 p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC vp_lo, mask_lo; \
+ /* Ensure at least one but not all vector elements are active */ \
+ assert(nlo>0 and nlo<CCTK_REAL8_VEC_SIZE-1); \
+ vp_lo = vec_lvsl(-8 * nlo, (CCTK_REAL*)0); \
+ mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \
+ CCTK_REAL8_VEC vp_hi, mask_hi; \
+ /* Ensure at least one but not all vector elements are active */ \
+ assert(nhi>0 and nhi<CCTK_REAL8_VEC_SIZE-1); \
+ vp_hi = vec_lvsl(8 * nhi, (CCTK_REAL*)0); \
+ mask_hi = vec_perm(k8lfalse, k8ltrue, vp_hi); \
+ CCTK_REAL8_VEC mask; \
+ mask = vec_and(mask_lo, mask_hi); \
+ vec8_store(p, k8ifthen(mask, x, vec8_load(p))); \
})
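
As rewritten, the lower and upper partial stores differ only in which lanes the mask keeps: _lo keeps the first n lanes and _hi the last n, while _mid combines two such masks. A scalar model of the lower/upper cases as I read the mask construction (a sketch, not the intrinsics):

/* Sketch: which elements the lower/upper partial stores are meant to write. */
static void store_partial_lo_model(double *p, const double x[4], int n)
{
  for (int i = 0; i < n; ++i) p[i] = x[i];        /* lowest n lanes only */
}
static void store_partial_hi_model(double *p, const double x[4], int n)
{
  for (int i = 4 - n; i < 4; ++i) p[i] = x[i];    /* highest n lanes only */
}
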
@@ -222,10 +251,10 @@ struct CCTK_REAL8_VEC {
#define k8div(x,y) (vec_swdiv_nochk(x,y))
// Fused multiply-add, defined as [+-]x*y[+-]z
-#define k8madd(x,y,z) (vec_madd(z,x,y))
-#define k8msub(x,y,z) (vec_msub(z,x,y))
-#define k8nmadd(x,y,z) (vec_nmadd(z,x,y))
-#define k8nmsub(x,y,z) (vec_nmsub(z,x,y))
+#define k8madd(x,y,z) (vec_madd(x,y,z))
+#define k8msub(x,y,z) (vec_msub(x,y,z))
+#define k8nmadd(x,y,z) (vec_nmadd(x,y,z))
+#define k8nmsub(x,y,z) (vec_nmsub(x,y,z))
// Cheap functions
#define k8copysign(x,y) (vec_cpsgn(y,x))
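
The fused multiply-add fix puts the operands into the order the vec_madd family expects. For reference, a scalar statement of the "[+-]x*y[+-]z" convention as I read it (a sketch, not the QPX intrinsics):

/* Sketch: scalar reference semantics for the fused multiply-add macros. */
static inline double madd_ref (double x, double y, double z) { return  x*y + z; }
static inline double msub_ref (double x, double y, double z) { return  x*y - z; }
static inline double nmadd_ref(double x, double y, double z) { return -x*y - z; }
static inline double nmsub_ref(double x, double y, double z) { return -x*y + z; }
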
@@ -236,7 +265,7 @@ struct CCTK_REAL8_VEC {
CCTK_REAL8_VEC y__=(y_); \
CCTK_REAL8_VEC x=x__; \
CCTK_REAL8_VEC y=y__; \
- vec_sel(vec_cmpgt(y,x),y,x); \
+ k8ifthen(k8cmplt(x,y),y,x); \
})
#define k8fmin(x_,y_) \
({ \
@@ -244,7 +273,7 @@ struct CCTK_REAL8_VEC {
CCTK_REAL8_VEC y__=(y_); \
CCTK_REAL8_VEC x=x__; \
CCTK_REAL8_VEC y=y__; \
- vec_sel(vec_cmplt(y,x),y,x); \
+ k8ifthen(k8cmpgt(x,y),y,x); \
})
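
Both extrema are now expressed through the comparison and select macros: k8fmax picks y in the lanes where x<y and x elsewhere, and k8fmin is the mirror image. A scalar model (a sketch):

/* Sketch: scalar models of the select-based extrema above. */
static inline double fmax_ref(double x, double y) { return x < y ? y : x; }
static inline double fmin_ref(double x, double y) { return x > y ? y : x; }
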
#define k8fnabs(x) (vec_nabs(x))
#define k8sgn(x_) \
@@ -253,9 +282,9 @@ struct CCTK_REAL8_VEC {
CCTK_REAL8_VEC x=x__; \
CCTK_REAL8_VEC one, zero, iszero; \
one = k8ltrue; \
- zero = vec_sub(one, one); \
- iszero = vec_cmpeq(x, zero); \
- k8ifthen(iszero, zero, vec_cpsgn(one, x)); \
+ zero = k8sub(one, one); \
+ iszero = k8cmpeq(x, zero); \
+ k8ifthen(iszero, zero, k8copysign(one, x)); \
})
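
k8sgn derives 1.0 from the canonical true value, subtracts it from itself to get 0.0, and then returns zero for zero and a copysigned one otherwise. A scalar model, with math.h copysign standing in for vec_cpsgn (a sketch):

#include <math.h>

/* Sketch: scalar model of k8sgn -- 0 for zero, otherwise +/-1 with x's sign. */
static inline double sgn_ref(double x) { return x == 0.0 ? 0.0 : copysign(1.0, x); }
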
#define k8sqrt(x) (vec_swsqrt_nochk(x))
@@ -311,19 +340,19 @@ struct CCTK_REAL8_VEC {
// canonical true is +1.0, canonical false is -1.0
// >=0 is true, -0 is true, nan is false
-#define k8lfalse \
+#define k8lfalse \
({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0x0); })
-#define k8ltrue \
+#define k8ltrue \
({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0xf); })
#define k8lnot(x) (vec_not(x))
#define k8land(x,y) (vec_and(x,y))
#define k8lor(x,y) (vec_or(x,y))
#define k8lxor(x,y) (vec_xor(x,y))
-#define k8ifthen(x,y,z) (vec_sel(z,x,y))
+#define k8ifthen(x,y,z) (vec_sel(z,y,x))
#define k8cmpeq(x,y) (vec_cmpeq(x,y))
-#define k8cmpne(x,y) (vec_not(vec_cmpeq(x,y)))
+#define k8cmpne(x,y) (k8lnot(vec_cmpeq(x,y)))
#define k8cmpgt(x,y) (vec_cmpgt(x,y))
-#define k8cmpge(x,y) (vec_not(vec_cmplt(x,y)))
+#define k8cmpge(x,y) (k8lnot(vec_cmplt(x,y)))
#define k8cmplt(x,y) (vec_cmplt(x,y))
#define k8cmple(x,y) (vec_not(vec_cmpgt(x,y)))
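
With booleans represented as signed reals, k8ifthen(x,y,z) now selects y in the lanes where x is true and z elsewhere, and the missing comparisons are built by negating their opposites. A scalar model of these definitions, using the stated convention that >=0 counts as true (a sketch):

/* Sketch: scalar models of select and the derived comparisons. */
static inline double ifthen_ref(double b, double y, double z) { return b >= 0.0 ? y : z; }
static inline double cmpne_ref (double x, double y) { return x != y   ? +1.0 : -1.0; }
static inline double cmpge_ref (double x, double y) { return !(x < y) ? +1.0 : -1.0; }
static inline double cmple_ref (double x, double y) { return !(x > y) ? +1.0 : -1.0; }
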