diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2012-12-31 14:31:30 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2012-12-31 14:31:30 +0000 |
commit | 33e3cc3261b81b28f83a6e4d41d1ae97944dfa49 (patch) | |
tree | 48fead22cea9d108d57f5ca8f375cd9de8a6dbd1 | |
parent | 19379d2ae9da591f4f7e7641a679461062b953c9 (diff) |
Many corrections
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@76 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r-- | src/vectors-8-QPX.h | 189 |
1 file changed, 109 insertions, 80 deletions
diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h index 40f6c18..631c974 100644 --- a/src/vectors-8-QPX.h +++ b/src/vectors-8-QPX.h @@ -35,9 +35,9 @@ struct CCTK_REAL8_VEC { #define CCTK_REAL8_VEC_SIZE 4 // Integer and boolean types corresponding to this real type -#define CCTK_INTEGER8 CCTK_REAL8 +//#define CCTK_INTEGER8 CCTK_REAL8 #define CCTK_BOOLEAN8 CCTK_REAL8 -#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC +//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC #define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC @@ -47,6 +47,10 @@ struct CCTK_REAL8_VEC { #define vec8_set1(a) (vec_splats(a)) #define vec8_set(a,b,c,d) ((vector4double){a,b,c,d}) +#define vec8_b2r(b) ((b)?+1.0:-1.0) +#define vec8b_set(a,b,c,d) \ + ((vector4double){vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d)}) + #define vec8_elt0(x) (vec_extract(x,0)) #define vec8_elt1(x) (vec_extract(x,1)) #define vec8_elt2(x) (vec_extract(x,2)) @@ -65,6 +69,7 @@ struct CCTK_REAL8_VEC { CCTK_REAL8 const& p__=(p_); \ CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \ vector4double v1, v2, vp; \ + /* code taken from IBM's compiler documentation */ \ v1 = vec_ld(0,&p); /* load the left part of the vector */ \ v2 = vec_ld(32,&p); /* load the right part of the vector */ \ vp = vec_lvsl(0,&p); /* generate control value */ \ @@ -105,28 +110,29 @@ struct CCTK_REAL8_VEC { // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar #define vec8_store(p,x) (vec_sta(x,0,&(p))) -#define vec8_storeu(p_,x_) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8_VEC x__=(x_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC x=x__; \ - CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \ - /* generate insert masks */ \ - vp = vec_lvsr(0,&p); \ - m1 = k8lfalse; \ - m2 = k8ltrue; \ - m3 = vec_perm(m1,m2,vp); \ - /* get existing data */ \ - v1 = vec_ld(0,&p); \ - v2 = vec_ld(32,&p); \ - /* permute and insert */ \ - v3 = vec_perm(x,x,vp); \ - v1 = vec_sel(v1,v3,m3); \ - v2 = vec_sel(v3,v2,m3); \ - /* store data back */ \ - vec_st(0,&p,v1); \ - 
vec_st(32,&p,v2); \ +#define vec8_storeu(p_,x_) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \ + /* code taken from IBM's compiler documentation */ \ + /* generate insert masks */ \ + vp = vec_lvsr(0,&p); \ + m1 = k8lfalse; \ + m2 = k8ltrue; \ + m3 = vec_perm(m1,m2,vp); \ + /* get existing data */ \ + v1 = vec_ld(0,&p); \ + v2 = vec_ld(32,&p); \ + /* permute and insert */ \ + v3 = vec_perm(x,x,vp); \ + v1 = vec_sel(v1,v3,m3); \ + v2 = vec_sel(v3,v2,m3); \ + /* store data back */ \ + vec_st(0,&p,v1); \ + vec_st(32,&p,v2); \ }) #define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache @@ -140,19 +146,34 @@ struct CCTK_REAL8_VEC { ptrdiff_t const imin=imin__; \ ptrdiff_t const imax=imax__; \ \ - v8stp_all = i-imin>=0 and i-imax<=-CCTK_REAL8_VEC_SIZE; \ + v8stp_all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1<imax; \ \ if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ CCTK_REAL8_VEC vp_lo, vp_hi, mask_lo, mask_hi; \ - vp_lo = vec_lvsl(i-imin, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \ - mask_lo = (i-imin>=0 ? \ + /* this is correct but slow */ \ + /* \ + mask_lo = vec8b_set(i+0>=imin, i+1>=imin, i+2>=imin, i+3>=imin); \ + mask_hi = vec8b_set(i+0<imax, i+1<imax, i+2<imax, i+3<imax); \ + */ \ + /* Note: vec_lvsl(i,p) = &p[i] / 8 % 4 \ + Note: vec_lvsr(i,p) = -&p[i] / 8 % 4 \ + /8: 8 bytes per double \ + %4: 4 doubles per vector \ + */ \ + /* We assume p[i] is aligned */ \ + /* Ensure at least one vector element is inside the active region */ \ + assert(i-imin>=-(CCTK_REAL8_VEC_SIZE-1)); \ + vp_lo = vec_lvsl(8 * (i-imin), (CCTK_REAL*)0); \ + mask_lo = (i-imin >= 0 ? \ k8ltrue : \ vec_perm(k8lfalse, k8ltrue, vp_lo)); \ - vp_hi = vec_lvsl(i-imax, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \ - mask_hi = (i-imax<=-CCTK_REAL8_VEC_SIZE ? 
\ + /* Ensure at least one vector element is inside the active region */ \ + assert(i<imax); \ + vp_hi = vec_lvsl(8 * (i-imax), (CCTK_REAL*)0); \ + mask_hi = (i-imax < -(CCTK_REAL8_VEC_SIZE-1) ? \ k8ltrue : \ vec_perm(k8ltrue, k8lfalse, vp_hi)); \ - v8stp_mask = vec_and(mask_lo, mask_hi); \ + v8stp_mask = k8land(mask_lo, mask_hi); \ } \ }) #define vec8_store_nta_partial(p_,x_) \ @@ -164,49 +185,57 @@ struct CCTK_REAL8_VEC { if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ vec8_store(p, x); \ } else { \ - vec8_store(p, vec_sel(vec8_load(p), x, v8stp_mask)); \ + vec8_store(p, k8ifthen(v8stp_mask, x, vec8_load(p))); \ } \ }) // Store a lower or higher partial vector (aligned and non-temporal); // the non-temporal hint is probably ignored -#define vec8_store_nta_partial_lo(p_,x_,n) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8_VEC x__=(x_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC x=x__; \ - CCTK_REAL8_VEC vp_hi, mask_hi; \ - vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-n, (CCTK_REAL*)0); \ - mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \ - vec8_store(p, vec_sel(vec8_load(p), x, mask_hi)); \ +#define vec8_store_nta_partial_lo(p_,x_,n) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC vp, mask; \ + /* Ensure at least one but not all vector elements are active */ \ + assert(n>0 and n<CCTK_REAL8_VEC_SIZE-1); \ + vp = vec_lvsl(-8 * n, (CCTK_REAL*)0); \ + mask = vec_perm(k8ltrue, k8lfalse, vp); \ + vec8_store(p, k8ifthen(mask, x, vec8_load(p))); \ }) -#define vec8_store_nta_partial_hi(p_,x_,n) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8_VEC x__=(x_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC x=x__; \ - CCTK_REAL8_VEC vp_lo, mask_lo; \ - vp_lo = vec_lvsl(n, (CCTK_REAL*)0); \ - mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \ - vec8_store(p, vec_sel(vec8_load(p), x, mask_lo)); \ +#define vec8_store_nta_partial_hi(p_,x_,n) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8& 
p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC vp, mask; \ + /* Ensure at least one but not all vector elements are active */ \ + assert(n>0 and n<CCTK_REAL8_VEC_SIZE-1); \ + vp = vec_lvsl(8 * n, (CCTK_REAL*)0); \ + mask = vec_perm(k8lfalse, k8ltrue, vp); \ + vec8_store(p, k8ifthen(mask, x, vec8_load(p))); \ }) -#define vec8_store_nta_partial_mid(p_,x_,nlo,nhi) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8_VEC x__=(x_); \ - CCTK_REAL8 p=p__; \ - CCTK_REAL8_VEC x=x__; \ - CCTK_REAL8_VEC vp_lo, mask_lo; \ - vp_lo = vec_lvsl(nhi, (CCTK_REAL*)0); \ - mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \ - CCTK_REAL8_VEC vp_hi, mask_hi; \ - vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-nlo, (CCTK_REAL*)0); \ - mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \ - CCTK_REAL8_VEC mask; \ - mask = vec_and(mask_lo, mask_hi); \ - vec8_store(p, vec_sel(vec8_load(p), x, mask)); \ +#define vec8_store_nta_partial_mid(p_,x_,nlo,nhi) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8 p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC vp_lo, mask_lo; \ + /* Ensure at least one but not all vector elements are active */ \ + assert(nlo>0 and nlo<CCTK_REAL8_VEC_SIZE-1); \ + vp_lo = vec_lvsl(-8 * nlo, (CCTK_REAL*)0); \ + mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \ + CCTK_REAL8_VEC vp_hi, mask_hi; \ + /* Ensure at least one but not all vector elements are active */ \ + assert(nhi>0 and nhi<CCTK_REAL8_VEC_SIZE-1); \ + vp_hi = vec_lvsl(8 * nhi, (CCTK_REAL*)0); \ + mask_hi = vec_perm(k8lfalse, k8ltrue, vp_hi); \ + CCTK_REAL8_VEC mask; \ + mask = vec_and(mask_lo, mask_hi); \ + vec8_store(p, k8ifthen(mask, x, vec8_load(p))); \ }) @@ -222,10 +251,10 @@ struct CCTK_REAL8_VEC { #define k8div(x,y) (vec_swdiv_nochk(x,y)) // Fused multiply-add, defined as [+-]x*y[+-]z -#define k8madd(x,y,z) (vec_madd(z,x,y)) -#define k8msub(x,y,z) (vec_msub(z,x,y)) -#define k8nmadd(x,y,z) (vec_nmadd(z,x,y)) -#define k8nmsub(x,y,z) (vec_nmsub(z,x,y)) +#define k8madd(x,y,z) (vec_madd(x,y,z)) 
+#define k8msub(x,y,z) (vec_msub(x,y,z)) +#define k8nmadd(x,y,z) (vec_nmadd(x,y,z)) +#define k8nmsub(x,y,z) (vec_nmsub(x,y,z)) // Cheap functions #define k8copysign(x,y) (vec_cpsgn(y,x)) @@ -236,7 +265,7 @@ struct CCTK_REAL8_VEC { CCTK_REAL8_VEC y__=(y_); \ CCTK_REAL8_VEC x=x__; \ CCTK_REAL8_VEC y=y__; \ - vec_sel(vec_cmpgt(y,x),y,x); \ + k8ifthen(k8cmplt(x,y),y,x); \ }) #define k8fmin(x_,y_) \ ({ \ @@ -244,7 +273,7 @@ struct CCTK_REAL8_VEC { CCTK_REAL8_VEC y__=(y_); \ CCTK_REAL8_VEC x=x__; \ CCTK_REAL8_VEC y=y__; \ - vec_sel(vec_cmplt(y,x),y,x); \ + k8ifthen(k8cmpgt(x,y),y,x); \ }) #define k8fnabs(x) (vec_nabs(x)) #define k8sgn(x_) \ @@ -253,9 +282,9 @@ struct CCTK_REAL8_VEC { CCTK_REAL8_VEC x=x__; \ CCTK_REAL8_VEC one, zero, iszero; \ one = k8ltrue; \ - zero = vec_sub(one, one); \ - iszero = vec_cmpeq(x, zero); \ - k8ifthen(iszero, zero, vec_cpsgn(one, x)); \ + zero = k8sub(one, one); \ + iszero = k8cmpeq(x, zero); \ + k8ifthen(iszero, zero, k8copysign(one, x)); \ }) #define k8sqrt(x) (vec_swsqrt_nochk(x)) @@ -311,19 +340,19 @@ struct CCTK_REAL8_VEC { // canonical true is +1.0, canonical false is -1.0 // >=0 is true, -0 is true, nan is false -#define k8lfalse \ +#define k8lfalse \ ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0x0); }) -#define k8ltrue \ +#define k8ltrue \ ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0xf); }) #define k8lnot(x) (vec_not(x)) #define k8land(x,y) (vec_and(x,y)) #define k8lor(x,y) (vec_or(x,y)) #define k8lxor(x,y) (vec_xor(x,y)) -#define k8ifthen(x,y,z) (vec_sel(z,x,y)) +#define k8ifthen(x,y,z) (vec_sel(z,y,x)) #define k8cmpeq(x,y) (vec_cmpeq(x,y)) -#define k8cmpne(x,y) (vec_not(vec_cmpeq(x,y))) +#define k8cmpne(x,y) (k8lnot(vec_cmpeq(x,y))) #define k8cmpgt(x,y) (vec_cmpgt(x,y)) -#define k8cmpge(x,y) (vec_not(vec_cmplt(x,y))) +#define k8cmpge(x,y) (k8lnot(vec_cmplt(x,y))) #define k8cmplt(x,y) (vec_cmplt(x,y)) #define k8cmple(x,y) (vec_not(vec_cmpgt(x,y))) |