From 33e3cc3261b81b28f83a6e4d41d1ae97944dfa49 Mon Sep 17 00:00:00 2001
From: eschnett
Date: Mon, 31 Dec 2012 14:31:30 +0000
Subject: Many corrections

git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@76 105869f7-3296-0410-a4ea-f4349344b45a
---
 src/vectors-8-QPX.h | 189 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 109 insertions(+), 80 deletions(-)

diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h
index 40f6c18..631c974 100644
--- a/src/vectors-8-QPX.h
+++ b/src/vectors-8-QPX.h
@@ -35,9 +35,9 @@ struct CCTK_REAL8_VEC {
 #define CCTK_REAL8_VEC_SIZE 4
 
 // Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
+//#define CCTK_INTEGER8 CCTK_REAL8
 #define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
 #define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
 
 
@@ -47,6 +47,10 @@ struct CCTK_REAL8_VEC {
 #define vec8_set1(a) (vec_splats(a))
 #define vec8_set(a,b,c,d) ((vector4double){a,b,c,d})
 
+#define vec8_b2r(b) ((b)?+1.0:-1.0)
+#define vec8b_set(a,b,c,d) \
+  ((vector4double){vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d)})
+
 #define vec8_elt0(x) (vec_extract(x,0))
 #define vec8_elt1(x) (vec_extract(x,1))
 #define vec8_elt2(x) (vec_extract(x,2))
@@ -65,6 +69,7 @@ struct CCTK_REAL8_VEC {
     CCTK_REAL8 const& p__=(p_);                                    \
     CCTK_REAL8& p = *(CCTK_REAL8*)&p__;                            \
     vector4double v1, v2, vp;                                      \
+    /* code taken from IBM's compiler documentation */             \
     v1 = vec_ld(0,&p);   /* load the left part of the vector */    \
     v2 = vec_ld(32,&p);  /* load the right part of the vector */   \
     vp = vec_lvsl(0,&p); /* generate control value */              \
@@ -105,28 +110,29 @@ struct CCTK_REAL8_VEC {
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
 #define vec8_store(p,x) (vec_sta(x,0,&(p)))
-#define vec8_storeu(p_,x_) \
-  ({ \
-    CCTK_REAL8& p__=(p_); \
-    CCTK_REAL8_VEC x__=(x_); \
-    CCTK_REAL8& p=p__; \
-    CCTK_REAL8_VEC x=x__; \
-    CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \
-    /* generate insert masks */ \
-    vp = vec_lvsr(0,&p); \
-    m1 = k8lfalse; \
-    m2 = k8ltrue; \
-    m3 = vec_perm(m1,m2,vp); \
-    /* get existing data */ \
-    v1 = vec_ld(0,&p); \
-    v2 = vec_ld(32,&p); \
-    /* permute and insert */ \
-    v3 = vec_perm(x,x,vp); \
-    v1 = vec_sel(v1,v3,m3); \
-    v2 = vec_sel(v3,v2,m3); \
-    /* store data back */ \
-    vec_st(0,&p,v1); \
-    vec_st(32,&p,v2); \
+#define vec8_storeu(p_,x_)                                 \
+  ({                                                       \
+    CCTK_REAL8& p__=(p_);                                  \
+    CCTK_REAL8_VEC x__=(x_);                               \
+    CCTK_REAL8& p=p__;                                     \
+    CCTK_REAL8_VEC x=x__;                                  \
+    CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3;             \
+    /* code taken from IBM's compiler documentation */     \
+    /* generate insert masks */                            \
+    vp = vec_lvsr(0,&p);                                   \
+    m1 = k8lfalse;                                         \
+    m2 = k8ltrue;                                          \
+    m3 = vec_perm(m1,m2,vp);                               \
+    /* get existing data */                                \
+    v1 = vec_ld(0,&p);                                     \
+    v2 = vec_ld(32,&p);                                    \
+    /* permute and insert */                               \
+    v3 = vec_perm(x,x,vp);                                 \
+    v1 = vec_sel(v1,v3,m3);                                \
+    v2 = vec_sel(v3,v2,m3);                                \
+    /* store data back */                                  \
+    vec_st(0,&p,v1);                                       \
+    vec_st(32,&p,v2);                                      \
   })
 
 #define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache
@@ -140,19 +146,34 @@ struct CCTK_REAL8_VEC {
     ptrdiff_t const imin=imin__;                                        \
     ptrdiff_t const imax=imax__;                                        \
     \
-    v8stp_all = i-imin>=0 and i-imax<=-CCTK_REAL8_VEC_SIZE;             \
+    v8stp_all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1<imax;               \
-      mask_lo = (i-imin>=0 ?                                            \
+      /* this is correct but slow */                                    \
+      /*                                                                \
+      mask_lo = vec8b_set(i+0>=imin, i+1>=imin, i+2>=imin, i+3>=imin);  \
+      mask_hi = vec8b_set(i+0<imax, i+1<imax, i+2<imax, i+3<imax);      \
+      */                                                                \
+      vp_lo = vec_lvsl(8 * (i-imin), (CCTK_REAL*)0);                    \
+      mask_lo = (i-imin >= 0 ?                                          \
                  k8ltrue :                                              \
                  vec_perm(k8lfalse, k8ltrue, vp_lo));                   \
-      vp_hi = vec_lvsl(i-imax, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE);        \
-      mask_hi = (i-imax<=-CCTK_REAL8_VEC_SIZE ?                         \
+      /* Ensure at least one vector element is inside the active region */ \
+      assert(i<imax);                                                   \
@@ ... @@
 // >=0 is true, -0 is true, nan is false
-#define k8lfalse \
+#define k8lfalse                                                 \
   ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0x0); })
-#define k8ltrue \
+#define k8ltrue                                                  \
   ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0xf); })
 #define k8lnot(x) (vec_not(x))
 #define k8land(x,y) (vec_and(x,y))
 #define k8lor(x,y) (vec_or(x,y))
 #define k8lxor(x,y) (vec_xor(x,y))
-#define k8ifthen(x,y,z) (vec_sel(z,x,y))
+#define k8ifthen(x,y,z) (vec_sel(z,y,x))
 #define k8cmpeq(x,y) (vec_cmpeq(x,y))
-#define k8cmpne(x,y) (vec_not(vec_cmpeq(x,y)))
+#define k8cmpne(x,y) (k8lnot(vec_cmpeq(x,y)))
 #define k8cmpgt(x,y) (vec_cmpgt(x,y))
-#define k8cmpge(x,y) (vec_not(vec_cmplt(x,y)))
+#define k8cmpge(x,y) (k8lnot(vec_cmplt(x,y)))
 #define k8cmplt(x,y) (vec_cmplt(x,y))
 #define k8cmple(x,y) (vec_not(vec_cmpgt(x,y)))
--
cgit v1.2.3
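Editorial sketch, not part of the patch: the plain C program below models the masked partial-store idea behind the vec8_store_partial changes above, without using any QPX intrinsics. The half-open active region [imin,imax), the +1.0/-1.0 sign encoding of booleans (mirroring vec8_b2r and vec8b_set), and the helper names b2r and store_partial are assumptions made for illustration only, following the commented-out "correct but slow" mask construction in the hunk at line 146.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define VEC_SIZE 4   /* plays the role of CCTK_REAL8_VEC_SIZE */

/* vec8_b2r analogue: encode a C boolean as a sign-encoded real,
   +1.0 = true, -1.0 = false */
static double b2r(int b) { return b ? +1.0 : -1.0; }

/* Store x[0..VEC_SIZE-1] to p[i..i+VEC_SIZE-1], but only for elements whose
   index lies in the active region [imin,imax).  The per-element mask is the
   "correct but slow" construction from the patch, applied here with a plain
   sign test instead of a vector select. */
static void store_partial(double *p, ptrdiff_t i, const double x[VEC_SIZE],
                          ptrdiff_t imin, ptrdiff_t imax)
{
  /* at least one vector element must be inside the active region */
  assert(i < imax && i + VEC_SIZE > imin);
  double mask[VEC_SIZE];
  for (int k = 0; k < VEC_SIZE; ++k)
    mask[k] = b2r(i + k >= imin && i + k < imax);
  for (int k = 0; k < VEC_SIZE; ++k)
    if (mask[k] >= 0.0)          /* the sign decides, as in the QPX boolean convention */
      p[i + k] = x[k];
}

int main(void)
{
  double a[8] = {0};
  const double x[VEC_SIZE] = {1, 2, 3, 4};
  store_partial(a, 2, x, 3, 6);  /* only a[3], a[4], a[5] may change */
  for (int k = 0; k < 8; ++k)
    printf("%g ", a[k]);         /* prints: 0 0 0 2 3 4 0 0 */
  printf("\n");
  return 0;
}

In the header itself the mask appears to be computed once per loop iteration in vec8_store_partial_prepare (v8stp_all and the mask variables above), so the individual partial-store macros can blend old and new data without redoing the index comparisons.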