// -*-C++-*- // Vectorise using IBM's Blue Gene/Q QPX (Power) // Use the type vector4double directly, without introducing a wrapper class // Use macros instead of inline functions // Note: bgxlC_r does not like const declarations, so we need to cast // them away and/or omit them everywhere // See #include // #define vec8_assert(x) ((void)0) #define vec8_assert(x) assert(x) #ifdef __cplusplus # include #endif #include #define vec8_architecture "QPX" // Vector type corresponding to CCTK_REAL // We use a struct to avoid the "const" issue // #define CCTK_REAL8_VEC vector4double struct CCTK_REAL8_VEC { vector4double v; CCTK_REAL8_VEC() {} CCTK_REAL8_VEC(CCTK_REAL8_VEC const& x): v(x.v) {} CCTK_REAL8_VEC(vector4double v_): v(v_) {} operator vector4double() const { return v; } }; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 4 // Integer and boolean types corresponding to this real type #define CCTK_INTEGER8 CCTK_INT8 #define CCTK_BOOLEAN8 CCTK_REAL8 #define CCTK_INTEGER8_VEC CCTK_REAL8_VEC #define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC // Create vectors, extract vector elements #define vec8_set1(a) (vec_splats(a)) #if 0 #define vec8_set(a,b,c,d) \ (vec_insert \ (d,vec_insert \ (c,vec_insert \ (b,vec_insert \ (a,CCTK_REAL8_VEC(),0),1),2),3)) #endif #define vec8_set(a_,b_,c_,d_) \ ({ \ CCTK_REAL8 const a__ = (a_); \ CCTK_REAL8 const b__ = (b_); \ CCTK_REAL8 const c__ = (c_); \ CCTK_REAL8 const d__ = (d_); \ CCTK_REAL8 const a = a__; \ CCTK_REAL8 const b = b__; \ CCTK_REAL8 const c = c__; \ CCTK_REAL8 const d = d__; \ CCTK_REAL8_VEC x; \ ((CCTK_REAL*)&x)[0] = a; \ ((CCTK_REAL*)&x)[1] = b; \ ((CCTK_REAL*)&x)[2] = c; \ ((CCTK_REAL*)&x)[3] = d; \ x; \ }) #define vec8_b2r(b) ((b)?+1.0:-1.0) #define vec8_setb(a,b,c,d) \ (vec8_set(vec8_b2r(a), vec8_b2r(b), vec8_b2r(c), vec8_b2r(d))) #define vec8_elt0(x) (vec_extract(x,0)) #define vec8_elt1(x) (vec_extract(x,1)) #define vec8_elt2(x) (vec_extract(x,2)) #define vec8_elt3(x) (vec_extract(x,3)) #define vec8_elt(x,d) (vec_extract(x,d)) #define vec8_elts(x,a,b,c,d) \ ({ \ CCTK_REAL8_VEC x__ = (x_); \ CCTK_REAL8_VEC x = x__; \ a = ((CCTK_REAL*)&x)[0]; \ b = ((CCTK_REAL*)&x)[1]; \ c = ((CCTK_REAL*)&x)[2]; \ d = ((CCTK_REAL*)&x)[3]; \ }) #define vec8_r2b(x) ((x)>=0.0) #define vec8_eltb(x,d) (vec8_r2b(vec8_elt(x,d))) // Load and store vectors // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar #define vec8_load(p) (vec_lda(0,(CCTK_REAL8*)&(p))) #define vec8_loadu(p_) \ ({ \ CCTK_REAL8 const& p__=(p_); \ CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \ vector4double v1, v2, vp; \ /* code taken from IBM's compiler documentation */ \ v1 = vec_ld(0,&p); /* load the left part of the vector */ \ v2 = vec_ld(31,&p); /* load the right part of the vector */ \ vp = vec_lvsl(0,&p); /* generate control value */ \ vec_perm(v1,v2,vp); /* generate the aligned vector */ \ }) #define vec8_loadu_off(off_,p_) \ ({ \ int const off__ = (off_); \ CCTK_REAL8 const& p__ = (p_); \ int off = off__; \ CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \ vector4double v1, v2; \ off &= CCTK_REAL8_VEC_SIZE-1; \ v1 = vec_lda(0,&p-off); \ v2 = vec_lda(0,&p-off+CCTK_REAL8_VEC_SIZE); \ off==1 ? vec_sldw(v1,v2,1) : \ off==2 ? vec_sldw(v1,v2,2) : \ off==3 ? vec_sldw(v1,v2,3) : \ (vec8_assert(0), v1); \ }) // Load a vector from memory that may or may not be aligned, as // decided by the offset and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load # define vec8_loadu_maybe(off,p) vec8_loadu(p) # define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) #else # define vec8_loadu_maybe(off_,p_) \ ({ \ CCTK_REAL8 const& p__=(p_); \ int const off__=(off_); \ CCTK_REAL8 const& p=p__; \ int const off=off__; \ off % CCTK_REAL8_VEC_SIZE == 0 ? \ vec8_load(p) : \ vec8_loadu_off(off,p); \ }) # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size # define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p) # else # define vec8_loadu_maybe3(off1,off2,off3,p_) \ ({ \ CCTK_REAL8 const& p__=(p_); \ CCTK_REAL8 const& p=p__; \ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ vec8_loadu(p) : \ vec8_loadu_maybe(off1,p); \ }) # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar #define vec8_store(p,x) (vec_sta(x,0,&(p))) #define vec8_storeu(p_,x_) \ ({ \ CCTK_REAL8& p__=(p_); \ CCTK_REAL8_VEC x__=(x_); \ CCTK_REAL8& p=p__; \ CCTK_REAL8_VEC x=x__; \ CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \ /* code taken from IBM's compiler documentation */ \ /* generate insert masks */ \ vp = vec_lvsr(0,&p); \ m1 = k8lfalse; \ m2 = k8ltrue; \ m3 = vec_perm(m1,m2,vp); \ v3 = vec_perm(x,x,vp); \ _Pragma("tm_atomic") { \ /* get existing data */ \ v1 = vec_ld(0,&p); \ v2 = vec_ld(31,&p); \ /* permute and insert */ \ v1 = vec_sel(v1,v3,m3); \ v2 = vec_sel(v3,v2,m3); \ /* store data back */ \ vec_st(0,&p,v1); \ vec_st(31,&p,v2); \ } \ }) #define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache #if VECTORISE_ALIGNED_ARRAYS // Arrays are aligned; wrap-around is not an issue # define vec8_store_omp #else // Need to protect partial stores, as they may wrap around to the // beginning of the next line in the array # define vec8_store_omp _Pragma("tm_atomic") #endif // Store a partial vector (aligned and non-temporal) #define vec8_store_partial_prepare(i,imin_,imax_) \ bool v8stp_all; \ CCTK_BOOLEAN8_VEC v8stp_mask; \ bool v8stp_mask0, v8stp_mask1, v8stp_mask2, v8stp_mask3; \ ({ \ ptrdiff_t const imin__=(imin_); \ ptrdiff_t const imax__=(imax_); \ ptrdiff_t const imin=imin__; \ ptrdiff_t const imax=imax__; \ \ v8stp_all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1=imin, i+1>=imin, i+2>=imin, i+3>=imin); \ mask_hi = vec8_setb(i+0=-(CCTK_REAL8_VEC_SIZE-1)); \ vp_lo = vec_lvsl(8 * (i-imin), (CCTK_REAL*)0); \ mask_lo = (i-imin >= 0 ? \ k8ltrue : \ vec_perm(k8lfalse, k8ltrue, vp_lo)); \ /* Ensure at least one vector element is inside the active region */ \ vec8_assert(i0 and n0 and n0 and nlo0 and nhi=0 is true, -0 is true, nan is false #define k8lfalse \ ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0x0); }) #define k8ltrue \ ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0xf); }) #define k8lnot(x) (vec_not(x)) #define k8land(x,y) (vec_and(x,y)) #define k8lor(x,y) (vec_or(x,y)) #define k8lxor(x,y) (vec_xor(x,y)) #define k8ifthen(x,y,z) (vec_sel(z,y,x)) #define k8cmpeq(x,y) (vec_cmpeq(x,y)) #define k8cmpne(x,y) (k8lnot(k8cmpeq(x,y))) #define k8cmpgt(x,y) (vec_cmpgt(x,y)) #define k8cmpge(x,y) (k8lnot(k8cmplt(x,y))) #define k8cmplt(x,y) (vec_cmplt(x,y)) #define k8cmple(x,y) (k8lnot(k8cmpgt(x,y)))