From 746a81a4430a69c5a09a509f8e5a85fbffb17305 Mon Sep 17 00:00:00 2001 From: eschnett Date: Sun, 5 Feb 2012 00:20:49 +0000 Subject: Various changes 1. Implement a simplified partial store interface Implement vec_store_nta_partial, which offers a simpler interface, similar to the one used in OpenCL. 2. Add kifmsg function, and implement kifpos and kifneg in terms of this. 3. Update (and make safer) Kranc-specific code git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@47 105869f7-3296-0410-a4ea-f4349344b45a --- src/test.cc | 19 ++++++++++++++ src/vectors-4-Altivec.h | 4 +-- src/vectors-4-SSE.h | 18 ++++++------- src/vectors-4-default.h | 3 ++- src/vectors-8-AVX.h | 62 +++++++++++++++++++++++++++++++++++++++----- src/vectors-8-DoubleHummer.h | 2 +- src/vectors-8-SSE2.h | 49 +++++++++++++++++++++++++++------- src/vectors-8-VSX.h | 4 +-- src/vectors-8-default.h | 4 ++- src/vectors.h | 45 ++++++++++++++++++++++---------- 10 files changed, 165 insertions(+), 45 deletions(-) diff --git a/src/test.cc b/src/test.cc index 5118497..a64456f 100644 --- a/src/test.cc +++ b/src/test.cc @@ -131,6 +131,18 @@ void Vectors_Test(CCTK_ARGUMENTS) VECTEST("vec_store", sv, b[i]); sv = av; vec_store_nta(*s, bv); VECTEST("vec_store_nta", sv, b[i]); + for (int dlo=-1; dlo<=CCTK_REAL_VEC_SIZE; ++dlo) { + for (int dhi=dlo; dhi<=CCTK_REAL_VEC_SIZE; ++dhi) { + if (dlo>0 and dhi>dlo and dhi=dlo && i0 ? y : z] #ifdef __SSE4_1__ -# define k4ifpos(x,y,z) (_mm_blendv_ps(y,z,x)) +# define k4ifmsb(x,y,z) (_mm_blendv_ps(z,y,x)) #elif 0 # ifdef __cplusplus # define k4sgn(x) ({ using namespace std; signbit(x); }) # else # define k4sgn(x) (signbit(x)) # endif -# define k4ifpos(x,y,z) \ +# define k4ifmsb(x,y,z) \ ({ \ CCTK_REAL4_VEC const x__=(x_); \ CCTK_REAL4_VEC const y__=(y_); \ @@ -341,13 +341,13 @@ static const union { CCTK_REAL4_VEC const x=x__; \ CCTK_REAL4_VEC const y=y__; \ CCTK_REAL4_VEC const z=z__; \ - vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(z) : vec4_elt0(y), \ - k4sgn(vec4_elt1(x)) ? vec4_elt1(z) : vec4_elt1(y), \ - k4sgn(vec4_elt2(x)) ? vec4_elt2(z) : vec4_elt2(y), \ - k4sgn(vec4_elt3(x)) ? vec4_elt3(z) : vec4_elt3(y)); \ + vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \ + k4sgn(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \ + k4sgn(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \ + k4sgn(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \ }) #else -# define k4ifpos(x_,y_,z_) \ +# define k4ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL4_VEC const x__=(x_); \ CCTK_REAL4_VEC const y__=(y_); \ @@ -356,7 +356,7 @@ static const union { CCTK_REAL4_VEC const y=y__; \ CCTK_REAL4_VEC const z=z__; \ CCTK_REAL4_VEC const mask = _mm_srai_epi32(x, 31); \ - /* (y & ~mask) | (z & mask) */ \ - _mm_or_ps(_mm_andnot_ps(mask, y), _mm_and_ps(mask, z)); \ + /* (z & ~mask) | (y & mask) */ \ + _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \ }) #endif diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h index a672b89..874f471 100644 --- a/src/vectors-4-default.h +++ b/src/vectors-4-default.h @@ -49,6 +49,7 @@ // Unaligned store #define vec4_store_nta(p,x) ((p)=(x)) +#define vec4_store_nta_partial(p,x,i,imin,imax) (vec4_store_nta(p,x)) // Store the n lower elements of a vector to memory #define vec4_store_nta_partial_lo(p,x,n) (assert(0)) // Store the n higher elements of a vector into memory. This stores @@ -93,4 +94,4 @@ # define k4sgn(x) (signbit(x)) #endif -#define k4ifpos(x,y,z) (k4sgn(x)?(z):(y)) +#define k4ifmsb(x,y,z) (k4sgn(x)?(y):(z)) diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index 144d3b5..274b376 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -113,8 +113,53 @@ union k8const_t { # define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x)) #endif +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin_,imax_) \ + bool v8stp_all; \ + __m256i v8stp_mask; \ + ({ \ + ptrdiff_t const imin1=(imin_); \ + ptrdiff_t const imin=imin1; \ + ptrdiff_t const imax1=(imax_); \ + ptrdiff_t const imax=imax1; \ + \ + v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE0 ? y : z] +#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x)) diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h index 951ca5d..fdc4be7 100644 --- a/src/vectors-8-DoubleHummer.h +++ b/src/vectors-8-DoubleHummer.h @@ -241,4 +241,4 @@ #define k8sin(x) K8REPL(sin,x) #define k8tan(x) K8REPL(tan,x) -#define k8ifpos(x,y,z) (__fpsel(x,z,y)) +#define k8ifmsb(x,y,z) (__fpsel(x,y,z)) diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index 7dc7d31..fe231b7 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -139,6 +139,34 @@ # define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) #endif +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin,imax) \ + bool const v8stp_lo = (i)>=(imin); \ + bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE<(imax) +#if VECTORISE_STREAMING_STORES && defined(__SSE4A__) +# define vec8_store_nta_partial(p,x) \ + ({ \ + if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ + vec8_store_nta(p,x); \ + } else if (v8stp_lo) { \ + _mm_stream_sd(&p,x); \ + } else if (v8stp_hi) { \ + _mm_stream_sd(&p+1, vec8_swap10(x)); \ + } \ + }) +#else +# define vec8_store_nta_partial(p,x) \ + ({ \ + if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ + vec8_store_nta(p,x); \ + } else if (v8stp_lo) { \ + _mm_storel_pd(&p,x); \ + } else if (v8stp_hi) { \ + _mm_storeh_pd(&p+1,x); \ + } \ + }) +#endif + // Store a lower or higher partial vector (aligned and non-temporal) #if ! VECTORISE_STREAMING_STORES # define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) @@ -264,9 +292,10 @@ static const union { // Choice [sign(x)>0 ? y : z] #ifdef __SSE4_1__ -# define k8ifpos(x,y,z) (_mm_blendv_pd(y,z,x)) +# define k8ifmsb(x,y,z) (_mm_blendv_pd(z,y,x)) #elif 0 -# define k8ifpos(x_,y_,z_) \ +// This is slow +# define k8ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -278,8 +307,8 @@ static const union { CCTK_REAL8_VEC r; \ switch (m) { \ case 0: r = y; break; \ - case 1: r = _mm_move_sd(y,z); break; \ - case 2: r = _mm_move_sd(z,y); break; \ + case 1: r = _mm_move_sd(y,z); break; \ + case 2: r = _mm_move_sd(z,y); break; \ case 3: r = z; break; \ } \ r; \ @@ -290,7 +319,7 @@ static const union { # else # define k4sgn(x) (signbit(x)) # endif -# define k8ifpos(x_,y_,z_) \ +# define k8ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -298,8 +327,8 @@ static const union { CCTK_REAL8_VEC const x=x__; \ CCTK_REAL8_VEC const y=y__; \ CCTK_REAL8_VEC const z=z__; \ - vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \ - k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \ + vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ + k8sgn(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \ }) #else static const union { @@ -307,7 +336,7 @@ static const union { double d; } k8one_union = { 0x1ULL }; # define k8one (k8one_union.d) -# define k8ifpos(x_,y_,z_) \ +# define k8ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -319,7 +348,7 @@ static const union { CCTK_REAL8_VEC const imask = \ (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \ (__m128i)_mm_set1_pd(k8one)); \ - /* (y & ~mask) | (z & mask); imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \ + /* (z & ~mask) | (y & mask); imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ }) #endif diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h index 5e64ef4..93249a5 100644 --- a/src/vectors-8-VSX.h +++ b/src/vectors-8-VSX.h @@ -108,5 +108,5 @@ #define k8pow(x,a) K8REPL2(pow,x,a) #define k8sqrt(x) K8REPL(sqrt,x) -#define k8ifpos(x,y,z) \ - (vec_sel((y), (z), vec_sra(vec_convert((x), &(vector long long*)0), 63))) +#define k8ifmsb(x,y,z) \ + (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63))) diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h index f980e74..ee85593 100644 --- a/src/vectors-8-default.h +++ b/src/vectors-8-default.h @@ -49,6 +49,8 @@ // Unaligned store #define vec8_store_nta(p,x) ((p)=(x)) +#define vec8_store_partial_prepare(i,imin,imax) ((void)0) +#define vec8_store_nta_partial(p,x) (vec8_store_nta(p,x)) // Store the n lower elements of a vector to memory #define vec8_store_nta_partial_lo(p,x,n) (assert(0)) // Store the n higher elements of a vector into memory. This stores @@ -92,4 +94,4 @@ # define k8sgn(x) (signbit(x)) #endif -#define k8ifpos(x,y,z) (k8sgn(x)?(z):(y)) +#define k8ifmsb(x,y,z) (k8sgn(x)?(y):(z)) diff --git a/src/vectors.h b/src/vectors.h index 2761202..6cb1238 100644 --- a/src/vectors.h +++ b/src/vectors.h @@ -61,6 +61,8 @@ # define vec_loadu_maybe3 vec4_loadu_maybe3 # define vec_store vec4_store # define vec_store_nta vec4_store_nta +# define vec_store_partial_prepare vec4_store_partial_prepare +# define vec_store_nta_partial vec4_store_nta_partial # define vec_store_nta_partial_lo vec4_store_nta_partial_lo # define vec_store_nta_partial_hi vec4_store_nta_partial_hi # define vec_store_nta_partial_mid vec4_store_nta_partial_mid @@ -90,8 +92,7 @@ # define ksqrt k4sqrt # define ktan k4tan -# define kifpos k4ifpos -# define kifneg k4ifneg +# define kifmsb k4ifmsb #elif defined(CCTK_REAL_PRECISION_8) @@ -111,7 +112,9 @@ # define vec_loadu_maybe vec8_loadu_maybe # define vec_loadu_maybe3 vec8_loadu_maybe3 # define vec_store vec8_store +# define vec_store_partial_prepare vec8_store_partial_prepare # define vec_store_nta vec8_store_nta +# define vec_store_nta_partial vec8_store_nta_partial # define vec_store_nta_partial_lo vec8_store_nta_partial_lo # define vec_store_nta_partial_hi vec8_store_nta_partial_hi # define vec_store_nta_partial_mid vec8_store_nta_partial_mid @@ -140,7 +143,7 @@ # define ksqrt k8sqrt # define ktan k8tan -# define kifpos k8ifpos +# define kifmsb k8ifmsb #else @@ -150,6 +153,11 @@ +#define kifneg(a,b,c) kifmsb(a,b,c) +#define kifpos(a,b,c) kifmsb(a,c,b) + + + #if CCTK_REAL_VEC_SIZE == 1 # define vec_index vec_set(0) #elif CCTK_REAL_VEC_SIZE == 2 @@ -310,20 +318,31 @@ struct vecprops { // For Kranc -#undef KRANC_DIFF_FUNCTIONS -#if ! VECTORISE_INLINE -# define KRANC_DIFF_FUNCTIONS -#endif +#ifdef KRANC_C + +# undef KRANC_DIFF_FUNCTIONS +# if ! VECTORISE_INLINE +# define KRANC_DIFF_FUNCTIONS +# endif + +# undef E +# define E (ToReal(M_E)) -#undef Pi -#define Pi (ToReal(M_PI)) +# undef Pi +# define Pi (ToReal(M_PI)) -#undef ToReal -#define ToReal(x) (vec_set1((CCTK_REAL)(x))) +# undef Sign +# define Sign(x) -999999999 // poison -#undef Sign -#define Sign(x) -999999999 // poison +# undef ToReal +# define ToReal(x) (vec_set1((CCTK_REAL)(x))) +# undef KRANC_GFOFFSET3D +# define KRANC_GFOFFSET3D(var,i,j,k) \ + vec_loadu_maybe3((i),(j),(k), \ + *(CCTK_REAL const*)& \ + ((char const*)(var))[cdi*(i)+cdj*(j)+cdk*(k)]) +#endif // KRANC_C #endif // #ifndef VECTORS_H -- cgit v1.2.3