diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2012-02-05 00:20:49 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2012-02-05 00:20:49 +0000 |
commit | 746a81a4430a69c5a09a509f8e5a85fbffb17305 (patch) | |
tree | a62ddcc9e096b5619de8f2348282778bec23568b | |
parent | 7e78ff525ffb9974d60161d446efb980bfe32cbb (diff) |
Various changes
1. Implement a simplified partial store interface
Implement vec_store_nta_partial, which offers a simpler interface,
similar to the one used in OpenCL.
2. Add kifmsb function, and implement kifpos and kifneg in terms of
this.
3. Update (and make safer) Kranc-specific code
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@47 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r-- | src/test.cc | 19 | ||||
-rw-r--r-- | src/vectors-4-Altivec.h | 4 | ||||
-rw-r--r-- | src/vectors-4-SSE.h | 18 | ||||
-rw-r--r-- | src/vectors-4-default.h | 3 | ||||
-rw-r--r-- | src/vectors-8-AVX.h | 62 | ||||
-rw-r--r-- | src/vectors-8-DoubleHummer.h | 2 | ||||
-rw-r--r-- | src/vectors-8-SSE2.h | 49 | ||||
-rw-r--r-- | src/vectors-8-VSX.h | 4 | ||||
-rw-r--r-- | src/vectors-8-default.h | 4 | ||||
-rw-r--r-- | src/vectors.h | 45 |
10 files changed, 165 insertions, 45 deletions
diff --git a/src/test.cc b/src/test.cc index 5118497..a64456f 100644 --- a/src/test.cc +++ b/src/test.cc @@ -131,6 +131,18 @@ void Vectors_Test(CCTK_ARGUMENTS) VECTEST("vec_store", sv, b[i]); sv = av; vec_store_nta(*s, bv); VECTEST("vec_store_nta", sv, b[i]); + for (int dlo=-1; dlo<=CCTK_REAL_VEC_SIZE; ++dlo) { + for (int dhi=dlo; dhi<=CCTK_REAL_VEC_SIZE; ++dhi) { + if (dlo>0 and dhi>dlo and dhi<CCTK_REAL_VEC_SIZE) { + sv = av; + vec_store_partial_prepare(0, dlo, dhi); + vec_store_nta_partial(*s, bv); + snprintf (testname, sizeof testname, + "vec_store_nta_partial[%d,%d]", dlo, dhi); + VECTEST(testname, sv, i>=dlo && i<dhi ? b[i] : a[i]); + } + } + } /* The partial stores are not implemented for d==0 and d==CCTK_REAL_VEC_SIZE-1 (because these are trivial) */ for (int d=1; d<CCTK_REAL_VEC_SIZE-1; ++d) { @@ -183,6 +195,13 @@ void Vectors_Test(CCTK_ARGUMENTS) VECTEST("kifpos 0", kifpos(vec_set1(0.),bv,cv), b[i]); VECTEST("kifpos -0", kifpos(vec_set1(-0.),bv,cv), c[i]); + VECTEST("kifneg positive", + kifneg(av, bv, cv), my_signbit(a[i]) ? b[i] : c[i]); + VECTEST("kifneg negative", + kifneg(bv, bv, cv), my_signbit(b[i]) ? 
b[i] : c[i]); + VECTEST("kifneg 0", kifneg(vec_set1(0.),bv,cv), c[i]); + VECTEST("kifneg -0", kifneg(vec_set1(-0.),bv,cv), b[i]); + if (passed != numtests) CCTK_VWarn(CCTK_WARN_ALERT, __LINE__, __FILE__, CCTK_THORNSTRING, "Failed %d correctness tests", numtests - passed); diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h index 679c34e..45e91d3 100644 --- a/src/vectors-4-Altivec.h +++ b/src/vectors-4-Altivec.h @@ -161,5 +161,5 @@ #define k4pow(x,a) K4REPL2(pow,x,a) #define k4sqrt(x) K4REPL(sqrt,x) -#define k4ifpos(x,y,z) \ - (vec_sel((y), (z), vec_sra(vec_convert((x), &(vector int*)0), 31))) +#define k4ifmsb(x,y,z) \ + (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector int*)0), 31))) diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h index 9152c55..9f32cea 100644 --- a/src/vectors-4-SSE.h +++ b/src/vectors-4-SSE.h @@ -326,14 +326,14 @@ static const union { // Choice [sign(x)>0 ? y : z] #ifdef __SSE4_1__ -# define k4ifpos(x,y,z) (_mm_blendv_ps(y,z,x)) +# define k4ifmsb(x,y,z) (_mm_blendv_ps(z,y,x)) #elif 0 # ifdef __cplusplus # define k4sgn(x) ({ using namespace std; signbit(x); }) # else # define k4sgn(x) (signbit(x)) # endif -# define k4ifpos(x,y,z) \ +# define k4ifmsb(x,y,z) \ ({ \ CCTK_REAL4_VEC const x__=(x_); \ CCTK_REAL4_VEC const y__=(y_); \ @@ -341,13 +341,13 @@ static const union { CCTK_REAL4_VEC const x=x__; \ CCTK_REAL4_VEC const y=y__; \ CCTK_REAL4_VEC const z=z__; \ - vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(z) : vec4_elt0(y), \ - k4sgn(vec4_elt1(x)) ? vec4_elt1(z) : vec4_elt1(y), \ - k4sgn(vec4_elt2(x)) ? vec4_elt2(z) : vec4_elt2(y), \ - k4sgn(vec4_elt3(x)) ? vec4_elt3(z) : vec4_elt3(y)); \ + vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \ + k4sgn(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \ + k4sgn(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \ + k4sgn(vec4_elt3(x)) ? 
vec4_elt3(y) : vec4_elt3(z)); \ }) #else -# define k4ifpos(x_,y_,z_) \ +# define k4ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL4_VEC const x__=(x_); \ CCTK_REAL4_VEC const y__=(y_); \ @@ -356,7 +356,7 @@ static const union { CCTK_REAL4_VEC const y=y__; \ CCTK_REAL4_VEC const z=z__; \ CCTK_REAL4_VEC const mask = _mm_srai_epi32(x, 31); \ - /* (y & ~mask) | (z & mask) */ \ - _mm_or_ps(_mm_andnot_ps(mask, y), _mm_and_ps(mask, z)); \ + /* (z & ~mask) | (y & mask) */ \ + _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \ }) #endif diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h index a672b89..874f471 100644 --- a/src/vectors-4-default.h +++ b/src/vectors-4-default.h @@ -49,6 +49,7 @@ // Unaligned store #define vec4_store_nta(p,x) ((p)=(x)) +#define vec4_store_nta_partial(p,x,i,imin,imax) (vec4_store_nta(p,x)) // Store the n lower elements of a vector to memory #define vec4_store_nta_partial_lo(p,x,n) (assert(0)) // Store the n higher elements of a vector into memory. This stores @@ -93,4 +94,4 @@ # define k4sgn(x) (signbit(x)) #endif -#define k4ifpos(x,y,z) (k4sgn(x)?(z):(y)) +#define k4ifmsb(x,y,z) (k4sgn(x)?(y):(z)) diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index 144d3b5..274b376 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -113,8 +113,53 @@ union k8const_t { # define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x)) #endif +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin_,imax_) \ + bool v8stp_all; \ + __m256i v8stp_mask; \ + ({ \ + ptrdiff_t const imin1=(imin_); \ + ptrdiff_t const imin=imin1; \ + ptrdiff_t const imax1=(imax_); \ + ptrdiff_t const imax=imax1; \ + \ + v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE<imax; \ + \ + if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ + /* \ + __m256i const v8stp_mask = \ + _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \ + vec_index), \ + _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \ + vec_index)); \ + */ \ + __m128i const 
termlo0 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(0,1)); \ + __m128i const termup0 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(0,1)); \ + __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \ + __m128i const termlo1 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(2,3)); \ + __m128i const termup1 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(2,3)); \ + __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \ + v8stp_mask = \ + _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \ + } \ + }) + +#define vec8_store_nta_partial(p,x) \ + ({ \ + if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ + vec8_store_nta(p,x); \ + } else { \ + _mm256_maskstore_pd(&p,v8stp_mask,x); \ + } \ + }) + // Store a lower or higher partial vector (aligned and non-temporal); // the non-temporal hint is probably ignored +// Masks indicating which vector element should be stored: static const k8const_t k8store_lo_union[5] = { {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }}, @@ -131,7 +176,7 @@ static const k8const_t k8store_hi_union[5] = {{ K8_ZERO, K8_IMIN, K8_IMIN, K8_IMIN, }}, {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }}, }; -#if defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4 +#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4 // gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd # define vec8_store_nta_partial_lo(p,x,n) \ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x)) @@ -147,10 +192,11 @@ static const k8const_t k8store_hi_union[5] = (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x)) # define vec8_store_nta_partial_hi(p,x,n) \ (_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x)) -# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \ - (_mm256_maskstore_pd \ - (&(p), \ - k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi, \ +# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \ + (_mm256_maskstore_pd \ + (&(p), \ + 
_mm256_castpd_si256(_mm256_and_pd(k8store_lo_union[nlo].vd, \ + k8store_hi_union[nhi].vd)), \ x)) #endif @@ -209,8 +255,12 @@ static const k8const_t k8abs_mask_union = f(vec8_elt2(xfunc),afunc), \ f(vec8_elt3(xfunc),afunc)); \ }) +#define k8cos(x) K8REPL(cos,x) #define k8exp(x) K8REPL(exp,x) #define k8log(x) K8REPL(log,x) #define k8pow(x,a) K8REPL2(pow,x,a) +#define k8sin(x) K8REPL(sin,x) +#define k8tan(x) K8REPL(tan,x) -#define k8ifpos(x,y,z) (_mm256_blendv_pd(y,z,x)) +// Choice [sign(x)>0 ? y : z] +#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x)) diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h index 951ca5d..fdc4be7 100644 --- a/src/vectors-8-DoubleHummer.h +++ b/src/vectors-8-DoubleHummer.h @@ -241,4 +241,4 @@ #define k8sin(x) K8REPL(sin,x) #define k8tan(x) K8REPL(tan,x) -#define k8ifpos(x,y,z) (__fpsel(x,z,y)) +#define k8ifmsb(x,y,z) (__fpsel(x,y,z)) diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index 7dc7d31..fe231b7 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -139,6 +139,34 @@ # define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) #endif +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin,imax) \ + bool const v8stp_lo = (i)>=(imin); \ + bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE<(imax) +#if VECTORISE_STREAMING_STORES && defined(__SSE4A__) +# define vec8_store_nta_partial(p,x) \ + ({ \ + if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ + vec8_store_nta(p,x); \ + } else if (v8stp_lo) { \ + _mm_stream_sd(&p,x); \ + } else if (v8stp_hi) { \ + _mm_stream_sd(&p+1, vec8_swap10(x)); \ + } \ + }) +#else +# define vec8_store_nta_partial(p,x) \ + ({ \ + if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ + vec8_store_nta(p,x); \ + } else if (v8stp_lo) { \ + _mm_storel_pd(&p,x); \ + } else if (v8stp_hi) { \ + _mm_storeh_pd(&p+1,x); \ + } \ + }) +#endif + // Store a lower or higher partial vector (aligned and non-temporal) #if ! 
VECTORISE_STREAMING_STORES # define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) @@ -264,9 +292,10 @@ static const union { // Choice [sign(x)>0 ? y : z] #ifdef __SSE4_1__ -# define k8ifpos(x,y,z) (_mm_blendv_pd(y,z,x)) +# define k8ifmsb(x,y,z) (_mm_blendv_pd(z,y,x)) #elif 0 -# define k8ifpos(x_,y_,z_) \ +// This is slow +# define k8ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -278,8 +307,8 @@ static const union { CCTK_REAL8_VEC r; \ switch (m) { \ case 0: r = y; break; \ - case 1: r = _mm_move_sd(y,z); break; \ - case 2: r = _mm_move_sd(z,y); break; \ + case 1: r = _mm_move_sd(y,z); break; \ + case 2: r = _mm_move_sd(z,y); break; \ case 3: r = z; break; \ } \ r; \ @@ -290,7 +319,7 @@ static const union { # else # define k4sgn(x) (signbit(x)) # endif -# define k8ifpos(x_,y_,z_) \ +# define k8ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -298,8 +327,8 @@ static const union { CCTK_REAL8_VEC const x=x__; \ CCTK_REAL8_VEC const y=y__; \ CCTK_REAL8_VEC const z=z__; \ - vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \ - k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \ + vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ + k8sgn(vec8_elt1(x)) ? 
vec8_elt1(y) : vec8_elt1(z)); \ }) #else static const union { @@ -307,7 +336,7 @@ static const union { double d; } k8one_union = { 0x1ULL }; # define k8one (k8one_union.d) -# define k8ifpos(x_,y_,z_) \ +# define k8ifmsb(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -319,7 +348,7 @@ static const union { CCTK_REAL8_VEC const imask = \ (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \ (__m128i)_mm_set1_pd(k8one)); \ - /* (y & ~mask) | (z & mask); imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \ + /* (z & ~mask) | (y & mask); imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ }) #endif diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h index 5e64ef4..93249a5 100644 --- a/src/vectors-8-VSX.h +++ b/src/vectors-8-VSX.h @@ -108,5 +108,5 @@ #define k8pow(x,a) K8REPL2(pow,x,a) #define k8sqrt(x) K8REPL(sqrt,x) -#define k8ifpos(x,y,z) \ - (vec_sel((y), (z), vec_sra(vec_convert((x), &(vector long long*)0), 63))) +#define k8ifmsb(x,y,z) \ + (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63))) diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h index f980e74..ee85593 100644 --- a/src/vectors-8-default.h +++ b/src/vectors-8-default.h @@ -49,6 +49,8 @@ // Unaligned store #define vec8_store_nta(p,x) ((p)=(x)) +#define vec8_store_partial_prepare(i,imin,imax) ((void)0) +#define vec8_store_nta_partial(p,x) (vec8_store_nta(p,x)) // Store the n lower elements of a vector to memory #define vec8_store_nta_partial_lo(p,x,n) (assert(0)) // Store the n higher elements of a vector into memory. 
This stores @@ -92,4 +94,4 @@ # define k8sgn(x) (signbit(x)) #endif -#define k8ifpos(x,y,z) (k8sgn(x)?(z):(y)) +#define k8ifmsb(x,y,z) (k8sgn(x)?(y):(z)) diff --git a/src/vectors.h b/src/vectors.h index 2761202..6cb1238 100644 --- a/src/vectors.h +++ b/src/vectors.h @@ -61,6 +61,8 @@ # define vec_loadu_maybe3 vec4_loadu_maybe3 # define vec_store vec4_store # define vec_store_nta vec4_store_nta +# define vec_store_partial_prepare vec4_store_partial_prepare +# define vec_store_nta_partial vec4_store_nta_partial # define vec_store_nta_partial_lo vec4_store_nta_partial_lo # define vec_store_nta_partial_hi vec4_store_nta_partial_hi # define vec_store_nta_partial_mid vec4_store_nta_partial_mid @@ -90,8 +92,7 @@ # define ksqrt k4sqrt # define ktan k4tan -# define kifpos k4ifpos -# define kifneg k4ifneg +# define kifmsb k4ifmsb #elif defined(CCTK_REAL_PRECISION_8) @@ -111,7 +112,9 @@ # define vec_loadu_maybe vec8_loadu_maybe # define vec_loadu_maybe3 vec8_loadu_maybe3 # define vec_store vec8_store +# define vec_store_partial_prepare vec8_store_partial_prepare # define vec_store_nta vec8_store_nta +# define vec_store_nta_partial vec8_store_nta_partial # define vec_store_nta_partial_lo vec8_store_nta_partial_lo # define vec_store_nta_partial_hi vec8_store_nta_partial_hi # define vec_store_nta_partial_mid vec8_store_nta_partial_mid @@ -140,7 +143,7 @@ # define ksqrt k8sqrt # define ktan k8tan -# define kifpos k8ifpos +# define kifmsb k8ifmsb #else @@ -150,6 +153,11 @@ +#define kifneg(a,b,c) kifmsb(a,b,c) +#define kifpos(a,b,c) kifmsb(a,c,b) + + + #if CCTK_REAL_VEC_SIZE == 1 # define vec_index vec_set(0) #elif CCTK_REAL_VEC_SIZE == 2 @@ -310,20 +318,31 @@ struct vecprops<CCTK_REAL8> { // For Kranc -#undef KRANC_DIFF_FUNCTIONS -#if ! VECTORISE_INLINE -# define KRANC_DIFF_FUNCTIONS -#endif +#ifdef KRANC_C + +# undef KRANC_DIFF_FUNCTIONS +# if ! 
VECTORISE_INLINE +# define KRANC_DIFF_FUNCTIONS +# endif + +# undef E +# define E (ToReal(M_E)) -#undef Pi -#define Pi (ToReal(M_PI)) +# undef Pi +# define Pi (ToReal(M_PI)) -#undef ToReal -#define ToReal(x) (vec_set1((CCTK_REAL)(x))) +# undef Sign +# define Sign(x) -999999999 // poison -#undef Sign -#define Sign(x) -999999999 // poison +# undef ToReal +# define ToReal(x) (vec_set1((CCTK_REAL)(x))) +# undef KRANC_GFOFFSET3D +# define KRANC_GFOFFSET3D(var,i,j,k) \ + vec_loadu_maybe3((i),(j),(k), \ + *(CCTK_REAL const*)& \ + ((char const*)(var))[cdi*(i)+cdj*(j)+cdk*(k)]) +#endif // KRANC_C #endif // #ifndef VECTORS_H |