about summary refs log tree commit diff
diff options
context:
space:
mode:
author eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2012-02-05 00:20:49 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2012-02-05 00:20:49 +0000
commit 746a81a4430a69c5a09a509f8e5a85fbffb17305 (patch)
tree a62ddcc9e096b5619de8f2348282778bec23568b
parent 7e78ff525ffb9974d60161d446efb980bfe32cbb (diff)
Various changes
1. Implement a simplified partial store interface: implement vec_store_nta_partial, which offers a simpler interface, similar to the one used in OpenCL. 2. Add kifmsb function, and implement kifpos and kifneg in terms of this. 3. Update (and make safer) Kranc-specific code. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@47 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--  src/test.cc               19
-rw-r--r--  src/vectors-4-Altivec.h    4
-rw-r--r--  src/vectors-4-SSE.h       18
-rw-r--r--  src/vectors-4-default.h    3
-rw-r--r--  src/vectors-8-AVX.h       62
-rw-r--r--  src/vectors-8-DoubleHummer.h  2
-rw-r--r--  src/vectors-8-SSE2.h      49
-rw-r--r--  src/vectors-8-VSX.h        4
-rw-r--r--  src/vectors-8-default.h    4
-rw-r--r--  src/vectors.h             45
10 files changed, 165 insertions, 45 deletions
diff --git a/src/test.cc b/src/test.cc
index 5118497..a64456f 100644
--- a/src/test.cc
+++ b/src/test.cc
@@ -131,6 +131,18 @@ void Vectors_Test(CCTK_ARGUMENTS)
VECTEST("vec_store", sv, b[i]);
sv = av; vec_store_nta(*s, bv);
VECTEST("vec_store_nta", sv, b[i]);
+ for (int dlo=-1; dlo<=CCTK_REAL_VEC_SIZE; ++dlo) {
+ for (int dhi=dlo; dhi<=CCTK_REAL_VEC_SIZE; ++dhi) {
+ if (dlo>0 and dhi>dlo and dhi<CCTK_REAL_VEC_SIZE) {
+ sv = av;
+ vec_store_partial_prepare(0, dlo, dhi);
+ vec_store_nta_partial(*s, bv);
+ snprintf (testname, sizeof testname,
+ "vec_store_nta_partial[%d,%d]", dlo, dhi);
+ VECTEST(testname, sv, i>=dlo && i<dhi ? b[i] : a[i]);
+ }
+ }
+ }
/* The partial stores are not implemented for d==0 and
d==CCTK_REAL_VEC_SIZE-1 (because these are trivial) */
for (int d=1; d<CCTK_REAL_VEC_SIZE-1; ++d) {
@@ -183,6 +195,13 @@ void Vectors_Test(CCTK_ARGUMENTS)
VECTEST("kifpos 0", kifpos(vec_set1(0.),bv,cv), b[i]);
VECTEST("kifpos -0", kifpos(vec_set1(-0.),bv,cv), c[i]);
+ VECTEST("kifneg positive",
+ kifneg(av, bv, cv), my_signbit(a[i]) ? b[i] : c[i]);
+ VECTEST("kifneg negative",
+ kifneg(bv, bv, cv), my_signbit(b[i]) ? b[i] : c[i]);
+ VECTEST("kifneg 0", kifneg(vec_set1(0.),bv,cv), c[i]);
+ VECTEST("kifneg -0", kifneg(vec_set1(-0.),bv,cv), b[i]);
+
if (passed != numtests)
CCTK_VWarn(CCTK_WARN_ALERT, __LINE__, __FILE__, CCTK_THORNSTRING,
"Failed %d correctness tests", numtests - passed);
diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h
index 679c34e..45e91d3 100644
--- a/src/vectors-4-Altivec.h
+++ b/src/vectors-4-Altivec.h
@@ -161,5 +161,5 @@
#define k4pow(x,a) K4REPL2(pow,x,a)
#define k4sqrt(x) K4REPL(sqrt,x)
-#define k4ifpos(x,y,z) \
- (vec_sel((y), (z), vec_sra(vec_convert((x), &(vector int*)0), 31)))
+#define k4ifmsb(x,y,z) \
+ (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector int*)0), 31)))
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index 9152c55..9f32cea 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -326,14 +326,14 @@ static const union {
// Choice [sign(x)>0 ? y : z]
#ifdef __SSE4_1__
-# define k4ifpos(x,y,z) (_mm_blendv_ps(y,z,x))
+# define k4ifmsb(x,y,z) (_mm_blendv_ps(z,y,x))
#elif 0
# ifdef __cplusplus
# define k4sgn(x) ({ using namespace std; signbit(x); })
# else
# define k4sgn(x) (signbit(x))
# endif
-# define k4ifpos(x,y,z) \
+# define k4ifmsb(x,y,z) \
({ \
CCTK_REAL4_VEC const x__=(x_); \
CCTK_REAL4_VEC const y__=(y_); \
@@ -341,13 +341,13 @@ static const union {
CCTK_REAL4_VEC const x=x__; \
CCTK_REAL4_VEC const y=y__; \
CCTK_REAL4_VEC const z=z__; \
- vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(z) : vec4_elt0(y), \
- k4sgn(vec4_elt1(x)) ? vec4_elt1(z) : vec4_elt1(y), \
- k4sgn(vec4_elt2(x)) ? vec4_elt2(z) : vec4_elt2(y), \
- k4sgn(vec4_elt3(x)) ? vec4_elt3(z) : vec4_elt3(y)); \
+ vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \
+ k4sgn(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \
+ k4sgn(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \
+ k4sgn(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \
})
#else
-# define k4ifpos(x_,y_,z_) \
+# define k4ifmsb(x_,y_,z_) \
({ \
CCTK_REAL4_VEC const x__=(x_); \
CCTK_REAL4_VEC const y__=(y_); \
@@ -356,7 +356,7 @@ static const union {
CCTK_REAL4_VEC const y=y__; \
CCTK_REAL4_VEC const z=z__; \
CCTK_REAL4_VEC const mask = _mm_srai_epi32(x, 31); \
- /* (y & ~mask) | (z & mask) */ \
- _mm_or_ps(_mm_andnot_ps(mask, y), _mm_and_ps(mask, z)); \
+ /* (z & ~mask) | (y & mask) */ \
+ _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \
})
#endif
diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h
index a672b89..874f471 100644
--- a/src/vectors-4-default.h
+++ b/src/vectors-4-default.h
@@ -49,6 +49,7 @@
// Unaligned store
#define vec4_store_nta(p,x) ((p)=(x))
+#define vec4_store_nta_partial(p,x,i,imin,imax) (vec4_store_nta(p,x))
// Store the n lower elements of a vector to memory
#define vec4_store_nta_partial_lo(p,x,n) (assert(0))
// Store the n higher elements of a vector into memory. This stores
@@ -93,4 +94,4 @@
# define k4sgn(x) (signbit(x))
#endif
-#define k4ifpos(x,y,z) (k4sgn(x)?(z):(y))
+#define k4ifmsb(x,y,z) (k4sgn(x)?(y):(z))
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 144d3b5..274b376 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -113,8 +113,53 @@ union k8const_t {
# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
#endif
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin_,imax_) \
+ bool v8stp_all; \
+ __m256i v8stp_mask; \
+ ({ \
+ ptrdiff_t const imin1=(imin_); \
+ ptrdiff_t const imin=imin1; \
+ ptrdiff_t const imax1=(imax_); \
+ ptrdiff_t const imax=imax1; \
+ \
+ v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE<imax; \
+ \
+ if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ /* \
+ __m256i const v8stp_mask = \
+ _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \
+ vec_index), \
+ _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \
+ vec_index)); \
+ */ \
+ __m128i const termlo0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(0,1)); \
+ __m128i const termup0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(0,1)); \
+ __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
+ __m128i const termlo1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(2,3)); \
+ __m128i const termup1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(2,3)); \
+ __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
+ v8stp_mask = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
+ } \
+ })
+
+#define vec8_store_nta_partial(p,x) \
+ ({ \
+ if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ vec8_store_nta(p,x); \
+ } else { \
+ _mm256_maskstore_pd(&p,v8stp_mask,x); \
+ } \
+ })
+
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
+// Masks indicating which vector element should be stored:
static const k8const_t k8store_lo_union[5] =
{
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
@@ -131,7 +176,7 @@ static const k8const_t k8store_hi_union[5] =
{{ K8_ZERO, K8_IMIN, K8_IMIN, K8_IMIN, }},
{{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }},
};
-#if defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
# define vec8_store_nta_partial_lo(p,x,n) \
(_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x))
@@ -147,10 +192,11 @@ static const k8const_t k8store_hi_union[5] =
(_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x))
# define vec8_store_nta_partial_hi(p,x,n) \
(_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi, \
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo_union[nlo].vd, \
+ k8store_hi_union[nhi].vd)), \
x))
#endif
@@ -209,8 +255,12 @@ static const k8const_t k8abs_mask_union =
f(vec8_elt2(xfunc),afunc), \
f(vec8_elt3(xfunc),afunc)); \
})
+#define k8cos(x) K8REPL(cos,x)
#define k8exp(x) K8REPL(exp,x)
#define k8log(x) K8REPL(log,x)
#define k8pow(x,a) K8REPL2(pow,x,a)
+#define k8sin(x) K8REPL(sin,x)
+#define k8tan(x) K8REPL(tan,x)
-#define k8ifpos(x,y,z) (_mm256_blendv_pd(y,z,x))
+// Choice [sign(x)>0 ? y : z]
+#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x))
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h
index 951ca5d..fdc4be7 100644
--- a/src/vectors-8-DoubleHummer.h
+++ b/src/vectors-8-DoubleHummer.h
@@ -241,4 +241,4 @@
#define k8sin(x) K8REPL(sin,x)
#define k8tan(x) K8REPL(tan,x)
-#define k8ifpos(x,y,z) (__fpsel(x,z,y))
+#define k8ifmsb(x,y,z) (__fpsel(x,y,z))
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 7dc7d31..fe231b7 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -139,6 +139,34 @@
# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
#endif
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin,imax) \
+ bool const v8stp_lo = (i)>=(imin); \
+ bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE<(imax)
+#if VECTORISE_STREAMING_STORES && defined(__SSE4A__)
+# define vec8_store_nta_partial(p,x) \
+ ({ \
+ if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
+ vec8_store_nta(p,x); \
+ } else if (v8stp_lo) { \
+ _mm_stream_sd(&p,x); \
+ } else if (v8stp_hi) { \
+ _mm_stream_sd(&p+1, vec8_swap10(x)); \
+ } \
+ })
+#else
+# define vec8_store_nta_partial(p,x) \
+ ({ \
+ if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
+ vec8_store_nta(p,x); \
+ } else if (v8stp_lo) { \
+ _mm_storel_pd(&p,x); \
+ } else if (v8stp_hi) { \
+ _mm_storeh_pd(&p+1,x); \
+ } \
+ })
+#endif
+
// Store a lower or higher partial vector (aligned and non-temporal)
#if ! VECTORISE_STREAMING_STORES
# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
@@ -264,9 +292,10 @@ static const union {
// Choice [sign(x)>0 ? y : z]
#ifdef __SSE4_1__
-# define k8ifpos(x,y,z) (_mm_blendv_pd(y,z,x))
+# define k8ifmsb(x,y,z) (_mm_blendv_pd(z,y,x))
#elif 0
-# define k8ifpos(x_,y_,z_) \
+// This is slow
+# define k8ifmsb(x_,y_,z_) \
({ \
CCTK_REAL8_VEC const x__=(x_); \
CCTK_REAL8_VEC const y__=(y_); \
@@ -278,8 +307,8 @@ static const union {
CCTK_REAL8_VEC r; \
switch (m) { \
case 0: r = y; break; \
- case 1: r = _mm_move_sd(y,z); break; \
- case 2: r = _mm_move_sd(z,y); break; \
+ case 1: r = _mm_move_sd(y,z); break; \
+ case 2: r = _mm_move_sd(z,y); break; \
case 3: r = z; break; \
} \
r; \
@@ -290,7 +319,7 @@ static const union {
# else
# define k4sgn(x) (signbit(x))
# endif
-# define k8ifpos(x_,y_,z_) \
+# define k8ifmsb(x_,y_,z_) \
({ \
CCTK_REAL8_VEC const x__=(x_); \
CCTK_REAL8_VEC const y__=(y_); \
@@ -298,8 +327,8 @@ static const union {
CCTK_REAL8_VEC const x=x__; \
CCTK_REAL8_VEC const y=y__; \
CCTK_REAL8_VEC const z=z__; \
- vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \
- k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \
+ vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \
+ k8sgn(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \
})
#else
static const union {
@@ -307,7 +336,7 @@ static const union {
double d;
} k8one_union = { 0x1ULL };
# define k8one (k8one_union.d)
-# define k8ifpos(x_,y_,z_) \
+# define k8ifmsb(x_,y_,z_) \
({ \
CCTK_REAL8_VEC const x__=(x_); \
CCTK_REAL8_VEC const y__=(y_); \
@@ -319,7 +348,7 @@ static const union {
CCTK_REAL8_VEC const imask = \
(__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \
(__m128i)_mm_set1_pd(k8one)); \
- /* (y & ~mask) | (z & mask); imask = ~mask */ \
- _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \
+ /* (z & ~mask) | (y & mask); imask = ~mask */ \
+ _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \
})
#endif
diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h
index 5e64ef4..93249a5 100644
--- a/src/vectors-8-VSX.h
+++ b/src/vectors-8-VSX.h
@@ -108,5 +108,5 @@
#define k8pow(x,a) K8REPL2(pow,x,a)
#define k8sqrt(x) K8REPL(sqrt,x)
-#define k8ifpos(x,y,z) \
- (vec_sel((y), (z), vec_sra(vec_convert((x), &(vector long long*)0), 63)))
+#define k8ifmsb(x,y,z) \
+ (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63)))
diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h
index f980e74..ee85593 100644
--- a/src/vectors-8-default.h
+++ b/src/vectors-8-default.h
@@ -49,6 +49,8 @@
// Unaligned store
#define vec8_store_nta(p,x) ((p)=(x))
+#define vec8_store_partial_prepare(i,imin,imax) ((void)0)
+#define vec8_store_nta_partial(p,x) (vec8_store_nta(p,x))
// Store the n lower elements of a vector to memory
#define vec8_store_nta_partial_lo(p,x,n) (assert(0))
// Store the n higher elements of a vector into memory. This stores
@@ -92,4 +94,4 @@
# define k8sgn(x) (signbit(x))
#endif
-#define k8ifpos(x,y,z) (k8sgn(x)?(z):(y))
+#define k8ifmsb(x,y,z) (k8sgn(x)?(y):(z))
diff --git a/src/vectors.h b/src/vectors.h
index 2761202..6cb1238 100644
--- a/src/vectors.h
+++ b/src/vectors.h
@@ -61,6 +61,8 @@
# define vec_loadu_maybe3 vec4_loadu_maybe3
# define vec_store vec4_store
# define vec_store_nta vec4_store_nta
+# define vec_store_partial_prepare vec4_store_partial_prepare
+# define vec_store_nta_partial vec4_store_nta_partial
# define vec_store_nta_partial_lo vec4_store_nta_partial_lo
# define vec_store_nta_partial_hi vec4_store_nta_partial_hi
# define vec_store_nta_partial_mid vec4_store_nta_partial_mid
@@ -90,8 +92,7 @@
# define ksqrt k4sqrt
# define ktan k4tan
-# define kifpos k4ifpos
-# define kifneg k4ifneg
+# define kifmsb k4ifmsb
#elif defined(CCTK_REAL_PRECISION_8)
@@ -111,7 +112,9 @@
# define vec_loadu_maybe vec8_loadu_maybe
# define vec_loadu_maybe3 vec8_loadu_maybe3
# define vec_store vec8_store
+# define vec_store_partial_prepare vec8_store_partial_prepare
# define vec_store_nta vec8_store_nta
+# define vec_store_nta_partial vec8_store_nta_partial
# define vec_store_nta_partial_lo vec8_store_nta_partial_lo
# define vec_store_nta_partial_hi vec8_store_nta_partial_hi
# define vec_store_nta_partial_mid vec8_store_nta_partial_mid
@@ -140,7 +143,7 @@
# define ksqrt k8sqrt
# define ktan k8tan
-# define kifpos k8ifpos
+# define kifmsb k8ifmsb
#else
@@ -150,6 +153,11 @@
+#define kifneg(a,b,c) kifmsb(a,b,c)
+#define kifpos(a,b,c) kifmsb(a,c,b)
+
+
+
#if CCTK_REAL_VEC_SIZE == 1
# define vec_index vec_set(0)
#elif CCTK_REAL_VEC_SIZE == 2
@@ -310,20 +318,31 @@ struct vecprops<CCTK_REAL8> {
// For Kranc
-#undef KRANC_DIFF_FUNCTIONS
-#if ! VECTORISE_INLINE
-# define KRANC_DIFF_FUNCTIONS
-#endif
+#ifdef KRANC_C
+
+# undef KRANC_DIFF_FUNCTIONS
+# if ! VECTORISE_INLINE
+# define KRANC_DIFF_FUNCTIONS
+# endif
+
+# undef E
+# define E (ToReal(M_E))
-#undef Pi
-#define Pi (ToReal(M_PI))
+# undef Pi
+# define Pi (ToReal(M_PI))
-#undef ToReal
-#define ToReal(x) (vec_set1((CCTK_REAL)(x)))
+# undef Sign
+# define Sign(x) -999999999 // poison
-#undef Sign
-#define Sign(x) -999999999 // poison
+# undef ToReal
+# define ToReal(x) (vec_set1((CCTK_REAL)(x)))
+# undef KRANC_GFOFFSET3D
+# define KRANC_GFOFFSET3D(var,i,j,k) \
+ vec_loadu_maybe3((i),(j),(k), \
+ *(CCTK_REAL const*)& \
+ ((char const*)(var))[cdi*(i)+cdj*(j)+cdk*(k)])
+#endif // KRANC_C
#endif // #ifndef VECTORS_H