aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoreschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>2012-08-11 20:55:54 +0000
committereschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>2012-08-11 20:55:54 +0000
commit4b4418d0efb5b06784e89f617701c7b7cd5f6059 (patch)
treea2be52c8fbe23ed0b9864538ea81fc62aa7ca036
parent95156e0c84a5e3282b27aafce4199f77a2fecf14 (diff)
Add ksgn function (vectorised version of Kranc's Sign)
All architectures: Add copysign and sgn functions. Remove pos function (which does nothing). Add support for Blue Gene/Q (QPX instructions). Correct errors in AVX instructions. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@62 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--src/test.cc62
-rw-r--r--src/vectors-4-Altivec.h1
-rw-r--r--src/vectors-4-SSE.h82
-rw-r--r--src/vectors-4-default.h57
-rw-r--r--src/vectors-8-AVX.h80
-rw-r--r--src/vectors-8-DoubleHummer.h32
-rw-r--r--src/vectors-8-QPX.h311
-rw-r--r--src/vectors-8-SSE2.h101
-rw-r--r--src/vectors-8-VSX.h12
-rw-r--r--src/vectors-8-default.h56
-rw-r--r--src/vectors.h117
11 files changed, 689 insertions, 222 deletions
diff --git a/src/test.cc b/src/test.cc
index 924e4ec..73dd9e2 100644
--- a/src/test.cc
+++ b/src/test.cc
@@ -11,6 +11,14 @@
using namespace std;
+
+
+inline CCTK_REAL my_copysign (CCTK_REAL const x, CCTK_REAL const y)
+{
+ using namespace std;
+ return copysign(x, y);
+}
+
inline int my_isnan (CCTK_REAL const x)
{
return std::isnan(x);
@@ -21,6 +29,14 @@ inline int my_signbit (CCTK_REAL const x)
return std::signbit(x);
}
+inline CCTK_REAL my_sgn (CCTK_REAL const x)
+{
+ using namespace std;
+ return x == (CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : my_copysign((CCTK_REAL)1.0, x);
+}
+
+
+
#define SCALARTEST(testname, vecexpr, scalarexpr) \
do { \
if (verbose) { \
@@ -67,6 +83,8 @@ inline int my_signbit (CCTK_REAL const x)
} \
} while(0)
+
+
extern "C"
void Vectors_Test(CCTK_ARGUMENTS)
{
@@ -215,27 +233,29 @@ void Vectors_Test(CCTK_ARGUMENTS)
VECTEST("knmadd", knmadd(av, bv, cv), -a[i] * b[i] - c[i] );
VECTEST("knmsub", knmsub(av, bv, cv), -a[i] * b[i] + c[i] );
- VECTEST("kacos", kacos(xv), acos(x[i]) );
- VECTEST("kacosh", kacosh(zv), acosh(z[i]) );
- VECTEST("kasin", kasin(xv), asin(x[i]) );
- VECTEST("kasinh", kasinh(xv), asinh(x[i]) );
- VECTEST("katan", katan(xv), atan(x[i]) );
- VECTEST("katan2", katan2(xv, yv), atan2(x[i], y[i]) );
- VECTEST("katanh", katanh(xv), atanh(x[i]) );
- VECTEST("kcos", kcos(xv), cos(x[i]) );
- VECTEST("kcosh", kcosh(xv), cosh(x[i]) );
- VECTEST("kexp", kexp(xv), exp(x[i]) );
- VECTEST("kfabs", kfabs(xv), fabs(x[i]) );
- VECTEST("kfmax", kfmax(xv, yv), fmax(x[i], y[i]) );
- VECTEST("kfmin", kfmin(xv, yv), fmin(x[i], y[i]) );
- VECTEST("kfnabs", kfnabs(xv), -fabs(x[i]) );
- VECTEST("klog", klog(yv), log(y[i]) );
- VECTEST("kpow", kpow(yv, x[0]), pow(y[i], x[0]) );
- VECTEST("ksin", ksin(xv), sin(x[i]) );
- VECTEST("ksinh", ksinh(xv), sinh(x[i]) );
- VECTEST("ksqrt", ksqrt(yv), sqrt(y[i]) );
- VECTEST("ktan", ktan(xv), tan(x[i]) );
- VECTEST("ktanh", ktanh(xv), tanh(x[i]) );
+ VECTEST("kacos", kacos(xv), acos(x[i]) );
+ VECTEST("kacosh", kacosh(zv), acosh(z[i]) );
+ VECTEST("kasin", kasin(xv), asin(x[i]) );
+ VECTEST("kasinh", kasinh(xv), asinh(x[i]) );
+ VECTEST("katan", katan(xv), atan(x[i]) );
+ VECTEST("katan2", katan2(xv, yv), atan2(x[i], y[i]) );
+ VECTEST("katanh", katanh(xv), atanh(x[i]) );
+ VECTEST("kcopysign", kcopysign(xv, yv), copysign(x[i], y[i]));
+ VECTEST("kcos", kcos(xv), cos(x[i]) );
+ VECTEST("kcosh", kcosh(xv), cosh(x[i]) );
+ VECTEST("kexp", kexp(xv), exp(x[i]) );
+ VECTEST("kfabs", kfabs(xv), fabs(x[i]) );
+ VECTEST("kfmax", kfmax(xv, yv), fmax(x[i], y[i]) );
+ VECTEST("kfmin", kfmin(xv, yv), fmin(x[i], y[i]) );
+ VECTEST("kfnabs", kfnabs(xv), -fabs(x[i]) );
+ VECTEST("klog", klog(yv), log(y[i]) );
+ VECTEST("kpow", kpow(yv, x[0]), pow(y[i], x[0]) );
+ VECTEST("ksin", ksin(xv), sin(x[i]) );
+ VECTEST("ksinh", ksinh(xv), sinh(x[i]) );
+ VECTEST("ksgn", ksgn(xv), my_sgn(x[i]) );
+ VECTEST("ksqrt", ksqrt(yv), sqrt(y[i]) );
+ VECTEST("ktan", ktan(xv), tan(x[i]) );
+ VECTEST("ktanh", ktanh(xv), tanh(x[i]) );
VECTEST("kifpos positive",
kifpos(av, bv, cv), my_signbit(a[i]) ? c[i] : b[i]);
diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h
index 78c07ca..d6281c8 100644
--- a/src/vectors-4-Altivec.h
+++ b/src/vectors-4-Altivec.h
@@ -114,7 +114,6 @@
// Functions and operators
// Operators
-#define k4pos(x) (+(x))
#define k4neg(x) (-(x))
#define k4add(x,y) ((x)+(y))
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index 68388b6..9af7189 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -50,6 +50,19 @@
+union k4const_t {
+ unsigned i[4];
+ float f[4];
+ __m128i vi;
+ __m128 vf;
+};
+
+#define K4_ZERO 0x00000000UL
+#define K4_IMIN 0x80000000UL
+#define K4_IMAX 0x7fffffffUL
+
+
+
// Create vectors, extract vector elements
#define vec4_set1(a) (_mm_set1_ps(a))
@@ -272,15 +285,10 @@
// Functions and operators
-static const union {
- unsigned i[4];
- __m128 v;
-} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
-#define k4sign_mask (k4sign_mask_union.v)
+static const k4const_t k4sign_mask = {{ K4_IMIN, K4_IMIN, K4_IMIN, K4_IMIN, }};
// Operators
-#define k4pos(x) (x)
-#define k4neg(x) (_mm_xor_ps(k4sign_mask,x))
+#define k4neg(x) (_mm_xor_ps(k4sign_mask.vf,x))
// #define k4inv(x)
// TODO: provide k4inv via rcp and Newton-Raphson
// This is described in AMD's publication 47414.
@@ -306,10 +314,24 @@ static const union {
#endif
// Cheap functions
-#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x))
+#define k4copysign(x,y) \
+ (_mm_or_ps(_mm_andnot_ps(k4sign_mask.vf,x), \
+ _mm_and_ps(k4sign_mask.vf,y)))
+#define k4fabs(x) (_mm_andnot_ps(k4sign_mask.vf,x))
#define k4fmax(x,y) (_mm_max_ps(x,y))
#define k4fmin(x,y) (_mm_min_ps(x,y))
-#define k4fnabs(x) (_mm_or_ps(k4sign_mask,x))
+#define k4fnabs(x) (_mm_or_ps(k4sign_mask.vf,x))
+static const k4const_t k4zero = { f: { 0.0f, 0.0f, 0.0f, 0.0f, }};
+static const k4const_t k4one = { f: { 1.0f, 1.0f, 1.0f, 1.0f, }};
+#define k4sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC const x__=(x_); \
+ CCTK_REAL_VEC const x=x__; \
+ CCTK_REAL_VEC const iszero = _mm_cmpeq_ps(k4zero.vf, x); \
+ CCTK_REAL_VEC const sign = _mm_and_ps(k4sign_mask.vf, x); \
+ CCTK_REAL_VEC const signedone = _mm_or_ps(k4one.vf, sign); \
+ k4ifthen(iszero, k4zero.vf, signedone); \
+ })
// TODO: maybe use rsqrt and Newton-Raphson
#define k4sqrt(x) (_mm_sqrt_ps(x))
@@ -363,16 +385,22 @@ static const union {
#define k4tan(x) K4REPL(tanf,x)
#define k4tanh(x) K4REPL(tanhf,x)
-// Choice [sign(x)>0 ? y : z]
+static const k4const_t k4lfalse = {{ +0U, +0U, +0U, +0U, }};
+static const k4const_t k4ltrue = {{ -1U, -1U, -1U, -1U, }};
+#define k4lnot(x) (_mm_xor_ps(k4ltrue,x))
+#define k4land(x,y) (_mm_and_ps(x,y))
+#define k4lor(x,y) (_mm_or_ps(x,y))
+#define k4lxor(x,y) (_mm_xor_ps(x,y))
+
#ifdef __SSE4_1__
-# define k4ifmsb(x,y,z) (_mm_blendv_ps(z,y,x))
+# define k4ifthen(x,y,z) (_mm_blendv_ps(z,y,x))
#elif 0
# ifdef __cplusplus
-# define k4sgn(x) ({ using namespace std; signbit(x); })
+# define k4signbit(x) ({ using namespace std; signbit(x); })
# else
-# define k4sgn(x) (signbitf(x))
+# define k4signbit(x) (signbitf(x))
# endif
-# define k4ifmsb(x,y,z) \
+# define k4ifthen(x,y,z) \
({ \
CCTK_REAL4_VEC const x__=(x_); \
CCTK_REAL4_VEC const y__=(y_); \
@@ -380,13 +408,15 @@ static const union {
CCTK_REAL4_VEC const x=x__; \
CCTK_REAL4_VEC const y=y__; \
CCTK_REAL4_VEC const z=z__; \
- vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \
- k4sgn(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \
- k4sgn(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \
- k4sgn(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \
+ vec4_set(k4signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \
+ k4signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \
+ k4signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \
+ k4signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \
})
-#else
-# define k4ifmsb(x_,y_,z_) \
+#elif 0
+// We don't need to shift -- the condition (mask) will be either all
+// zeros or all ones
+# define k4ifthen(x_,y_,z_) \
({ \
CCTK_REAL4_VEC const x__=(x_); \
CCTK_REAL4_VEC const y__=(y_); \
@@ -399,4 +429,16 @@ static const union {
/* (z & ~mask) | (y & mask) */ \
_mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \
})
+#else
+# define k4ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL4_VEC const x__=(x_); \
+ CCTK_REAL4_VEC const y__=(y_); \
+ CCTK_REAL4_VEC const z__=(z_); \
+ CCTK_REAL4_VEC const x=x__; \
+ CCTK_REAL4_VEC const y=y__; \
+ CCTK_REAL4_VEC const z=z__; \
+ /* (z & ~mask) | (y & mask) where imask = ~mask */ \
+ _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); \
+ })
#endif
diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h
index ff675f6..d1b2135 100644
--- a/src/vectors-4-default.h
+++ b/src/vectors-4-default.h
@@ -64,7 +64,6 @@
// Operators
-#define k4pos(x) (+(x))
#define k4neg(x) (-(x))
#define k4add(x,y) ((x)+(y))
@@ -79,32 +78,40 @@
#define k4nmsub(x,y,z) (-(x)*(y)+(z))
// Functions
-#define k4acos(x) (acosf(x))
-#define k4acosh(x) (acoshf(x))
-#define k4asin(x) (asinf(x))
-#define k4asinh(x) (asinhf(x))
-#define k4atan(x) (atanf(x))
-#define k4atan2(x,y) (atan2f(x,y))
-#define k4atanh(x) (atanhf(x))
-#define k4cos(x) (cosf(x))
-#define k4cosh(x) (coshf(x))
-#define k4exp(x) (expf(x))
-#define k4fabs(x) (fabsf(x))
-#define k4fmax(x,y) (fmaxf(x,y))
-#define k4fmin(x,y) (fminf(x,y))
-#define k4fnabs(x) (-fabsf(x))
-#define k4log(x) (logf(x))
-#define k4pow(x,a) (powf(x,a))
-#define k4sin(x) (sinf(x))
-#define k4sinh(x) (sinhf(x))
-#define k4sqrt(x) (sqrtf(x))
-#define k4tan(x) (tanf(x))
-#define k4tanh(x) (tanhf(x))
+#define k4acos(x) (acosf(x))
+#define k4acosh(x) (acoshf(x))
+#define k4asin(x) (asinf(x))
+#define k4asinh(x) (asinhf(x))
+#define k4atan(x) (atanf(x))
+#define k4atan2(x,y) (atan2f(x,y))
+#define k4atanh(x) (atanhf(x))
+#define k4copysign(x,y) (copysign(x,y))
+#define k4cos(x) (cosf(x))
+#define k4cosh(x) (coshf(x))
+#define k4exp(x) (expf(x))
+#define k4fabs(x) (fabsf(x))
+#define k4fmax(x,y) (fmaxf(x,y))
+#define k4fmin(x,y) (fminf(x,y))
+#define k4fnabs(x) (-fabsf(x))
+#define k4log(x) (logf(x))
+#define k4pow(x,a) (powf(x,a))
+#define k4sin(x) (sinf(x))
+#define k4sinh(x) (sinhf(x))
+#define k4sqrt(x) (sqrtf(x))
+#define k4tan(x) (tanf(x))
+#define k4tanh(x) (tanhf(x))
+
+#define k4sgn(x_) \
+ ({ \
+ CCTK_REAL x__=(x_); \
+ CCTK_REAL x=x__; \
+ x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : copysign((CCTK_REAL)1.0, x); \
+ })
#ifdef __cplusplus
-# define k4sgn(x) ({ using namespace std; signbit(x); })
+# define k4signbit(x) ({ using namespace std; signbit(x); })
#else
-# define k4sgn(x) (signbit(x))
+# define k4signbit(x) (signbit(x))
#endif
-#define k4ifmsb(x,y,z) (k4sgn(x)?(y):(z))
+#define k4ifthen(x,y,z) (k4signbit(x)?(y):(z))
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 9e1d98b..825f2d3 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -35,9 +35,9 @@
union k8const_t {
unsigned long long i[4];
- double d[4];
+ double f[4];
__m256i vi;
- __m256d vd;
+ __m256d vf;
};
#define K8_ZERO 0x0000000000000000ULL
@@ -118,12 +118,12 @@ union k8const_t {
bool v8stp_all; \
__m256i v8stp_mask; \
({ \
- ptrdiff_t const imin1=(imin_); \
- ptrdiff_t const imin=imin1; \
- ptrdiff_t const imax1=(imax_); \
- ptrdiff_t const imax=imax1; \
+ ptrdiff_t const imin__=(imin_); \
+ ptrdiff_t const imin=imin__; \
+ ptrdiff_t const imax__=(imax_); \
+ ptrdiff_t const imax=imax__; \
\
- v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE<imax; \
+ v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1<imax; \
\
if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
/* \
@@ -134,14 +134,14 @@ union k8const_t {
vec_index)); \
*/ \
__m128i const termlo0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(0,1)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0)); \
__m128i const termup0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(0,1)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0)); \
__m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
__m128i const termlo1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(2,3)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2)); \
__m128i const termup1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(2,3)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2)); \
__m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
v8stp_mask = \
_mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
@@ -160,7 +160,7 @@ union k8const_t {
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
// Masks indicating which vector element should be stored:
-static const k8const_t k8store_lo_union[5] =
+static const k8const_t k8store_lo[5] =
{
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
{{ K8_IMIN, K8_ZERO, K8_ZERO, K8_ZERO, }},
@@ -168,7 +168,7 @@ static const k8const_t k8store_lo_union[5] =
{{ K8_IMIN, K8_IMIN, K8_IMIN, K8_ZERO, }},
{{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }},
};
-static const k8const_t k8store_hi_union[5] =
+static const k8const_t k8store_hi[5] =
{
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_IMIN, }},
@@ -179,24 +179,24 @@ static const k8const_t k8store_hi_union[5] =
#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x))
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo[n].vi),x))
# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi_union[n].vi),x))
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi[n].vi),x))
# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
(_mm256_maskstore_pd \
(&(p), \
- _mm256_castsi256_pd(k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi), \
+ _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), \
x))
#else
# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x))
+ (_mm256_maskstore_pd(&(p),k8store_lo[n].vi,x))
# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- _mm256_castpd_si256(_mm256_and_pd(k8store_lo_union[nlo].vd, \
- k8store_hi_union[nhi].vd)), \
+ (_mm256_maskstore_pd(&(p),k8store_hi[n].vi,x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, \
+ k8store_hi[nhi].vf)), \
x))
#endif
@@ -204,13 +204,10 @@ static const k8const_t k8store_hi_union[5] =
// Functions and operators
-static const k8const_t k8sign_mask_union =
- {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
-static const k8const_t k8abs_mask_union =
- {{ K8_IMAX, K8_IMAX, K8_IMAX, K8_IMAX, }};
+static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
// Operators
-#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask_union.vd))
+#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask.vf))
#define k8add(x,y) (_mm256_add_pd(x,y))
#define k8sub(x,y) (_mm256_sub_pd(x,y))
@@ -231,10 +228,24 @@ static const k8const_t k8abs_mask_union =
#endif
// Cheap functions
-#define k8fabs(x) (_mm256_and_pd(x,k8abs_mask_union.vd))
+#define k8copysign(x,y) \
+ (_mm256_or_pd(_mm256_andnot_pd(k8sign_mask.vf,x), \
+ _mm256_and_pd(k8sign_mask.vf,y)))
+#define k8fabs(x) (_mm256_andnot_pd(k8sign_mask.vf,x))
#define k8fmax(x,y) (_mm256_max_pd(x,y))
#define k8fmin(x,y) (_mm256_min_pd(x,y))
-#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask_union.vd))
+#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask.vf))
+static const k8const_t k8zero = { f: { 0.0, 0.0, 0.0, 0.0, }};
+static const k8const_t k8one = { f: { 1.0, 1.0, 1.0, 1.0, }};
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC x__=(x_); \
+ CCTK_REAL_VEC x=x__; \
+ CCTK_REAL_VEC iszero = _mm256_cmp_pd(x, k8zero.vf, _CMP_EQ_OQ); \
+ CCTK_REAL_VEC sign = _mm256_and_pd(k8sign_mask.vf, x); \
+ CCTK_REAL_VEC signedone = _mm256_or_pd(sign, k8one.vf); \
+ k8ifthen(iszero, k8zero.vf, signedone); \
+ })
#define k8sqrt(x) (_mm256_sqrt_pd(x))
// Expensive functions
@@ -287,5 +298,10 @@ static const k8const_t k8abs_mask_union =
#define k8tan(x) K8REPL(tan,x)
#define k8tanh(x) K8REPL(tanh,x)
-// Choice [sign(x)>0 ? y : z]
-#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x))
+static const k8const_t k8lfalse = {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }};
+static const k8const_t k8ltrue = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
+#define k8lnot(x) (_mm256_xor_pd(k8sign_mask,x))
+#define k8land(x,y) (_mm256_and_pd(x,y))
+#define k8lor(x,y) (_mm256_or_pd(x,y))
+#define k8lxor(x,y) (_mm256_xor_pd(x,y))
+#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x))
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h
index b0a1adf..ffb04b4 100644
--- a/src/vectors-8-DoubleHummer.h
+++ b/src/vectors-8-DoubleHummer.h
@@ -23,6 +23,14 @@
+union k8const_t {
+ double f[2];
+ unsigned long long i[2];
+ CCTK_REAL8_VEC vf;
+};
+
+
+
// Create vectors, extract vector elements
#define vec8_set1(a) (__cmplx(a,a))
@@ -107,7 +115,7 @@
({ \
CCTK_REAL8 const& p__=(p_); \
CCTK_REAL8 const& p=p__; \
- ((off2) % CCTK_REAL8_VEC_SIZE != 0 || \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
(off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
vec8_loadu(p) : \
vec8_loadu_maybe(off1,p); \
@@ -205,6 +213,20 @@
__fpsel(k8sub(x,y),x,y); \
})
#define k8fnabs(x) (__fpnabs(x))
+static const k8const_t k8zero = {{ 0.0, 0.0, }};
+static const k8const_t k8one = {{ +1.0, +1.0, }};
+static const k8const_t k8mone = {{ -1.0, -1.0, }};
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC x__=(x_); \
+ CCTK_REAL_VEC x=x__; \
+ /* TODO: this assumes that __fpsel says -0>=+0; \
+ if this is not so, we need k8abs(x) instead of x for iszero. */ \
+ CCTK_REAL_VEC iszero = k8land(__fpsel(x, k8lfalse.vf, k8ltrue.vf), \
+ __fpsel(k8neg(x), k8lfalse.vf, k8ltrue.vf)); \
+ CCTK_REAL_VEC signedone = __fpsel(x, k8mone.vf, k8one.vf); \
+ k8ifthen(iszero, k8zero.vf, signedone); \
+ })
// Estimate for reciprocal square root
#define k8rsqrt_init(x) (__fprsqrte(x))
// One Newton iteration for reciprocal square root
@@ -279,4 +301,10 @@
#define k8tan(x) K8REPL(tan,x)
#define k8tanh(x) K8REPL(tanh,x)
-#define k8ifmsb(x,y,z) (__fpsel(x,y,z))
+static const k8const_t k8lfalse = {{ -1.0, -1.0, }};
+static const k8const_t k8ltrue = {{ +1.0, +1.0, }};
+#define k8lnot(x) (__fpneg(x))
+#define k8land(x,y) (k8ifthen(x,y,k8lfalse.vf))
+#define k8lor(x,y) (k8ifthen(x,k8ltrue.vf,y))
+#define k8lxor(x,y) (k8ifthen(x,k8lnot(y),y))
+#define k8ifthen(x,y,z) (__fpsel(x,z,y))
diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h
new file mode 100644
index 0000000..80762fe
--- /dev/null
+++ b/src/vectors-8-QPX.h
@@ -0,0 +1,311 @@
+// Vectorise using IBM's Blue Gene/Q QPX (Power)
+
+// Use the type vector4double directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+// Note: bgxlC_r does not like const declarations, so we need to cast
+// them away and/or omit them everywhere
+
+
+
+#include <assert.h>
+
+#ifdef __cplusplus
+# include <builtins.h>
+#endif
+
+
+
+#define vec8_architecture "QPX"
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC vector4double
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 4
+
+
+
+union k8const_t {
+ CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE];
+ CCTK_REAL8_VEC vf;
+};
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (vec_splats(a))
+#define vec8_set(a,b,c,d) ((vector4double){a,b,c,d})
+
+#define vec8_elt0(x) (vec_extract(x,0))
+#define vec8_elt1(x) (vec_extract(x,1))
+#define vec8_elt2(x) (vec_extract(x,2))
+#define vec8_elt3(x) (vec_extract(x,3))
+#define vec8_elt(x,d) (vec_extract(x,d))
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (vec_lda(0,(CCTK_REAL8*)&(p)))
+#define vec8_loadu(p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \
+ CCTK_REAL8_VEC v1, v2, vp; \
+ v1 = vec_ld(0,&p); /* load the left part of the vector */ \
+ v2 = vec_ld(32,&p); /* load the right part of the vector */ \
+ vp = vec_lvsl(0,&p); /* generate control value */ \
+ vec_perm(v1,v2,vp); /* generate the aligned vector */ \
+ })
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec8_loadu_maybe(off,p) vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+ vec8_load(p) : \
+ vec8_loadu(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+ vec8_loadu(p) : \
+ vec8_loadu_maybe(off1,p); \
+ })
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (vec_sta(x,0,&(p)))
+#define vec8_storeu(p_,x_) \
+ ({ \
+ CCTK_REAL8 & p__=(p_); \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8 & p=p__; \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \
+ /* generate insert masks */ \
+ vp = vec_lvsr(0,&p); \
+ m1 = k8lfalse; \
+ m2 = k8ltrue; \
+ m3 = vec_perm(m1,m2,vp); \
+ /* get existing data */ \
+ v1 = vec_ld(0,&p); \
+ v2 = vec_ld(32,&p); \
+ /* permute and insert */ \
+ v3 = vec_perm(x,x,vp); \
+ v1 = vec_sel(v1,v3,m3); \
+ v2 = vec_sel(v3,v2,m3); \
+ /* store data back */ \
+ vec_st(0,&p,v1); \
+ vec_st(32,&p,v2); \
+ })
+#define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache
+
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin_,imax_) \
+ bool v8stp_all; \
+ CCTK_REAL8_VEC v8stp_mask; \
+ ({ \
+ ptrdiff_t const imin__=(imin_); \
+ ptrdiff_t const imax__=(imax_); \
+ ptrdiff_t const imin=imin__; \
+ ptrdiff_t const imax=imax__; \
+ \
+ v8stp_all = i-imin>=0 and i-imax<=-CCTK_REAL8_VEC_SIZE; \
+ \
+ if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ CCTK_REAL8_VEC vp_lo, vp_hi, mask_lo, mask_hi; \
+ vp_lo = vec_lvsl(i-imin, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \
+ mask_lo = (i-imin>=0 ? \
+ k8ltrue : \
+ vec_perm(k8lfalse, k8ltrue, vp_lo)); \
+ vp_hi = vec_lvsl(i-imax, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \
+ mask_hi = (i-imax<=-CCTK_REAL8_VEC_SIZE ? \
+ k8ltrue : \
+ vec_perm(k8ltrue, k8lfalse, vp_hi)); \
+ v8stp_mask = vec_and(mask_lo, mask_hi); \
+ } \
+ })
+#define vec8_store_nta_partial(p_,x_) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ vec8_store(p, x); \
+ } else { \
+ vec8_store(p, vec_sel(vec8_load(p), x, v8stp_mask)); \
+ } \
+ })
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec8_store_nta_partial_lo(p_,x_,n) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC vp_hi, mask_hi; \
+ vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-n, (CCTK_REAL*)0); \
+ mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \
+ vec8_store(p, vec_sel(vec8_load(p), x, mask_hi)); \
+ })
+#define vec8_store_nta_partial_hi(p_,x_,n) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8& p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC vp_lo, mask_lo; \
+ vp_lo = vec_lvsl(n, (CCTK_REAL*)0); \
+ mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \
+ vec8_store(p, vec_sel(vec8_load(p), x, mask_lo)); \
+ })
+#define vec8_store_nta_partial_mid(p_,x_,nlo,nhi) \
+ ({ \
+ CCTK_REAL8& p__=(p_); \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8 p=p__; \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC vp_lo, mask_lo; \
+ vp_lo = vec_lvsl(nhi, (CCTK_REAL*)0); \
+ mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \
+ CCTK_REAL8_VEC vp_hi, mask_hi; \
+ vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-nlo, (CCTK_REAL*)0); \
+ mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \
+ CCTK_REAL8_VEC mask; \
+ mask = vec_and(mask_lo, mask_hi); \
+ vec8_store(p, vec_sel(vec8_load(p), x, mask)); \
+ })
+
+
+
+// Functions and operators
+
+// Operators
+#define k8neg(x) (vec_neg(x))
+
+#define k8add(x,y) (vec_add(x,y))
+#define k8sub(x,y) (vec_sub(x,y))
+#define k8mul(x,y) (vec_mul(x,y))
+#define k8div(x,y) (vec_swdiv_nochk(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (vec_madd(z,x,y))
+#define k8msub(x,y,z) (vec_msub(z,x,y))
+#define k8nmadd(x,y,z) (vec_nmadd(z,x,y))
+#define k8nmsub(x,y,z) (vec_nmsub(z,x,y))
+
+// Cheap functions
+#define k8copysign(x,y) (vec_cpsgn(y,x))
+#define k8fabs(x) (vec_abs(x))
+#define k8fmax(x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8_VEC y__=(y_); \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC y=y__; \
+ vec_sel(vec_cmpgt(y,x),y,x); \
+ })
+#define k8fmin(x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8_VEC y__=(y_); \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC y=y__; \
+ vec_sel(vec_cmplt(y,x),y,x); \
+ })
+#define k8fnabs(x) (vec_nabs(x))
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC one, zero, iszero; \
+ one = k8ltrue; \
+ zero = vec_sub(one, one); \
+ iszero = vec_cmpeq(x, zero); \
+ k8ifthen(iszero, zero, vec_cpsgn(one, x)); \
+ })
+#define k8sqrt(x) (vec_swsqrt_nochk(x))
+
+// Expensive functions
+#define K8REPL(f,x_) \
+ ({ \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8_VEC x=x__; \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)), \
+ f(vec8_elt2(x)), \
+ f(vec8_elt3(x))); \
+ })
+#define K8REPL2S(f,x_,a_) \
+ ({ \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8 a__=(a_); \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8 a=a__; \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a), \
+ f(vec8_elt2(x),a), \
+ f(vec8_elt3(x),a)); \
+ })
+#define K8REPL2(f,x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC x__=(x_); \
+ CCTK_REAL8_VEC y__=(y_); \
+ CCTK_REAL8_VEC x=x__; \
+ CCTK_REAL8_VEC y=y__; \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)), \
+ f(vec8_elt2(x),vec8_elt2(y)), \
+ f(vec8_elt3(x),vec8_elt3(y))); \
+ })
+
+#define k8acos(x) K8REPL(acos,x)
+#define k8acosh(x) K8REPL(acosh,x)
+#define k8asin(x) K8REPL(asin,x)
+#define k8asinh(x) K8REPL(asinh,x)
+#define k8atan(x) K8REPL(atan,x)
+#define k8atan2(x,y) K8REPL2(atan2,x,y)
+#define k8atanh(x) K8REPL(atanh,x)
+#define k8cos(x) K8REPL(cos,x)
+#define k8cosh(x) K8REPL(cosh,x)
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2S(pow,x,a)
+#define k8sin(x) K8REPL(sin,x)
+#define k8sinh(x) K8REPL(sinh,x)
+#define k8tan(x) K8REPL(tan,x)
+#define k8tanh(x) K8REPL(tanh,x)
+
+#define k8lfalse \
+ ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0x0); })
+#define k8ltrue \
+ ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0xf); })
+#define k8lnot(x) (vec_not(x))
+#define k8land(x,y) (vec_and(x,y))
+#define k8lor(x,y) (vec_or(x,y))
+#define k8lxor(x,y) (vec_xor(x,y))
+#define k8ifthen(x,y,z) (vec_sel(z,x,y))
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 4138a18..0b301e8 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -60,6 +60,17 @@
+union k8const_t {
+ long long i[2];
+ double f[2];
+ __m128i vi;
+ __m128d vf;
+};
+
+#define K8_IMIN ((long long)0x8000000000000000ULL)
+
+
+
// Create vectors, extract vector elements
#define vec8_set1(a) (_mm_set1_pd(a))
@@ -216,16 +227,7 @@
// Functions and operators
-// static const union {
-// unsigned long long i[2];
-// __m128d v;
-// } k8all_mask_union = {{ 0xfffffffffffffffULL, 0xfffffffffffffffULL }};
-// #define k8all_mask (k8all_mask_union.v)
-static const union {
- unsigned long long i[2];
- __m128d v;
-} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }};
-#define k8sign_mask (k8sign_mask_union.v)
+static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }};
// Operators
@@ -246,7 +248,7 @@ static const union {
// #define k8or(x,y) (_mm_or_pd(x,y))
// #define k8xor(x,y) (_mm_xor_pd(x,y))
-#define k8neg(x) (_mm_xor_pd(k8sign_mask,x))
+#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x))
#define k8add(x,y) (_mm_add_pd(x,y))
#define k8sub(x,y) (_mm_sub_pd(x,y))
@@ -267,10 +269,24 @@ static const union {
#endif
// Cheap functions
-#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x))
+#define k8copysign(x,y) \
+ (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \
+ _mm_and_pd(k8sign_mask.vf,y)))
+#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x))
#define k8fmax(x,y) (_mm_max_pd(x,y))
#define k8fmin(x,y) (_mm_min_pd(x,y))
-#define k8fnabs(x) (_mm_or_pd(k8sign_mask,x))
+#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x))
+static const k8const_t k8zero = { f: { 0.0, 0.0, }};
+static const k8const_t k8one = { f: { 1.0, 1.0, }};
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC const x__=(x_); \
+ CCTK_REAL_VEC const x=x__; \
+ CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(k8zero.vf, x); \
+ CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \
+ CCTK_REAL_VEC const signedone = _mm_or_pd(k8one.vf, sign); \
+ k8ifthen(iszero, k8zero.vf, signedone); \
+ })
#define k8sqrt(x) (_mm_sqrt_pd(x))
// Expensive functions
@@ -317,12 +333,18 @@ static const union {
#define k8tan(x) K8REPL(tan,x)
#define k8tanh(x) K8REPL(tanh,x)
-// Choice [sign(x)>0 ? y : z]
+static const k8const_t k8lfalse = {{ +0LL, +0LL, }};
+static const k8const_t k8ltrue = {{ -1LL, -1LL, }};
+#define k8lnot(x) (_mm_xor_pd(k8ltrue,x))
+#define k8land(x,y) (_mm_and_pd(x,y))
+#define k8lor(x,y) (_mm_or_pd(x,y))
+#define k8lxor(x,y) (_mm_xor_pd(x,y))
+
#ifdef __SSE4_1__
-# define k8ifmsb(x,y,z) (_mm_blendv_pd(z,y,x))
+# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x))
#elif 0
-// This is slow
-# define k8ifmsb(x_,y_,z_) \
+// This is slow (but this is what Intel/PGI produce by themselves)
+# define k8ifthen(x_,y_,z_) \
({ \
CCTK_REAL8_VEC const x__=(x_); \
CCTK_REAL8_VEC const y__=(y_); \
@@ -342,11 +364,26 @@ static const union {
})
#elif 0
# ifdef __cplusplus
-# define k8sgn(x) ({ using namespace std; signbit(x); })
+# define k8signbit(x) ({ using namespace std; signbit(x); })
# else
-# define k4sgn(x) (signbit(x))
+# define k8signbit(x) (signbit(x))
# endif
-# define k8ifmsb(x_,y_,z_) \
+# define k8ifthen(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const z__=(z_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ CCTK_REAL8_VEC const z=z__; \
+ vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \
+ k8signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \
+ })
+#elif 0
+// We don't need to shift -- the condition (mask) will be either all
+// zeros or all ones
+static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }};
+# define k8ifthen(x_,y_,z_) \
({ \
CCTK_REAL8_VEC const x__=(x_); \
CCTK_REAL8_VEC const y__=(y_); \
@@ -354,16 +391,16 @@ static const union {
CCTK_REAL8_VEC const x=x__; \
CCTK_REAL8_VEC const y=y__; \
CCTK_REAL8_VEC const z=z__; \
- vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \
- k8sgn(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \
+ /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \
+ __m128i const x_int = *(__m128i const*)&x; \
+ __m128i const imask_int = \
+ _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \
+ CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \
+ /* (z & ~mask) | (y & mask) where imask = ~mask */ \
+ _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \
})
#else
-static const union {
- unsigned long long i;
- double d;
-} k8one_union = { 0x1ULL };
-# define k8one (k8one_union.d)
-# define k8ifmsb(x_,y_,z_) \
+# define k8ifthen(x_,y_,z_) \
({ \
CCTK_REAL8_VEC const x__=(x_); \
CCTK_REAL8_VEC const y__=(y_); \
@@ -371,11 +408,7 @@ static const union {
CCTK_REAL8_VEC const x=x__; \
CCTK_REAL8_VEC const y=y__; \
CCTK_REAL8_VEC const z=z__; \
- /* there is no _mm_srai_epi64(x, 63) */ \
- CCTK_REAL8_VEC const imask = \
- (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \
- (__m128i)_mm_set1_pd(k8one)); \
- /* (z & ~mask) | (y & mask); imask = ~mask */ \
- _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \
+ /* (z & ~mask) | (y & mask) where imask = ~mask */ \
+ _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \
})
#endif
diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h
index 8007bb2..07f19d6 100644
--- a/src/vectors-8-VSX.h
+++ b/src/vectors-8-VSX.h
@@ -129,5 +129,13 @@
#define k8tan(x) K8REPL(tan,x)
#define k8tanh(x) K8REPL(tanh,x)
-#define k8ifmsb(x,y,z) \
- (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63)))
+/* #define k8ifmsb(x,y,z) \ */
+/* (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63))) */
+
+#define k8lfalse (vec_splats(+0LL))
+#define k8ltrue (vec_splats(-1LL))
+#define k8lnot(x) (vec_xor(x,k8ltrue))
+#define k8land(x,y) (vec_and(x,y))
+#define k8lor(x,y) (vec_or(x,y))
+#define k8lxor(x,y) (vec_xor(x,y))
+#define k8ifthen(x,y,z) (vec_sel(z,y,x))
diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h
index 94d9ed4..e9e2734 100644
--- a/src/vectors-8-default.h
+++ b/src/vectors-8-default.h
@@ -76,32 +76,40 @@
#define k8nmsub(x,y,z) (-(x)*(y)+(z))
// Functions
-#define k8acos(x) (acos(x))
-#define k8acosh(x) (acosh(x))
-#define k8asin(x) (asin(x))
-#define k8asinh(x) (asinh(x))
-#define k8atan(x) (atan(x))
-#define k8atan2(x,y) (atan2(x,y))
-#define k8atanh(x) (atanh(x))
-#define k8cos(x) (cos(x))
-#define k8cosh(x) (cosh(x))
-#define k8exp(x) (exp(x))
-#define k8fabs(x) (fabs(x))
-#define k8fmax(x,y) (fmax(x,y))
-#define k8fmin(x,y) (fmin(x,y))
-#define k8fnabs(x) (-fabs(x))
-#define k8log(x) (log(x))
-#define k8pow(x,a) (pow(x,a))
-#define k8sin(x) (sin(x))
-#define k8sinh(x) (sinh(x))
-#define k8sqrt(x) (sqrt(x))
-#define k8tan(x) (tan(x))
-#define k8tanh(x) (tanh(x))
+#define k8acos(x) (acos(x))
+#define k8acosh(x) (acosh(x))
+#define k8asin(x) (asin(x))
+#define k8asinh(x) (asinh(x))
+#define k8atan(x) (atan(x))
+#define k8atan2(x,y) (atan2(x,y))
+#define k8atanh(x) (atanh(x))
+#define k8copysign(x,y) (copysign(x,y))
+#define k8cos(x) (cos(x))
+#define k8cosh(x) (cosh(x))
+#define k8exp(x) (exp(x))
+#define k8fabs(x) (fabs(x))
+#define k8fmax(x,y) (fmax(x,y))
+#define k8fmin(x,y) (fmin(x,y))
+#define k8fnabs(x) (-fabs(x))
+#define k8log(x) (log(x))
+#define k8pow(x,a) (pow(x,a))
+#define k8sin(x) (sin(x))
+#define k8sinh(x) (sinh(x))
+#define k8sqrt(x) (sqrt(x))
+#define k8tan(x) (tan(x))
+#define k8tanh(x) (tanh(x))
+
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL x__=(x_); \
+ CCTK_REAL x=x__; \
+ x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : copysign((CCTK_REAL)1.0, x); \
+ })
#ifdef __cplusplus
-# define k8sgn(x) ({ using namespace std; signbit(x); })
+# define k8signbit(x) ({ using namespace std; signbit(x); })
#else
-# define k8sgn(x) (signbit(x))
+# define k8signbit(x) (signbit(x))
#endif
-#define k8ifmsb(x,y,z) (k8sgn(x)?(y):(z))
+#define k8ifthen(x,y,z) (k8signbit(x)?(y):(z))
diff --git a/src/vectors.h b/src/vectors.h
index 4443212..b1bce6f 100644
--- a/src/vectors.h
+++ b/src/vectors.h
@@ -22,10 +22,12 @@
# else
# include "vectors-8-SSE2.h"
# endif
-# elif defined(_ARCH_450D) // Blue Gene/P Double Hummer
-# include "vectors-8-DoubleHummer.h"
+# elif defined(_ARCH_QP) // Blue Gene/Q QPX
+# include "vectors-8-QPX.h"
# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Power VSX
# include "vectors-8-VSX.h"
+# elif defined(_ARCH_450D) // Blue Gene/P Double Hummer
+# include "vectors-8-DoubleHummer.h"
# endif
#endif
@@ -67,7 +69,6 @@
# define vec_store_nta_partial_hi vec4_store_nta_partial_hi
# define vec_store_nta_partial_mid vec4_store_nta_partial_mid
-# define kpos k4pos
# define kneg k4neg
# define kadd k4add
@@ -80,29 +81,31 @@
# define knmadd k4nmadd
# define knmsub k4nmsub
-# define kacos k4acos
-# define kacosh k4acosh
-# define kasin k4asin
-# define kasinh k4asinh
-# define katan k4atan
-# define katan2 k4atan2
-# define katanh k4atanh
-# define kcos k4cos
-# define kcosh k4cosh
-# define kexp k4exp
-# define kfabs k4fabs
-# define kfmax k4fmax
-# define kfmin k4fmin
-# define kfnabs k4fnabs
-# define klog k4log
-# define kpow k4pow
-# define ksin k4sin
-# define ksinh k4sinh
-# define ksqrt k4sqrt
-# define ktan k4tan
-# define ktanh k4tanh
-
-# define kifmsb k4ifmsb
+# define kacos k4acos
+# define kacosh k4acosh
+# define kasin k4asin
+# define kasinh k4asinh
+# define katan k4atan
+# define katan2 k4atan2
+# define katanh k4atanh
+# define kcopysign k4copysign
+# define kcos k4cos
+# define kcosh k4cosh
+# define kexp k4exp
+# define kfabs k4fabs
+# define kfmax k4fmax
+# define kfmin k4fmin
+# define kfnabs k4fnabs
+# define klog k4log
+# define kpow k4pow
+# define ksin k4sin
+# define ksinh k4sinh
+# define ksgn k4sgn
+# define ksqrt k4sqrt
+# define ktan k4tan
+# define ktanh k4tanh
+
+# define kifthen k4ifthen
#elif defined(CCTK_REAL_PRECISION_8)
@@ -141,29 +144,31 @@
# define knmadd k8nmadd
# define knmsub k8nmsub
-# define kacos k8acos
-# define kacosh k8acosh
-# define kasin k8asin
-# define kasinh k8asinh
-# define katan k8atan
-# define katan2 k8atan2
-# define katanh k8atanh
-# define kcos k8cos
-# define kcosh k8cosh
-# define kexp k8exp
-# define kfabs k8fabs
-# define kfmax k8fmax
-# define kfmin k8fmin
-# define kfnabs k8fnabs
-# define klog k8log
-# define kpow k8pow
-# define ksin k8sin
-# define ksinh k8sinh
-# define ksqrt k8sqrt
-# define ktan k8tan
-# define ktanh k8tanh
-
-# define kifmsb k8ifmsb
+# define kacos k8acos
+# define kacosh k8acosh
+# define kasin k8asin
+# define kasinh k8asinh
+# define katan k8atan
+# define katan2 k8atan2
+# define katanh k8atanh
+# define kcopysign k8copysign
+# define kcos k8cos
+# define kcosh k8cosh
+# define kexp k8exp
+# define kfabs k8fabs
+# define kfmax k8fmax
+# define kfmin k8fmin
+# define kfnabs k8fnabs
+# define klog k8log
+# define kpow k8pow
+# define ksin k8sin
+# define ksinh k8sinh
+# define ksgn k8sgn
+# define ksqrt k8sqrt
+# define ktan k8tan
+# define ktanh k8tanh
+
+# define kifthen k8ifthen
#else
@@ -173,6 +178,7 @@
+#define kifmsb(a,b,c) kifthen(a,b,c)
#define kifneg(a,b,c) kifmsb(a,b,c)
#define kifpos(a,b,c) kifmsb(a,c,b)
@@ -218,10 +224,6 @@ struct vecprops {
{
return x;
}
- static inline vector_t pos (vector_t const& x)
- {
- return +x;
- }
static inline vector_t neg (vector_t const& x)
{
return -x;
@@ -264,10 +266,6 @@ struct vecprops<CCTK_REAL4> {
{
return vec4_elt(x,d);
}
- static inline vector_t pos (vector_t const& x)
- {
- return k4pos(x);
- }
static inline vector_t neg (vector_t const& x)
{
return k4neg(x);
@@ -345,11 +343,8 @@ struct vecprops<CCTK_REAL8> {
# define KRANC_DIFF_FUNCTIONS
# endif
-# undef Sign
-# define Sign(x) -999999999 // poison
-
# undef ToReal
-# define ToReal(x) (vec_set1((CCTK_REAL)(x)))
+# define ToReal(x) (vec_set1(CCTK_REAL(x)))
# undef KRANC_GFOFFSET3D
# define KRANC_GFOFFSET3D(var,i,j,k) \