From dc69911dd15fa1fa24c51ca222fc7883d3fc5cff Mon Sep 17 00:00:00 2001 From: eschnett Date: Fri, 19 Jul 2013 17:48:51 +0000 Subject: Do not use type punning any more Do not cast between different pointer types. This is illegal in C/C++, and modern compilers (such as gcc 4.8) then generate wrong code. Instead, use memcpy to re-interpret the bit patterns of values with a different type. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@85 105869f7-3296-0410-a4ea-f4349344b45a --- src/test.cc | 25 ++-- src/vectors-4-AVX.h | 263 ++++++++++++++++++++++++++++--------------- src/vectors-4-Altivec.h | 1 + src/vectors-4-SSE.h | 263 +++++++++++++++++++++++++++---------------- src/vectors-4-default.h | 43 +++---- src/vectors-8-AVX.h | 218 +++++++++++++++++++++++++---------- src/vectors-8-DoubleHummer.h | 1 + src/vectors-8-MIC.h | 133 ++++++++++------------ src/vectors-8-QPX.h | 206 ++++++++++++++++++++------------- src/vectors-8-SSE2.h | 196 +++++++++++++++++++++++++------- src/vectors-8-default.h | 49 ++++---- src/vectors.h | 18 +-- 12 files changed, 908 insertions(+), 508 deletions(-) diff --git a/src/test.cc b/src/test.cc index 4a3f6d1..a674705 100644 --- a/src/test.cc +++ b/src/test.cc @@ -30,8 +30,7 @@ inline CCTK_REAL my_sgn(CCTK_REAL const x) CCTK_REAL const res = (scalarexpr); \ CCTK_REAL const vecres = (vecexpr); \ CCTK_REAL const eps = numeric_limits::epsilon(); \ - assert(abs((CCTK_REAL)0.1) > 0); \ - if ((abs(vecres - res) <= 10*eps) or \ + if ((fabs(vecres - res) <= 10*eps) or \ (isnan(vecres) and isnan(res))) \ { \ passed++; \ @@ -54,8 +53,7 @@ inline CCTK_REAL my_sgn(CCTK_REAL const x) CCTK_REAL res = (scalarexpr); \ CCTK_REAL vecres = vec_elt(rv,i); \ CCTK_REAL eps = numeric_limits::epsilon(); \ - assert(abs((CCTK_REAL)0.1) > 0); \ - if ((abs(vecres - res) <= 10*eps) or \ + if ((fabs(vecres - res) <= 10*eps) or \ (isnan(vecres) and isnan(res))) \ { \ passed++; \ @@ -78,14 +76,17 @@ inline CCTK_REAL my_sgn(CCTK_REAL const x) CCTK_BOOLEAN_VEC rv = (vecexpr); \ for (int i=0; i +#include @@ -25,9 +27,13 @@ // Vector type corresponding to CCTK_REAL +// Note: some boolean masks (e.g. ~0) correspond to nan when +// interpreted as floating point number. gcc 4.8 is clever enough to +// optimize away such constants with fast-math. We therefore need to +// handle this constant as integer number. typedef __m256 CCTK_REAL4_VEC; typedef __m256i CCTK_INTEGER4_VEC; -typedef __m256 CCTK_BOOLEAN4_VEC; +typedef __m256i CCTK_BOOLEAN4_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL4_VEC_SIZE 8 @@ -36,16 +42,21 @@ vec_static_assert(sizeof(CCTK_REAL4_VEC) == sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE); // Integer and boolean types corresponding to this real type -typedef CCTK_INT4 CCTK_INTEGER4; -typedef CCTK_REAL4 CCTK_BOOLEAN4; +typedef CCTK_INT4 CCTK_INTEGER4; +typedef CCTK_INT4 CCTK_BOOLEAN4; + + + +// These macros are undefined at the end of this file -- use them only +// within functions, not within macros that are exported +#define I2R(x) _mm256_castsi256_ps(x) +#define R2I(x) _mm256_castps_si256(x) union k4const_t { CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE]; - CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE]; CCTK_INTEGER4_VEC vi; - CCTK_REAL4_VEC vf; }; #define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL))) @@ -61,9 +72,9 @@ CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a) return _mm256_set1_ps(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4_VEC vec4_set1i(CCTK_INT4 const a) +CCTK_INTEGER4_VEC vec4_set1i(CCTK_INT4 const a) { - return _mm256_castsi256_ps(_mm256_set1_epi32(a)); + return _mm256_set1_epi32(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a, @@ -79,49 +90,27 @@ CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a, } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x) -{ - return ((CCTK_REAL4 const*)&x)[0]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x) -{ - return ((CCTK_REAL4 const*)&x)[1]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x) -{ - return ((CCTK_REAL4 const*)&x)[2]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x) -{ - return ((CCTK_REAL4 const*)&x)[3]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt4(CCTK_REAL4_VEC const x) -{ - return ((CCTK_REAL4 const*)&x)[4]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt5(CCTK_REAL4_VEC const x) -{ - return ((CCTK_REAL4 const*)&x)[5]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt6(CCTK_REAL4_VEC const x) +CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL4 const*)&x)[6]; + CCTK_REAL4 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } + static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt7(CCTK_REAL4_VEC const x) +CCTK_INTEGER4 vec4_elti(CCTK_INTEGER4_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL4 const*)&x)[7]; + CCTK_INTEGER4 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } + static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN4 vec4_eltb(CCTK_BOOLEAN4_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL4 const*)&x)[d]; + CCTK_BOOLEAN4 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } @@ -300,14 +289,14 @@ void vec4_store_nta_partial_lo(CCTK_REAL4& p, CCTK_REAL4_VEC const x, ptrdiff_t const n) { - _mm256_maskstore_ps(&p, _mm256_castsi256_ps(k4store_lo[n].vi), x); + _mm256_maskstore_ps(&p, I2R(k4store_lo[n].vi), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_store_nta_partial_hi(CCTK_REAL4& p, CCTK_REAL4_VEC const x, ptrdiff_t const n) { - _mm256_maskstore_ps(&p, _mm256_castsi256_ps(k4store_hi[n].vi), x); + _mm256_maskstore_ps(&p, I2R(k4store_hi[n].vi), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec4_store_nta_partial_mid(CCTK_REAL4& p, @@ -315,10 +304,7 @@ void vec4_store_nta_partial_mid(CCTK_REAL4& p, ptrdiff_t const nlo, ptrdiff_t const nhi) { - _mm256_maskstore_ps - (&p, - _mm256_castsi256_ps(k4store_lo[nlo].vi & k4store_hi[nhi].vi), - x); + _mm256_maskstore_ps(&p, I2R(k4store_lo[nlo].vi & k4store_hi[nhi].vi), x); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -343,7 +329,7 @@ void vec4_store_nta_partial_mid(CCTK_REAL4& p, { _mm256_maskstore_ps (&p, - _mm256_castps_si256(_mm256_and_ps(k4store_lo[nlo].vf, k4store_hi[nhi].vf)), + R2I(_mm256_and_ps(I2R(k4store_lo[nlo].vi), I2R(k4store_hi[nhi].vi))), x); } #endif @@ -356,7 +342,7 @@ void vec4_store_nta_partial_mid(CCTK_REAL4& p, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4neg(CCTK_REAL4_VEC const x) { - return _mm256_xor_ps(x, k4sign); + return _mm256_xor_ps(x, I2R(k4sign)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -445,13 +431,13 @@ CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4copysign(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_or_ps(_mm256_and_ps(k4notsign, x), - _mm256_and_ps(k4sign , y)); + return _mm256_or_ps(_mm256_and_ps(I2R(k4notsign), x), + _mm256_and_ps(I2R(k4sign ), y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fabs(CCTK_REAL4_VEC const x) { - return _mm256_and_ps(k4notsign, x); + return _mm256_and_ps(I2R(k4notsign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fmax(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) @@ -466,7 +452,7 @@ CCTK_REAL4_VEC k4fmin(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fnabs(CCTK_REAL4_VEC const x) { - return _mm256_or_ps(x, k4sign); + return _mm256_or_ps(x, I2R(k4sign)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x) @@ -474,34 +460,122 @@ CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x) return _mm256_sqrt_ps(x); } + + // Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x) +{ + return _mm256_acos_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x) +{ + return _mm256_acosh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x) +{ + return _mm256_asin_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x) +{ + return _mm256_asinh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x) +{ + return _mm256_atan_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm256_atan2_ps(x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x) +{ + return _mm256_atanh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x) +{ + return _mm256_cos_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x) +{ + return _mm256_cosh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x) +{ + return _mm256_exp_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x) +{ + return _mm256_log_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a) +{ + return _mm256_pow_ps(x, _mm256_set1_ps(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x) +{ + return _mm256_sin_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x) +{ + return _mm256_sinh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x) +{ + return _mm256_tan_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) +{ + return _mm256_tanh_ps(x); +} + +#else + #define K4REPL(f,x) \ - vec4_set(f(vec4_elt0(x)), \ - f(vec4_elt1(x)), \ - f(vec4_elt2(x)), \ - f(vec4_elt3(x)), \ - f(vec4_elt4(x)), \ - f(vec4_elt5(x)), \ - f(vec4_elt6(x)), \ - f(vec4_elt7(x))); + vec4_set(f(vec4_elt(x,0)), \ + f(vec4_elt(x,1)), \ + f(vec4_elt(x,2)), \ + f(vec4_elt(x,3)), \ + f(vec4_elt(x,4)), \ + f(vec4_elt(x,5)), \ + f(vec4_elt(x,6)), \ + f(vec4_elt(x,7))); #define K4REPL2S(f,x,a) \ - vec4_set(f(vec4_elt0(x),a), \ - f(vec4_elt1(x),a), \ - f(vec4_elt2(x),a), \ - f(vec4_elt3(x),a), \ - f(vec4_elt4(x),a), \ - f(vec4_elt5(x),a), \ - f(vec4_elt6(x),a), \ - f(vec4_elt7(x),a)); + vec4_set(f(vec4_elt(x,0),a), \ + f(vec4_elt(x,1),a), \ + f(vec4_elt(x,2),a), \ + f(vec4_elt(x,3),a), \ + f(vec4_elt(x,4),a), \ + f(vec4_elt(x,5),a), \ + f(vec4_elt(x,6),a), \ + f(vec4_elt(x,7),a)); #define K4REPL2(f,x,y) \ - vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \ - f(vec4_elt1(x),vec4_elt1(y)), \ - f(vec4_elt2(x),vec4_elt2(y)), \ - f(vec4_elt3(x),vec4_elt3(y)), \ - f(vec4_elt4(x),vec4_elt4(y)), \ - f(vec4_elt5(x),vec4_elt5(y)), \ - f(vec4_elt6(x),vec4_elt6(y)), \ - f(vec4_elt7(x),vec4_elt7(y))); + vec4_set(f(vec4_elt(x,0),vec4_elt(y,0)), \ + f(vec4_elt(x,1),vec4_elt(y,1)), \ + f(vec4_elt(x,2),vec4_elt(y,2)), \ + f(vec4_elt(x,3),vec4_elt(y,3)), \ + f(vec4_elt(x,4),vec4_elt(y,4)), \ + f(vec4_elt(x,5),vec4_elt(y,5)), \ + f(vec4_elt(x,6),vec4_elt(y,6)), \ + f(vec4_elt(x,7),vec4_elt(y,7))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x) @@ -584,6 +658,8 @@ CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) return K4REPL(tanh,x); } +#endif + #define k4lfalse (vec4_set1i( 0)) @@ -591,60 +667,60 @@ CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x) { - return _mm256_xor_ps(k4ltrue, x); + return R2I(_mm256_xor_ps(I2R(k4ltrue), I2R(x))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { - return _mm256_and_ps(x, y); + return R2I(_mm256_and_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { - return _mm256_or_ps(x, y); + return R2I(_mm256_or_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { - return _mm256_xor_ps(x, y); + return R2I(_mm256_xor_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x, CCTK_REAL4_VEC const y, CCTK_REAL4_VEC const z) { - return _mm256_blendv_ps(z, y, x); + return _mm256_blendv_ps(z, y, I2R(x)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); + return R2I(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ); + return R2I(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_cmp_ps(x, y, _CMP_GT_OQ); + return R2I(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_cmp_ps(x, y, _CMP_GE_OQ); + return R2I(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_cmp_ps(x, y, _CMP_LT_OQ); + return R2I(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm256_cmp_ps(x, y, _CMP_LE_OQ); + return R2I(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); } @@ -653,7 +729,12 @@ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x) { CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0)); - CCTK_REAL4_VEC const sign = _mm256_and_ps(k4sign, x); + CCTK_REAL4_VEC const sign = _mm256_and_ps(I2R(k4sign), x); CCTK_REAL4_VEC const signedone = _mm256_or_ps(sign, vec4_set1(1.0)); return k4ifthen(iszero, vec4_set1(0.0), signedone); } + + + +#undef I2R +#undef R2I diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h index 3975c77..ca235ee 100644 --- a/src/vectors-4-Altivec.h +++ b/src/vectors-4-Altivec.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using IBM's Altivec (Power) // Use the type vector double directly, without introducing a wrapper class diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h index 7d0d9c3..bdbc10d 100644 --- a/src/vectors-4-SSE.h +++ b/src/vectors-4-SSE.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using Intel's or AMD's SSE // Use the type __m128 directly, without introducing a wrapper class @@ -12,6 +13,7 @@ #include #include #include +#include @@ -50,9 +52,13 @@ // Vector type corresponding to CCTK_REAL +// Note: some boolean masks (e.g. ~0) correspond to nan when +// interpreted as floating point number. gcc 4.8 is clever enough to +// optimize away such constants with fast-math. We therefore need to +// handle this constant as integer number. typedef __m128 CCTK_REAL4_VEC; typedef __m128i CCTK_INTEGER4_VEC; -typedef __m128 CCTK_BOOLEAN4_VEC; +typedef __m128i CCTK_BOOLEAN4_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL4_VEC_SIZE 4 @@ -66,12 +72,12 @@ typedef CCTK_REAL4 CCTK_BOOLEAN4; -union k4const_t { - CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE]; - CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE]; - CCTK_INTEGER4_VEC vi; - CCTK_REAL4_VEC vf; -}; +// These macros are undefined at the end of this file -- use them only +// within functions, not within macros that are exported +#define I2R(x) _mm_castsi128_ps(x) +#define R2I(x) _mm_castps_si128(x) + + #define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL))) #define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL))) @@ -86,9 +92,9 @@ CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a) return _mm_set1_ps(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4_VEC vec4_set1i(CCTK_INT4 const a) +CCTK_INTEGER4_VEC vec4_set1i(CCTK_INT4 const a) { - return _mm_castsi128_ps(_mm_set1_epi32(a)); + return _mm_set1_epi32(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a, @@ -116,54 +122,26 @@ CCTK_REAL4_VEC vec4_swap3210(CCTK_REAL4_VEC const x) return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0,1,2,3)); } -#if defined __PGI -// _mm_cvtss_f32 does not exist on PGI compilers -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 _mm_cvtss_f32(CCTK_REAL4_VEC const x) -{ - CCTK_REAL4 a; - asm ("" : "=x" (a) : "0" (x)); - return a; -} -#endif - -// TODO: Why not ((CCTK_REAL4 const*)&x)[d] ? -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x) -{ - return _mm_cvtss_f32(x); // this is a no-op -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x) -{ - return vec4_elt0(vec4_swap1032(x)); -} static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x) +CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) { - return vec4_elt0(vec4_swap2301(x)); + CCTK_REAL4 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x) +CCTK_INTEGER4 vec4_elti(CCTK_INTEGER4_VEC const x, std::ptrdiff_t const d) { - return vec4_elt0(vec4_swap3210(x)); + CCTK_INTEGER4 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN4 vec4_eltb(CCTK_BOOLEAN4_VEC const x, std::ptrdiff_t const d) { -#if defined __PGI - if (d==0) return vec4_elt0(x); - if (d==1) return vec4_elt1(x); - if (d==2) return vec4_elt2(x); - return vec4_elt3(x); -#else - switch (d) { - case 0: return vec4_elt0(x); - case 1: return vec4_elt1(x); - case 2: return vec4_elt2(x); - } - return vec4_elt3(x); -#endif + CCTK_BOOLEAN4 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } @@ -331,16 +309,16 @@ void vec4_store_nta_partial_(std::ptrdiff_t const lo_skip, // these cases fall through switch (lo_skip) { case 0: - (&p)[0] = vec4_elt0(x); + (&p)[0] = vec4_elt(x, 0); case 1: if (hi_skip>=3) break; - (&p)[1] = vec4_elt1(x); + (&p)[1] = vec4_elt(x, 1); case 2: if (hi_skip>=2) break; - (&p)[2] = vec4_elt2(x); + (&p)[2] = vec4_elt(x, 2); case 3: if (hi_skip>=1) break; - (&p)[3] = vec4_elt3(x); + (&p)[3] = vec4_elt(x, 3); } } } @@ -352,9 +330,9 @@ void vec4_store_nta_partial_lo(CCTK_REAL4& p, { // these cases fall through switch (n) { - case 3: (&p)[2] = vec4_elt2(x); - case 2: (&p)[1] = vec4_elt1(x); - case 1: (&p)[0] = vec4_elt0(x); + case 3: (&p)[2] = vec4_elt(x, 2); + case 2: (&p)[1] = vec4_elt(x, 1); + case 1: (&p)[0] = vec4_elt(x, 0); } } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -364,9 +342,9 @@ void vec4_store_nta_partial_hi(CCTK_REAL4& p, { // these cases fall through switch (n) { - case 3: (&p)[1]=vec4_elt1(x); - case 2: (&p)[2]=vec4_elt2(x); - case 1: (&p)[3]=vec4_elt3(x); + case 3: (&p)[1]=vec4_elt(x, 1); + case 2: (&p)[2]=vec4_elt(x, 2); + case 1: (&p)[3]=vec4_elt(x, 3); } } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -379,10 +357,10 @@ void vec4_store_nta_partial_hi(CCTK_REAL4& p, switch (nhi) { case 3: if (nlo<2) break; - (&p)[1] = vec4_elt1(x); + (&p)[1] = vec4_elt(x, 1); case 2: if (nlo<3) break; - (&p)[2] = vec4_elt2(x); + (&p)[2] = vec4_elt(x, 2); } } @@ -394,7 +372,7 @@ void vec4_store_nta_partial_hi(CCTK_REAL4& p, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4neg(CCTK_REAL4_VEC const x) { - return _mm_xor_ps(k4sign, x); + return _mm_xor_ps(I2R(k4sign), x); } // #define k4inv(x) // TODO: provide k4inv via rcp and Newton-Raphson @@ -488,13 +466,13 @@ CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4copysign(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_or_ps(_mm_and_ps(k4notsign, x), - _mm_and_ps(k4sign , y)); + return _mm_or_ps(_mm_and_ps(I2R(k4notsign), x), + _mm_and_ps(I2R(k4sign ), y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fabs(CCTK_REAL4_VEC const x) { - return _mm_and_ps(k4notsign, x); + return _mm_and_ps(I2R(k4notsign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fmax(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) @@ -509,7 +487,7 @@ CCTK_REAL4_VEC k4fmin(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4fnabs(CCTK_REAL4_VEC const x) { - return _mm_or_ps(k4sign, x); + return _mm_or_ps(I2R(k4sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x) @@ -518,22 +496,110 @@ CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x) return _mm_sqrt_ps(x); } + + // Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x) +{ + return _mm_acos_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x) +{ + return _mm_acosh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x) +{ + return _mm_asin_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x) +{ + return _mm_asinh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x) +{ + return _mm_atan_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) +{ + return _mm_atan2_ps(x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x) +{ + return _mm_atanh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x) +{ + return _mm_cos_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x) +{ + return _mm_cosh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x) +{ + return _mm_exp_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x) +{ + return _mm_log_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a) +{ + return _mm_pow_ps(x, _mm_set1_ps(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x) +{ + return _mm_sin_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x) +{ + return _mm_sinh_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x) +{ + return _mm_tan_ps(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) +{ + return _mm_tanh_ps(x); +} + +#else + #define K4REPL(f,x) \ - vec4_set(f(vec4_elt0(x)), \ - f(vec4_elt1(x)), \ - f(vec4_elt2(x)), \ - f(vec4_elt3(x))); + vec4_set(f(vec4_elt(x,0)), \ + f(vec4_elt(x,1)), \ + f(vec4_elt(x,2)), \ + f(vec4_elt(x,3))); #define K4REPL2S(f,x,a) \ - vec4_set(f(vec4_elt0(x),a), \ - f(vec4_elt1(x),a), \ - f(vec4_elt2(x),a), \ - f(vec4_elt3(x),a)); + vec4_set(f(vec4_elt(x,0),a), \ + f(vec4_elt(x,1),a), \ + f(vec4_elt(x,2),a), \ + f(vec4_elt(x,3),a)); #define K4REPL2(f,x,y) \ - vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \ - f(vec4_elt1(x),vec4_elt1(y)), \ - f(vec4_elt2(x),vec4_elt2(y)), \ - f(vec4_elt3(x),vec4_elt3(y))); + vec4_set(f(vec4_elt(x,0),vec4_elt(y,0)), \ + f(vec4_elt(x,1),vec4_elt(y,1)), \ + f(vec4_elt(x,2),vec4_elt(y,2)), \ + f(vec4_elt(x,3),vec4_elt(y,3))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x) @@ -616,6 +682,8 @@ CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) return K4REPL(tanh,x); } +#endif + #define k4lfalse (vec4_set1i( 0)) @@ -623,22 +691,22 @@ CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x) { - return _mm_xor_ps(k4ltrue, x); + return R2I(_mm_xor_ps(I2R(k4ltrue), I2R(x))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { - return _mm_and_ps(x, y); + return R2I(_mm_and_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { - return _mm_or_ps(x, y); + return R2I(_mm_or_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y) { - return _mm_xor_ps(x, y); + return R2I(_mm_xor_ps(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x, @@ -646,12 +714,12 @@ CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x, CCTK_REAL4_VEC const z) { #ifdef __SSE4_1__ - return _mm_blendv_ps(z,y,x); + return _mm_blendv_ps(z,y,I2R(x)); #elif 0 - return vec4_set(std::signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), - std::signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), - std::signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), - std::signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); + return vec4_set(std::signbit(vec4_elt(x,0)) ? vec4_elt(y,0) : vec4_elt(z,0), + std::signbit(vec4_elt(x,1)) ? vec4_elt(y,1) : vec4_elt(z,1), + std::signbit(vec4_elt(x,2)) ? vec4_elt(y,2) : vec4_elt(z,2), + std::signbit(vec4_elt(x,3)) ? vec4_elt(y,3) : vec4_elt(z,3)); #elif 0 // We don't need to shift -- the condition (mask) will be either all // zeros or all ones @@ -662,39 +730,39 @@ CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x, // This assumes that all logical operations always return either // lfalse or ltrue, and nothing "in between" // (z & ~mask) | (y & mask) where imask = ~mask - return _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); + return _mm_or_ps(_mm_and_ps(I2RI(x), y), _mm_andnot_ps(I2R(x), z)); #endif } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_cmpeq_ps(x, y); + return R2I(_mm_cmpeq_ps(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_cmpneq_ps(x, y); + return R2I(_mm_cmpneq_ps(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_cmpgt_ps(x, y); + return R2I(_mm_cmpgt_ps(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_cmpge_ps(x, y); + return R2I(_mm_cmpge_ps(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_cmplt_ps(x, y); + return R2I(_mm_cmplt_ps(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y) { - return _mm_cmple_ps(x, y); + return R2I(_mm_cmple_ps(x, y)); } @@ -703,9 +771,14 @@ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x) { CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0)); - CCTK_REAL4_VEC const sign = _mm_and_ps(k4sign, x); + CCTK_REAL4_VEC const sign = _mm_and_ps(I2R(k4sign), x); CCTK_REAL4_VEC const signedone = _mm_or_ps(sign, vec4_set1(1.0)); return k4ifthen(iszero, vec4_set1(0.0), signedone); } #endif + + + +#undef I2R +#undef R2I diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h index 28fae04..ec98ebf 100644 --- a/src/vectors-4-default.h +++ b/src/vectors-4-default.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Fallback vectorisation implementation: Do not vectorise // We use macros here, so that we are not surprised by compilers which @@ -23,10 +24,10 @@ vec_static_assert(sizeof(CCTK_REAL4_VEC) == sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE); // Integer and boolean types corresponding to this real type -#define CCTK_INTEGER4 CCTK_REAL4 -#define CCTK_BOOLEAN4 CCTK_REAL4 -#define CCTK_INTEGER4_VEC CCTK_REAL4_VEC -#define CCTK_BOOLEAN4_VEC CCTK_REAL4_VEC +#define CCTK_INTEGER4 CCTK_INT4 +#define CCTK_BOOLEAN4 CCTK_INT4 +#define CCTK_INTEGER4_VEC CCTK_INT4 +#define CCTK_BOOLEAN4_VEC CCTK_INT4 @@ -38,6 +39,8 @@ vec_static_assert(sizeof(CCTK_REAL4_VEC) == // Access vectors elements #define vec4_elt0(x) (x) #define vec4_elt(x,d) (x) +#define vec4_elti(x,d) (x) +#define vec4_eltb(x,d) (x) @@ -118,20 +121,18 @@ vec_static_assert(sizeof(CCTK_REAL4_VEC) == }) #define k4signbit(x) (std::signbit(x)) -#define k4l2r(x_) ({ CCTK_INT4 x__=(x_); CCTK_INT4 x=x__; *(CCTK_REAL4*)&x; }) -#define k4r2l(x_) ({ CCTK_REAL4 x__=(x_); CCTK_REAL4 x=x__; *(CCTK_INT4*)&x; }) -#define k4lfalse k4l2r(0) -#define k4ltrue k4l2r(1) -#define k4lnot(x) k4l2r(!k4r2l(x)) -#define k4land(x,y) k4l2r(k4r2l(x) && k4r2l(y)) -#define k4lor(x,y) k4l2r(k4r2l(x) || k4r2l(y)) -#define k4lxor(x,y) k4l2r(!k4r2l(x) != !k4r2l(y)) - -#define k4ifthen(x,y,z) (k4r2l(x)?(y):(z)) - -#define k4cmpeq(x,y) k4l2r((x)==(y)) -#define k4cmpne(x,y) k4l2r((x)!=(y)) -#define k4cmpgt(x,y) k4l2r((x)>(y)) -#define k4cmpge(x,y) k4l2r((x)>=(y)) -#define k4cmplt(x,y) k4l2r((x)<(y)) -#define k4cmple(x,y) k4l2r((x)<=(y)) +#define k4lfalse 0 +#define k4ltrue 1 +#define k4lnot(x) (!(x)) +#define k4land(x,y) ((x) && (y)) +#define k4lor(x,y) ((x) || (y)) +#define k4lxor(x,y) (!(x) != !(y)) + +#define k4ifthen(x,y,z) ((x)?(y):(z)) + +#define k4cmpeq(x,y) ((x)==(y)) +#define k4cmpne(x,y) ((x)!=(y)) +#define k4cmpgt(x,y) ((x)>(y)) +#define k4cmpge(x,y) ((x)>=(y)) +#define k4cmplt(x,y) ((x)<(y)) +#define k4cmple(x,y) ((x)<=(y)) diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index ce43542..7ff04c0 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using Intel's or AMD's AVX // Use the type __m256d directly, without introducing a wrapper class @@ -5,6 +6,7 @@ #include +#include @@ -25,9 +27,13 @@ // Vector type corresponding to CCTK_REAL +// Note: some boolean masks (e.g. ~0) correspond to nan when +// interpreted as floating point number. gcc 4.8 is clever enough to +// optimize away such constants with fast-math. We therefore need to +// handle this constant as integer number. typedef __m256d CCTK_REAL8_VEC; typedef __m256i CCTK_INTEGER8_VEC; -typedef __m256d CCTK_BOOLEAN8_VEC; +typedef __m256i CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 4 @@ -36,16 +42,21 @@ vec_static_assert(sizeof(CCTK_REAL8_VEC) == sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); // Integer and boolean types corresponding to this real type -typedef CCTK_INT8 CCTK_INTEGER8; -typedef CCTK_REAL8 CCTK_BOOLEAN8; +typedef CCTK_INT8 CCTK_INTEGER8; +typedef CCTK_INT8 CCTK_BOOLEAN8; + + + +// These macros are undefined at the end of this file -- use them only +// within functions, not within macros that are exported +#define I2R(x) _mm256_castsi256_pd(x) +#define R2I(x) _mm256_castpd_si256(x) union k8const_t { CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; - CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; CCTK_INTEGER8_VEC vi; - CCTK_REAL8_VEC vf; }; #define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) @@ -61,9 +72,9 @@ CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) return _mm256_set1_pd(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a) +CCTK_INTEGER8_VEC vec8_set1i(CCTK_INT8 const a) { - return _mm256_castsi256_pd(_mm256_set1_epi64x(a)); + return _mm256_set1_epi64x(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, @@ -75,29 +86,25 @@ CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[0]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[1]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x) +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[2]; + CCTK_REAL8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x) +CCTK_INTEGER8 vec8_elti(CCTK_INTEGER8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[3]; + CCTK_INTEGER8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN8 vec8_eltb(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[d]; + CCTK_BOOLEAN8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } @@ -268,14 +275,14 @@ void vec8_store_nta_partial_lo(CCTK_REAL8& p, CCTK_REAL8_VEC const x, ptrdiff_t const n) { - _mm256_maskstore_pd(&p, _mm256_castsi256_pd(k8store_lo[n].vi), x); + _mm256_maskstore_pd(&p, I2R(k8store_lo[n].vi), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta_partial_hi(CCTK_REAL8& p, CCTK_REAL8_VEC const x, ptrdiff_t const n) { - _mm256_maskstore_pd(&p, _mm256_castsi256_pd(k8store_hi[n].vi), x); + _mm256_maskstore_pd(&p, I2R(k8store_hi[n].vi), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta_partial_mid(CCTK_REAL8& p, @@ -283,10 +290,7 @@ void vec8_store_nta_partial_mid(CCTK_REAL8& p, ptrdiff_t const nlo, ptrdiff_t const nhi) { - _mm256_maskstore_pd - (&p, - _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), - x); + _mm256_maskstore_pd(&p, I2R(k8store_lo[nlo].vi & k8store_hi[nhi].vi), x); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -311,7 +315,7 @@ void vec8_store_nta_partial_mid(CCTK_REAL8& p, { _mm256_maskstore_pd (&p, - _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, k8store_hi[nhi].vf)), + R2I(_mm256_and_pd(I2R(k8store_lo[nlo].vi), I2R(k8store_hi[nhi].vi))), x); } #endif @@ -324,7 +328,7 @@ void vec8_store_nta_partial_mid(CCTK_REAL8& p, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x) { - return _mm256_xor_pd(k8sign, x); + return _mm256_xor_pd(I2R(k8sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -413,13 +417,13 @@ CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_or_pd(_mm256_and_pd(k8notsign, x), - _mm256_and_pd(k8sign , y)); + return _mm256_or_pd(_mm256_and_pd(I2R(k8notsign), x), + _mm256_and_pd(I2R(k8sign ), y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x) { - return _mm256_and_pd(k8notsign, x); + return _mm256_and_pd(I2R(k8notsign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) @@ -434,7 +438,7 @@ CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x) { - return _mm256_or_pd(k8sign, x); + return _mm256_or_pd(I2R(k8sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) @@ -442,22 +446,111 @@ CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) return _mm256_sqrt_pd(x); } + + +// Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return _mm256_acos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return _mm256_acosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return _mm256_asin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return _mm256_asinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return _mm256_atan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm256_atan2_pd(x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return _mm256_atanh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return _mm256_cos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return _mm256_cosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return _mm256_exp_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return _mm256_log_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return _mm256_pow_pd(x, _mm256_set1_pd(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return _mm256_sin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return _mm256_sinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return _mm256_tan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return _mm256_tanh_pd(x); +} + +#else + // Expensive functions #define K8REPL(f,x) \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x)), \ - f(vec8_elt2(x)), \ - f(vec8_elt3(x))); + vec8_set(f(vec8_elt(x,0)), \ + f(vec8_elt(x,1)), \ + f(vec8_elt(x,2)), \ + f(vec8_elt(x,3))); #define K8REPL2S(f,x,a) \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a), \ - f(vec8_elt2(x),a), \ - f(vec8_elt3(x),a)); + vec8_set(f(vec8_elt(x,0),a), \ + f(vec8_elt(x,1),a), \ + f(vec8_elt(x,2),a), \ + f(vec8_elt(x,3),a)); #define K8REPL2(f,x,y) \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y)), \ - f(vec8_elt2(x),vec8_elt2(y)), \ - f(vec8_elt3(x),vec8_elt3(y))); + vec8_set(f(vec8_elt(x,0),vec8_elt(y,0)), \ + f(vec8_elt(x,1),vec8_elt(y,1)), \ + f(vec8_elt(x,2),vec8_elt(y,2)), \ + f(vec8_elt(x,3),vec8_elt(y,3))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -540,6 +633,8 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) return K8REPL(tanh,x); } +#endif + #define k8lfalse (vec8_set1i( 0)) @@ -547,60 +642,60 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x) { - return _mm256_xor_pd(k8ltrue, x); + return R2I(_mm256_xor_pd(I2R(k8ltrue), I2R(x))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm256_and_pd(x, y); + return R2I(_mm256_and_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm256_or_pd(x, y); + return R2I(_mm256_or_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm256_xor_pd(x, y); + return R2I(_mm256_xor_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, CCTK_REAL8_VEC const y, CCTK_REAL8_VEC const z) { - return _mm256_blendv_pd(z, y, x); + return _mm256_blendv_pd(z, y, I2R(x)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_cmp_pd(x, y, _CMP_EQ_OQ); + return R2I(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_cmp_pd(x, y, _CMP_NEQ_OQ); + return R2I(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_cmp_pd(x, y, _CMP_GT_OQ); + return R2I(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_cmp_pd(x, y, _CMP_GE_OQ); + return R2I(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_cmp_pd(x, y, _CMP_LT_OQ); + return R2I(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm256_cmp_pd(x, y, _CMP_LE_OQ); + return R2I(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } @@ -609,7 +704,12 @@ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) { CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0)); - CCTK_REAL8_VEC const sign = _mm256_and_pd(k8sign, x); + CCTK_REAL8_VEC const sign = _mm256_and_pd(I2R(k8sign), x); CCTK_REAL8_VEC const signedone = _mm256_or_pd(sign, vec8_set1(1.0)); return k8ifthen(iszero, vec8_set1(0.0), signedone); } + + + +#undef I2R +#undef R2I diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h index 7b9c50d..bc1c6e3 100644 --- a/src/vectors-8-DoubleHummer.h +++ b/src/vectors-8-DoubleHummer.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using IBM's Blue Gene/P Double Hummer (Power) // Use the type double _Complex directly, without introducing a wrapper class diff --git a/src/vectors-8-MIC.h b/src/vectors-8-MIC.h index 3f85119..d909e7c 100644 --- a/src/vectors-8-MIC.h +++ b/src/vectors-8-MIC.h @@ -1,10 +1,17 @@ +// -*-C++-*- // Vectorise using Intel's MIC // Use the type __m512d directly, without introducing a wrapper class +// See +// +// and +// . + #include +#include #include @@ -31,13 +38,6 @@ typedef bool CCTK_BOOLEAN8; -union k8const_t { - CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; - CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; - CCTK_INTEGER8_VEC vi; - CCTK_REAL8_VEC vf; -}; - #define k8sign (vec8i_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) #define k8notsign (vec8i_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) @@ -68,53 +68,15 @@ CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a0, return _mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0); // note reversed arguments } -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[0]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[1]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[2]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[3]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt4(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[4]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt5(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[5]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt6(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[6]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt7(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[7]; -} static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[d]; + CCTK_REAL8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_BOOLEAN8 vec8_elt(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN8 vec8_eltb(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) { return _mm512_mask2int(x) & (1 << d); } @@ -201,14 +163,23 @@ void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { - _mm512_packstorelo_pd(&p , x); + // TODO: Intel erratum suggests that hi should come before lo _mm512_packstorehi_pd(&p+8, x); + _mm512_packstorelo_pd(&p , x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { #if VECTORISE_STREAMING_STORES - _mm512_extstore_pd(&p, x, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT); + // non-temporal hint: + // _mm512_extstore_pd(&p, x, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT); + // no-read hint: + _mm512_storenr_pd(&p, x); + _mm_clevict(&p, _MM_HINT_T1); + // no-read hint, not globally ordered (requires fence?): + // _mm512_storenrngo_pd(&p, x); + // _mm_clevict(&p, _MM_HINT_T1); + #else _mm512_store_pd(&p, x); #endif @@ -243,6 +214,7 @@ void vec8_store_nta_partial_(__mmask8 const mask, CCTK_REAL8& p, CCTK_REAL8_VEC const x) { + // TODO: use vec8_store_nta(p, x) if all=true? _mm512_mask_store_pd(&p, mask, x); } @@ -376,9 +348,12 @@ CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) return _mm512_sqrt_pd(x); } + + // Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these -#if 0 // These implementations lead to an ICE with icpc 13.0.1 static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -465,32 +440,32 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) // These implementations are very expensive #define K8REPL(f,x) \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x)), \ - f(vec8_elt2(x)), \ - f(vec8_elt3(x)), \ - f(vec8_elt4(x)), \ - f(vec8_elt5(x)), \ - f(vec8_elt6(x)), \ - f(vec8_elt7(x))); + vec8_set(f(vec8_elt(x,0)), \ + f(vec8_elt(x,1)), \ + f(vec8_elt(x,2)), \ + f(vec8_elt(x,3)), \ + f(vec8_elt(x,4)), \ + f(vec8_elt(x,5)), \ + f(vec8_elt(x,6)), \ + f(vec8_elt(x,7))); #define K8REPL2S(f,x,a) \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a), \ - f(vec8_elt2(x),a), \ - f(vec8_elt3(x),a), \ - f(vec8_elt4(x),a), \ - f(vec8_elt5(x),a), \ - f(vec8_elt6(x),a), \ - f(vec8_elt7(x),a)); + vec8_set(f(vec8_elt(x,0),a), \ + f(vec8_elt(x,1),a), \ + f(vec8_elt(x,2),a), \ + f(vec8_elt(x,3),a), \ + f(vec8_elt(x,4),a), \ + f(vec8_elt(x,5),a), \ + f(vec8_elt(x,6),a), \ + f(vec8_elt(x,7),a)); #define K8REPL2(f,x,y) \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y)), \ - f(vec8_elt2(x),vec8_elt2(y)), \ - f(vec8_elt3(x),vec8_elt3(y)), \ - f(vec8_elt4(x),vec8_elt4(y)), \ - f(vec8_elt5(x),vec8_elt5(y)), \ - f(vec8_elt6(x),vec8_elt6(y)), \ - f(vec8_elt7(x),vec8_elt7(y))); + vec8_set(f(vec8_elt(x,0),vec8_elt(y,0)), \ + f(vec8_elt(x,1),vec8_elt(y,1)), \ + f(vec8_elt(x,2),vec8_elt(y,2)), \ + f(vec8_elt(x,3),vec8_elt(y,3)), \ + f(vec8_elt(x,4),vec8_elt(y,4)), \ + f(vec8_elt(x,5),vec8_elt(y,5)), \ + f(vec8_elt(x,6),vec8_elt(y,6)), \ + f(vec8_elt(x,7),vec8_elt(y,7))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -577,6 +552,7 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +// TODO: try k8lxor(x,x) and k8lxnor(x,x) #define k8lfalse (_mm512_int2mask( 0)) #define k8ltrue (_mm512_int2mask(~0)) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -606,7 +582,12 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, { // This leads to an ICE // return _mm512_mask_blend_pd(x, z, y); +#if 0 + // This works: return _mm512_mask_mov_pd(z, x, y); +#endif + // Intel suggests this: + return x==0 ? z : _mm512_mask_blend_pd(x, z, y); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h index 7639476..75c7fdb 100644 --- a/src/vectors-8-QPX.h +++ b/src/vectors-8-QPX.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using IBM's Blue Gene/Q QPX (Power) // Use the type vector4double directly, without introducing a wrapper class @@ -12,9 +13,13 @@ #include +// #define vec8_assert(x) ((void)0) +#define vec8_assert(x) assert(x) + #ifdef __cplusplus # include #endif +#include @@ -35,9 +40,9 @@ struct CCTK_REAL8_VEC { #define CCTK_REAL8_VEC_SIZE 4 // Integer and boolean types corresponding to this real type -//#define CCTK_INTEGER8 CCTK_REAL8 +#define CCTK_INTEGER8 CCTK_INT8 #define CCTK_BOOLEAN8 CCTK_REAL8 -//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC +#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC #define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC @@ -45,22 +50,53 @@ struct CCTK_REAL8_VEC { // Create vectors, extract vector elements #define vec8_set1(a) (vec_splats(a)) +#if 0 #define vec8_set(a,b,c,d) \ (vec_insert \ (d,vec_insert \ (c,vec_insert \ (b,vec_insert \ (a,CCTK_REAL8_VEC(),0),1),2),3)) +#endif +#define vec8_set(a_,b_,c_,d_) \ + ({ \ + CCTK_REAL8 const a__ = (a_); \ + CCTK_REAL8 const b__ = (b_); \ + CCTK_REAL8 const c__ = (c_); \ + CCTK_REAL8 const d__ = (d_); \ + CCTK_REAL8 const a = a__; \ + CCTK_REAL8 const b = b__; \ + CCTK_REAL8 const c = c__; \ + CCTK_REAL8 const d = d__; \ + CCTK_REAL8_VEC x; \ + ((CCTK_REAL*)&x)[0] = a; \ + ((CCTK_REAL*)&x)[1] = b; \ + ((CCTK_REAL*)&x)[2] = c; \ + ((CCTK_REAL*)&x)[3] = d; \ + x; \ + }) #define vec8_b2r(b) ((b)?+1.0:-1.0) -#define vec8b_set(a,b,c,d) \ - (vec8_set(vec8_b2r(a),vec8_b2r(b),vec8_b2r(c),vec8_b2r(d))) +#define vec8b_set(a,b,c,d) \ + (vec8_set(vec8_b2r(a), vec8_b2r(b), vec8_b2r(c), vec8_b2r(d))) #define vec8_elt0(x) (vec_extract(x,0)) #define vec8_elt1(x) (vec_extract(x,1)) #define vec8_elt2(x) (vec_extract(x,2)) #define vec8_elt3(x) (vec_extract(x,3)) #define vec8_elt(x,d) (vec_extract(x,d)) +#define vec8_elts(x,a,b,c,d) \ + ({ \ + CCTK_REAL8_VEC x__ = (x_); \ + CCTK_REAL8_VEC x = x__; \ + a = ((CCTK_REAL*)&x)[0]; \ + b = ((CCTK_REAL*)&x)[1]; \ + c = ((CCTK_REAL*)&x)[2]; \ + d = ((CCTK_REAL*)&x)[3]; \ + }) + +#define vec8_r2b(x) ((x)>=0.0) +#define vec8b_elt(x,d) (vec8_r2b(vec8_elt(x,d))) @@ -76,10 +112,25 @@ struct CCTK_REAL8_VEC { vector4double v1, v2, vp; \ /* code taken from IBM's compiler documentation */ \ v1 = vec_ld(0,&p); /* load the left part of the vector */ \ - v2 = vec_ld(32,&p); /* load the right part of the vector */ \ + v2 = vec_ld(31,&p); /* load the right part of the vector */ \ vp = vec_lvsl(0,&p); /* generate control value */ \ vec_perm(v1,v2,vp); /* generate the aligned vector */ \ }) +#define vec8_loadu_off(off_,p_) \ + ({ \ + int const off__ = (off_); \ + CCTK_REAL8 const& p__ = (p_); \ + int off = off__; \ + CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \ + vector4double v1, v2; \ + off &= CCTK_REAL8_VEC_SIZE-1; \ + v1 = vec_lda(0,&p-off); \ + v2 = vec_lda(0,&p-off+CCTK_REAL8_VEC_SIZE); \ + off==1 ? vec_sldw(v1,v2,1) : \ + off==2 ? vec_sldw(v1,v2,2) : \ + off==3 ? vec_sldw(v1,v2,3) : \ + (vec8_assert(0), v1); \ + }) // Load a vector from memory that may or may not be aligned, as // decided by the offset and the vector size @@ -88,13 +139,15 @@ struct CCTK_REAL8_VEC { # define vec8_loadu_maybe(off,p) vec8_loadu(p) # define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) #else -# define vec8_loadu_maybe(off,p_) \ +# define vec8_loadu_maybe(off_,p_) \ ({ \ CCTK_REAL8 const& p__=(p_); \ + int const off__=(off_); \ CCTK_REAL8 const& p=p__; \ - (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ + int const off=off__; \ + off % CCTK_REAL8_VEC_SIZE == 0 ? \ vec8_load(p) : \ - vec8_loadu(p); \ + vec8_loadu_off(off,p); \ }) # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size @@ -128,23 +181,35 @@ struct CCTK_REAL8_VEC { m1 = k8lfalse; \ m2 = k8ltrue; \ m3 = vec_perm(m1,m2,vp); \ - /* get existing data */ \ - v1 = vec_ld(0,&p); \ - v2 = vec_ld(32,&p); \ - /* permute and insert */ \ v3 = vec_perm(x,x,vp); \ - v1 = vec_sel(v1,v3,m3); \ - v2 = vec_sel(v3,v2,m3); \ - /* store data back */ \ - vec_st(0,&p,v1); \ - vec_st(32,&p,v2); \ + _Pragma("tm_atomic") { \ + /* get existing data */ \ + v1 = vec_ld(0,&p); \ + v2 = vec_ld(31,&p); \ + /* permute and insert */ \ + v1 = vec_sel(v1,v3,m3); \ + v2 = vec_sel(v3,v2,m3); \ + /* store data back */ \ + vec_st(0,&p,v1); \ + vec_st(31,&p,v2); \ + } \ }) #define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache +#if VECTORISE_ALIGNED_ARRAYS +// Arrays are aligned; wrap-around is not an issue +# define vec8_store_omp +#else +// Need to protect partial stores, as they may wrap around to the +// beginning of the next line in the array +# define vec8_store_omp _Pragma("tm_atomic") +#endif + // Store a partial vector (aligned and non-temporal) #define vec8_store_partial_prepare(i,imin_,imax_) \ bool v8stp_all; \ - CCTK_REAL8_VEC v8stp_mask; \ + CCTK_BOOLEAN8_VEC v8stp_mask; \ + bool v8stp_mask0, v8stp_mask1, v8stp_mask2, v8stp_mask3; \ ({ \ ptrdiff_t const imin__=(imin_); \ ptrdiff_t const imax__=(imax_); \ @@ -154,7 +219,8 @@ struct CCTK_REAL8_VEC { v8stp_all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1=imin, i+1>=imin, i+2>=imin, i+3>=imin); \ @@ -167,18 +233,22 @@ struct CCTK_REAL8_VEC { */ \ /* We assume p[i] is aligned */ \ /* Ensure at least one vector element is inside the active region */ \ - assert(i-imin>=-(CCTK_REAL8_VEC_SIZE-1)); \ + vec8_assert(i-imin>=-(CCTK_REAL8_VEC_SIZE-1)); \ vp_lo = vec_lvsl(8 * (i-imin), (CCTK_REAL*)0); \ mask_lo = (i-imin >= 0 ? \ k8ltrue : \ vec_perm(k8lfalse, k8ltrue, vp_lo)); \ /* Ensure at least one vector element is inside the active region */ \ - assert(i0 and n0 and n0 and n0 and n0 and nlo0 and nlo0 and nhi0 and nhi=0 is true, -0 is true, nan is false diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index bded69e..b6dcfa6 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using Intel's or AMD's SSE2 // Use the type __m128d directly, without introducing a wrapper class @@ -32,6 +33,9 @@ // asm ("movntsd %[x],%[p]" : "=m" (*p) : [p] "m" (*p), [x] "x" (x)); // } +#endif +#ifdef __AVX__ +# include #endif #ifdef __FMA4__ # include @@ -59,9 +63,13 @@ // Vector type corresponding to CCTK_REAL +// Note: some boolean masks (e.g. ~0) correspond to nan when +// interpreted as floating point number. gcc 4.8 is clever enough to +// optimize away such constants with fast-math. We therefore need to +// handle this constant as integer number. typedef __m128d CCTK_REAL8_VEC; typedef __m128i CCTK_INTEGER8_VEC; -typedef __m128d CCTK_BOOLEAN8_VEC; +typedef __m128i CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 2 @@ -70,16 +78,21 @@ vec_static_assert(sizeof(CCTK_REAL8_VEC) == sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); // Integer and boolean types corresponding to this real type -typedef CCTK_INT8 CCTK_INTEGER8; -typedef CCTK_REAL8 CCTK_BOOLEAN8; +typedef CCTK_INT8 CCTK_INTEGER8; +typedef CCTK_INT8 CCTK_BOOLEAN8; + + + +// These macros are undefined at the end of this file -- use them only +// within functions, not within macros that are exported +#define I2R(x) _mm_castsi128_pd(x) +#define R2I(x) _mm_castpd_si128(x) union k8const_t { CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; - CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; CCTK_INTEGER8_VEC vi; - CCTK_REAL8_VEC vf; }; #define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) @@ -95,13 +108,13 @@ CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) return _mm_set1_pd(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a) +CCTK_INTEGER8_VEC vec8_set1i(CCTK_INT8 const a) { #if defined(__INTEL_COMPILER) // Intel 11.1 does not support _mm_set1_epi64x - return _mm_set1_pd(*(CCTK_REAL8 const*)&a); + return R2I(_mm_set1_pd(*(CCTK_REAL8 const*)&a)); #else - return _mm_castsi128_pd(_mm_set1_epi64x(a)); + return _mm_set1_epi64x(a); #endif } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -118,19 +131,25 @@ CCTK_REAL8_VEC vec8_swap10(CCTK_REAL8_VEC const x) } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[0]; + CCTK_REAL8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) +CCTK_INTEGER8 vec8_elti(CCTK_INTEGER8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[1]; + CCTK_INTEGER8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN8 vec8_eltb(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[d]; + CCTK_BOOLEAN8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } @@ -240,6 +259,8 @@ void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { + // TODO: requires _mm_sfence() afterwards? requires _mm_lfence() in + // readers afterwards? maybe better just an _mm_mfence() afterwards? _mm_stream_pd(&p, x); } #endif @@ -394,7 +415,7 @@ void vec8_store_nta_partial_mid(CCTK_REAL8& p, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x) { - return _mm_xor_pd(k8sign, x); + return _mm_xor_pd(I2R(k8sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -483,13 +504,13 @@ CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_or_pd(_mm_and_pd(k8notsign, x), - _mm_and_pd(k8sign , y)); + return _mm_or_pd(_mm_and_pd(I2R(k8notsign), x), + _mm_and_pd(I2R(k8sign ), y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x) { - return _mm_and_pd(k8notsign, x); + return _mm_and_pd(I2R(k8notsign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) @@ -504,7 +525,7 @@ CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x) { - return _mm_or_pd(k8sign, x); + return _mm_or_pd(I2R(k8sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) @@ -512,16 +533,104 @@ CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) return _mm_sqrt_pd(x); } + + // Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return _mm_acos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return _mm_acosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return _mm_asin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return _mm_asinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return _mm_atan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_atan2_pd(x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return _mm_atanh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return _mm_cos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return _mm_cosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return _mm_exp_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return _mm_log_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return _mm_pow_pd(x, _mm_set1_pd(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return _mm_sin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return _mm_sinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return _mm_tan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return _mm_tanh_pd(x); +} + +#else + #define K8REPL(f,x) \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x))); + vec8_set(f(vec8_elt(x,0)), \ + f(vec8_elt(x,1))); #define K8REPL2S(f,x,a) \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a)); + vec8_set(f(vec8_elt(x,0),a), \ + f(vec8_elt(x,1),a)); #define K8REPL2(f,x,y) \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y))); + vec8_set(f(vec8_elt(x,0),vec8_elt(y,0)), \ + f(vec8_elt(x,1),vec8_elt(y,1))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -604,6 +713,8 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) return K8REPL(tanh,x); } +#endif + #define k8lfalse (vec8_set1i( 0)) @@ -611,22 +722,22 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x) { - return _mm_xor_pd(k8ltrue, x); + return R2I(_mm_xor_pd(I2R(k8ltrue), I2R(x))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm_and_pd(x, y); + return R2I(_mm_and_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm_or_pd(x, y); + return R2I(_mm_or_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm_xor_pd(x, y); + return R2I(_mm_xor_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, @@ -634,7 +745,7 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, CCTK_REAL8_VEC const z) { #ifdef __SSE4_1__ - return _mm_blendv_pd(z,y,x); + return _mm_blendv_pd(z,y,I2R(x)); #elif 0 // This is slow (but this is what Intel/PGI produce by themselves) int const m = _mm_movemask_pd(x); @@ -645,8 +756,8 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, } return z; #elif 0 - return vec8_set(std::signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), - std::signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); + return vec8_set(std::signbit(vec8_elt(x,0)) ? vec8_elt(y,0) : vec8_elt(z,0), + std::signbit(vec8_elt(x,1)) ? vec8_elt(y,1) : vec8_elt(z,1)); #elif 0 // We don't need to shift -- the condition (mask) will be either all // zeros or all ones @@ -661,39 +772,39 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, // This assumes that all logical operations always return either // lfalse or ltrue, and nothing "in between" // (z & ~mask) | (y & mask) where imask = ~mask - return _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); + return _mm_or_pd(_mm_and_pd(I2R(x), y), _mm_andnot_pd(I2R(x), z)); #endif } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpeq_pd(x, y); + return R2I(_mm_cmpeq_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpneq_pd(x, y); + return R2I(_mm_cmpneq_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpgt_pd(x, y); + return R2I(_mm_cmpgt_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpge_pd(x, y); + return R2I(_mm_cmpge_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmplt_pd(x, y); + return R2I(_mm_cmplt_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmple_pd(x, y); + return R2I(_mm_cmple_pd(x, y)); } @@ -702,9 +813,14 @@ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) { CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0)); - CCTK_REAL8_VEC const sign = _mm_and_pd(k8sign, x); + CCTK_REAL8_VEC const sign = _mm_and_pd(I2R(k8sign), x); CCTK_REAL8_VEC const signedone = _mm_or_pd(sign, vec8_set1(1.0)); return k8ifthen(iszero, vec8_set1(0.0), signedone); } + + +#undef I2R +#undef R2I + #endif diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h index 5c07bfb..fac21ba 100644 --- a/src/vectors-8-default.h +++ b/src/vectors-8-default.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Fallback vectorisation implementation: Do not vectorise @@ -19,10 +20,10 @@ vec_static_assert(sizeof(CCTK_REAL8_VEC) == sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); // Integer and boolean types corresponding to this real type -//#define CCTK_INTEGER8 CCTK_REAL8 -#define CCTK_BOOLEAN8 CCTK_REAL8 -//#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC -#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC +#define CCTK_INTEGER8 CCTK_INT8 +#define CCTK_BOOLEAN8 CCTK_INT8 +#define CCTK_INTEGER8_VEC CCTK_INT8 +#define CCTK_BOOLEAN8_VEC CCTK_INT8 @@ -34,6 +35,8 @@ vec_static_assert(sizeof(CCTK_REAL8_VEC) == // Access vectors elements #define vec8_elt0(x) (x) #define vec8_elt(x,d) (x) +#define vec8_elti(x,d) (x) +#define vec8_eltb(x,d) (x) @@ -105,29 +108,21 @@ vec_static_assert(sizeof(CCTK_REAL8_VEC) == #define k8tanh(x) (tanh(x)) #define k8signbit(x) (std::signbit(x)) -static inline CCTK_REAL8_VEC k8l2r(CCTK_INT8 const x) -{ - return *(CCTK_REAL8 const*)&x; -} -static inline CCTK_INT8 k8r2l(CCTK_REAL8_VEC const x) -{ - return *(CCTK_INT8 const*)&x; -} -#define k8lfalse k8l2r(0) -#define k8ltrue k8l2r(1) -#define k8lnot(x) k8l2r(!k8r2l(x)) -#define k8land(x,y) k8l2r(k8r2l(x) && k8r2l(y)) -#define k8lor(x,y) k8l2r(k8r2l(x) || k8r2l(y)) -#define k8lxor(x,y) k8l2r(!k8r2l(x) != !k8r2l(y)) - -#define k8ifthen(x,y,z) (k8r2l(x)?(y):(z)) - -#define k8cmpeq(x,y) k8l2r((x)==(y)) -#define k8cmpne(x,y) k8l2r((x)!=(y)) -#define k8cmpgt(x,y) k8l2r((x)>(y)) -#define k8cmpge(x,y) k8l2r((x)>=(y)) -#define k8cmplt(x,y) k8l2r((x)<(y)) -#define k8cmple(x,y) k8l2r((x)<=(y)) +#define k8lfalse 0 +#define k8ltrue 1 +#define k8lnot(x) (!(x)) +#define k8land(x,y) ((x) && (y)) +#define k8lor(x,y) ((x) || (y)) +#define k8lxor(x,y) (!(x) != !(y)) + +#define k8ifthen(x,y,z) ((x)?(y):(z)) + +#define k8cmpeq(x,y) ((x)==(y)) +#define k8cmpne(x,y) ((x)!=(y)) +#define k8cmpgt(x,y) ((x)>(y)) +#define k8cmpge(x,y) ((x)>=(y)) +#define k8cmplt(x,y) ((x)<(y)) +#define k8cmple(x,y) ((x)<=(y)) static inline CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) { diff --git a/src/vectors.h b/src/vectors.h index 9222823..9a47422 100644 --- a/src/vectors.h +++ b/src/vectors.h @@ -13,7 +13,7 @@ #if VECTORISE -# if defined __AVX__ // Intel AVX +# if defined __AVX__ && !defined DISABLE_AVX // Intel AVX # include "vectors-4-AVX.h" # elif defined __SSE__ // Intel SSE # include "vectors-4-SSE.h" @@ -23,13 +23,13 @@ # if defined __MIC__ // Intel MIC # include "vectors-8-MIC.h" -# elif defined __AVX__ && !defined DISABLE_AVX // Intel AVX +# elif defined __AVX__ && !defined DISABLE_AVX // Intel AVX # include "vectors-8-AVX.h" # elif defined __SSE2__ // Intel SSE2 # include "vectors-8-SSE2.h" -# elif defined __bgq__ && defined __VECTOR4DOUBLE__ // Blue Gene/Q QPX +# elif defined __bgq__ && defined __VECTOR4DOUBLE__ // Blue Gene/Q QPX # include "vectors-8-QPX.h" -# elif defined __ALTIVEC__ && defined _ARCH_PWR7 // Power VSX +# elif defined __ALTIVEC__ && defined _ARCH_PWR7 // Power VSX # include "vectors-8-VSX.h" # elif defined _ARCH_450D // Blue Gene/P Double Hummer # include "vectors-8-DoubleHummer.h" @@ -63,8 +63,9 @@ # define vec_set1 vec4_set1 # define vec_set vec4_set -# define vec_elt0 vec4_elt0 # define vec_elt vec4_elt +# define vec_elti vec4_elti +# define vec_eltb vec4_eltb # define vec_load vec4_load # define vec_loadu vec4_loadu @@ -135,16 +136,17 @@ # define CCTK_REAL_VEC CCTK_REAL8_VEC # define CCTK_REAL_VEC_SIZE CCTK_REAL8_VEC_SIZE -//# define CCTK_INTEGER CCTK_INTEGER8 +# define CCTK_INTEGER CCTK_INTEGER8 # define CCTK_BOOLEAN CCTK_BOOLEAN8 -//# define CCTK_INTEGER_VEC CCTK_INTEGER8_VEC +# define CCTK_INTEGER_VEC CCTK_INTEGER8_VEC # define CCTK_BOOLEAN_VEC CCTK_BOOLEAN8_VEC # define vec_set1 vec8_set1 # define vec_set vec8_set -# define vec_elt0 vec8_elt0 # define vec_elt vec8_elt +# define vec_elti vec8_elti +# define vec_eltb vec8_eltb # define vec_load vec8_load # define vec_loadu vec8_loadu -- cgit v1.2.3