diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2013-07-19 17:48:51 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2013-07-19 17:48:51 +0000 |
commit | dc69911dd15fa1fa24c51ca222fc7883d3fc5cff (patch) | |
tree | 0ee4c550f788de3787c2e922f1268189334b6983 /src/vectors-8-MIC.h | |
parent | 825b89e0e6bf3e4e248188b36f5b29029737d44a (diff) |
Do not use type punning any more
Do not cast between different pointer types. This is illegal in C/C++,
and modern compilers (such as gcc 4.8) then generate wrong code.
Instead, use memcpy to re-interpret the bit patterns of values with a
different type.
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@85 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-8-MIC.h')
-rw-r--r-- | src/vectors-8-MIC.h | 133 |
1 files changed, 57 insertions, 76 deletions
diff --git a/src/vectors-8-MIC.h b/src/vectors-8-MIC.h index 3f85119..d909e7c 100644 --- a/src/vectors-8-MIC.h +++ b/src/vectors-8-MIC.h @@ -1,10 +1,17 @@ +// -*-C++-*- // Vectorise using Intel's MIC // Use the type __m512d directly, without introducing a wrapper class +// See +// <http://software.intel.com/sites/products/documentation/doclib/stdxe/2013/composerxe/compiler/cpp-lin/index.htm#GUID-B8DF6000-6872-47B4-AA64-D47A38AF21BD.htm> +// and +// <http://software.intel.com/sites/default/files/forum/278102/327364001en.pdf>. + #include <cstdlib> +#include <cstring> #include <immintrin.h> @@ -31,13 +38,6 @@ typedef bool CCTK_BOOLEAN8; -union k8const_t { - CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; - CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; - CCTK_INTEGER8_VEC vi; - CCTK_REAL8_VEC vf; -}; - #define k8sign (vec8i_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) #define k8notsign (vec8i_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) @@ -69,52 +69,14 @@ CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a0, } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[0]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[1]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[2]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[3]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt4(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[4]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt5(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[5]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt6(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[6]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt7(CCTK_REAL8_VEC const x) -{ - return ((CCTK_REAL8 const*)&x)[7]; -} -static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[d]; + CCTK_REAL8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_BOOLEAN8 vec8_elt(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN8 vec8_eltb(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) { return _mm512_mask2int(x) & (1 << d); } @@ -201,14 +163,23 @@ void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { - _mm512_packstorelo_pd(&p , x); + // TODO: Intel erratum suggests that hi should come before lo _mm512_packstorehi_pd(&p+8, x); + _mm512_packstorelo_pd(&p , x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { #if VECTORISE_STREAMING_STORES - _mm512_extstore_pd(&p, x, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT); + // non-temporal hint: + // _mm512_extstore_pd(&p, x, _MM_DOWNCONV_PD_NONE, _MM_HINT_NT); + // no-read hint: + _mm512_storenr_pd(&p, x); + _mm_clevict(&p, _MM_HINT_T1); + // no-read hint, not globally ordered (requires fence?): + // _mm512_storenrngo_pd(&p, x); + // _mm_clevict(&p, _MM_HINT_T1); + #else _mm512_store_pd(&p, x); #endif @@ -243,6 +214,7 @@ void vec8_store_nta_partial_(__mmask8 const mask, CCTK_REAL8& p, CCTK_REAL8_VEC const x) { + // TODO: use vec8_store_nta(p, x) if all=true? _mm512_mask_store_pd(&p, mask, x); } @@ -376,9 +348,12 @@ CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) return _mm512_sqrt_pd(x); } + + // Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these -#if 0 // These implementations lead to an ICE with icpc 13.0.1 static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -465,32 +440,32 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) // These implementations are very expensive #define K8REPL(f,x) \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x)), \ - f(vec8_elt2(x)), \ - f(vec8_elt3(x)), \ - f(vec8_elt4(x)), \ - f(vec8_elt5(x)), \ - f(vec8_elt6(x)), \ - f(vec8_elt7(x))); + vec8_set(f(vec8_elt(x,0)), \ + f(vec8_elt(x,1)), \ + f(vec8_elt(x,2)), \ + f(vec8_elt(x,3)), \ + f(vec8_elt(x,4)), \ + f(vec8_elt(x,5)), \ + f(vec8_elt(x,6)), \ + f(vec8_elt(x,7))); #define K8REPL2S(f,x,a) \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a), \ - f(vec8_elt2(x),a), \ - f(vec8_elt3(x),a), \ - f(vec8_elt4(x),a), \ - f(vec8_elt5(x),a), \ - f(vec8_elt6(x),a), \ - f(vec8_elt7(x),a)); + vec8_set(f(vec8_elt(x,0),a), \ + f(vec8_elt(x,1),a), \ + f(vec8_elt(x,2),a), \ + f(vec8_elt(x,3),a), \ + f(vec8_elt(x,4),a), \ + f(vec8_elt(x,5),a), \ + f(vec8_elt(x,6),a), \ + f(vec8_elt(x,7),a)); #define K8REPL2(f,x,y) \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y)), \ - f(vec8_elt2(x),vec8_elt2(y)), \ - f(vec8_elt3(x),vec8_elt3(y)), \ - f(vec8_elt4(x),vec8_elt4(y)), \ - f(vec8_elt5(x),vec8_elt5(y)), \ - f(vec8_elt6(x),vec8_elt6(y)), \ - f(vec8_elt7(x),vec8_elt7(y))); + vec8_set(f(vec8_elt(x,0),vec8_elt(y,0)), \ + f(vec8_elt(x,1),vec8_elt(y,1)), \ + f(vec8_elt(x,2),vec8_elt(y,2)), \ + f(vec8_elt(x,3),vec8_elt(y,3)), \ + f(vec8_elt(x,4),vec8_elt(y,4)), \ + f(vec8_elt(x,5),vec8_elt(y,5)), \ + f(vec8_elt(x,6),vec8_elt(y,6)), \ + f(vec8_elt(x,7),vec8_elt(y,7))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -577,6 +552,7 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +// TODO: try k8lxor(x,x) and k8lxnor(x,x) #define k8lfalse (_mm512_int2mask( 0)) #define k8ltrue (_mm512_int2mask(~0)) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -606,7 +582,12 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, { // This leads to an ICE // return _mm512_mask_blend_pd(x, z, y); +#if 0 + // This works: return _mm512_mask_mov_pd(z, x, y); +#endif + // Intel suggests this: + return x==0 ? z : _mm512_mask_blend_pd(x, z, y); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE |