// -*-C++-*- // Vectorise using Intel's or AMD's AVX // Use the type __m256d directly, without introducing a wrapper class #include #include #include #ifdef __FMA4__ # include #endif #ifdef __FMA4__ # define vec8_architecture_FMA4 "+FMA4" #else # define vec8_architecture_FMA4 "" #endif #define vec8_architecture "AVX" vec8_architecture_FMA4 " (64-bit precision)" // Vector type corresponding to CCTK_REAL // Note: some boolean masks (e.g. ~0) correspond to nan when // interpreted as floating point number. gcc 4.8 is clever enough to // optimize away such constants with fast-math. We therefore need to // handle this constant as integer number. typedef __m256d CCTK_REAL8_VEC; typedef __m256i CCTK_INTEGER8_VEC; typedef __m256i CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 4 vec_static_assert(sizeof(CCTK_REAL8_VEC) == sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); // Integer and boolean types corresponding to this real type typedef CCTK_INT8 CCTK_INTEGER8; typedef CCTK_INT8 CCTK_BOOLEAN8; // These macros are undefined at the end of this file -- use them only // within functions, not within macros that are exported #define I2R(x) _mm256_castsi256_pd(x) #define R2I(x) _mm256_castpd_si256(x) union k8const_t { CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; CCTK_INTEGER8_VEC vi; }; #define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) #define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) // Create vectors, extract vector elements static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) { return _mm256_set1_pd(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_INTEGER8_VEC vec8_set1i(CCTK_INT8 const a) { return _mm256_set1_epi64x(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, CCTK_REAL8 const b, CCTK_REAL8 const c, CCTK_REAL8 const d) { return _mm256_set_pd(d,c,b,a); // note reversed arguments } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) { CCTK_REAL8 e; std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_INTEGER8 vec8_elti(CCTK_INTEGER8_VEC const x, std::ptrdiff_t const d) { CCTK_INTEGER8 e; std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8 vec8_eltb(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) { CCTK_BOOLEAN8 e; std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); return e; } // Load and store vectors // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p) { return _mm256_loadu_pd(&p); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p) { return _mm256_loadu_pd(&p); } #if VECTORISE_ALWAYS_USE_ALIGNED_LOADS # error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported" #endif // Load a vector from memory that may or may not be aligned, as // decided by the offset off and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) { return vec8_loadu(p); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, std::ptrdiff_t const off2, std::ptrdiff_t const off3, CCTK_REAL8 const& p) { return vec8_loadu(p); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) { return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_loadu(p); } # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, std::ptrdiff_t const off2, std::ptrdiff_t const off3, CCTK_REAL8 const& p) { return vec8_loadu_maybe(off1, p); } # else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, std::ptrdiff_t const off2, std::ptrdiff_t const off3, CCTK_REAL8 const& p) { return off2 % CCTK_REAL8_VEC_SIZE != 0 or off3 % CCTK_REAL8_VEC_SIZE != 0 ? vec8_loadu(p) : vec8_loadu_maybe(off1, p); } # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { _mm256_store_pd(&p, x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { _mm256_storeu_pd(&p, x); } #if ! VECTORISE_STREAMING_STORES static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { vec8_store(p, x); } #else static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { _mm256_stream_pd(&p, x); } #endif // Store a partial vector (aligned and non-temporal) #define vec8_store_partial_prepare(i, imin,imax) \ bool v8stp_all; \ __m256i v8stp_mask; \ vec8_store_partial_prepare_(v8stp_all, v8stp_mask, i, imin, imax) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_partial_prepare_(bool& all, __m256i& mask, std::ptrdiff_t const i, std::ptrdiff_t const imin, std::ptrdiff_t const imax) { all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1