diff options
Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r-- | src/vectors-8-SSE2.h | 831 |
1 files changed, 555 insertions, 276 deletions
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index 6dfe89f..2326e49 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -1,12 +1,18 @@ - +// Vectorise using Intel's or AMD's SSE2 // Use the type __m128d directly, without introducing a wrapper class -// Use macros instead of inline functions + +#ifdef __PGI +// PGI doesn't want to inline functions +# include "macros/vectors-8-SSE2.h" +#else + +#include <cassert> +#include <cmath> + -#include <assert.h> -#include <math.h> #include <emmintrin.h> #ifdef __SSE4_1__ @@ -28,7 +34,7 @@ #endif #ifdef __FMA4__ -# include <fma4intrin.h> +# include <x86intrin.h> #endif @@ -53,46 +59,79 @@ // Vector type corresponding to CCTK_REAL -#define CCTK_REAL8_VEC __m128d +typedef __m128d CCTK_REAL8_VEC; +typedef __m128i CCTK_INTEGER8_VEC; +typedef __m128d CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 2 +vec_static_assert(sizeof(CCTK_REAL8_VEC) == + sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); + // Integer and boolean types corresponding to this real type -#define CCTK_INTEGER8 CCTK_REAL8 -#define CCTK_BOOLEAN8 CCTK_REAL8 -#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC -#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC +typedef CCTK_INT8 CCTK_INTEGER8; +typedef CCTK_REAL8 CCTK_BOOLEAN8; union k8const_t { - long long i[2]; - double f[2]; - __m128i vi; - __m128d vf; + CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; + CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; + CCTK_INTEGER8_VEC vi; + CCTK_REAL8_VEC vf; }; -#define K8_IMIN ((long long)0x8000000000000000ULL) +#define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) +#define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL))) // Create vectors, extract vector elements -#define vec8_set1(a) (_mm_set1_pd(a)) -#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) +{ + return _mm_set1_pd(a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a) +{ +#if defined(__INTEL_COMPILER) + // Intel 11.1 does not support _mm_set1_epi64x + return _mm_set1_pd(*(CCTK_REAL8 const*)&a); +#else + return _mm_castsi128_pd(_mm_set1_epi64x(a)); +#endif +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a, CCTK_REAL8 const b) +{ + return _mm_set_pd(b,a); // note reversed arguments +} // original order is 01 -#define vec8_swap10(x_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \ - }) - -#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0]) -#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1]) -#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d]) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_swap10(CCTK_REAL8_VEC const x) +{ + return _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[0]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) +{ + return ((CCTK_REAL8 const*)&x)[1]; +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +{ + return ((CCTK_REAL8 const*)&x)[d]; +} @@ -100,141 +139,237 @@ union k8const_t { // Load a vector from memory (aligned and unaligned); this loads from // a reference to a scalar -#define vec8_load(p) (_mm_load_pd(&(p))) -#define vec8_loadu(p) (_mm_loadu_pd(&(p))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p) +{ + return _mm_load_pd(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p) +{ + return _mm_loadu_pd(&p); +} #if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS -# define vec8_load_off1(p) vec_loadu(p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load_off1(CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} #else -# define vec8_load_off1(p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - _mm_shuffle_pd(vec8_load((&p)[-1]), \ - vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_load_off1(CCTK_REAL8 const& p) +{ + return _mm_shuffle_pd(vec8_load((&p)[-1]), + vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); +} #endif // Load a vector from memory that may or may not be aligned, as // decided by the offset off and the vector size #if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load -# define vec8_loadu_maybe(off,p) vec8_loadu(p) -# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu(p); +} #else -# define vec8_loadu_maybe(off,p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ - vec8_load(p) : \ - vec8_load_off1(p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p) +{ + // The :? operator breaks with the Intel compiler + // return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_load_off1(p); + if (off % CCTK_REAL8_VEC_SIZE == 0) return vec8_load(p); + return vec8_load_off1(p); +} # if VECTORISE_ALIGNED_ARRAYS // Assume all array x sizes are multiples of the vector size -# define vec8_loadu_maybe3(off1,off2,off3,p) \ - vec8_loadu_maybe(off1,p) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return vec8_loadu_maybe(off1, p); +} # else -# define vec8_loadu_maybe3(off1,off2,off3,p_) \ - ({ \ - CCTK_REAL8 const& p__=(p_); \ - CCTK_REAL8 const& p=p__; \ - ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ - (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ - vec8_loadu(p) : \ - vec8_loadu_maybe(off1,p); \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1, + std::ptrdiff_t const off2, + std::ptrdiff_t const off3, + CCTK_REAL8 const& p) +{ + return + off2 % CCTK_REAL8_VEC_SIZE != 0 or + off3 % CCTK_REAL8_VEC_SIZE != 0 ? + vec8_loadu(p) : + vec8_loadu_maybe(off1, p); +} # endif #endif // Store a vector to memory (aligned and non-temporal); this stores to // a reference to a scalar -#define vec8_store(p,x) (_mm_store_pd(&(p),x)) -#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm_store_pd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm_storeu_pd(&p, x); +} #if ! VECTORISE_STREAMING_STORES -# define vec8_store_nta(p,x) vec8_store(p,x) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + vec8_store(p, x); +} #else -# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) +{ + _mm_stream_pd(&p, x); +} #endif // Store a partial vector (aligned and non-temporal) -#define vec8_store_partial_prepare(i,imin,imax) \ - bool const v8stp_lo = (i)>=(imin); \ - bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE-1<(imax) +#define vec8_store_partial_prepare(i, imin,imax) \ + bool v8stp_lo, v8stp_hi; \ + vec8_store_partial_prepare_(v8stp_lo, v8stp_hi, i, imin, imax); +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_partial_prepare_(bool& lo, bool& hi, + std::ptrdiff_t const i, + std::ptrdiff_t const imin, + std::ptrdiff_t const imax) +{ + lo = i >= imin; + hi = i+CCTK_REAL8_VEC_SIZE-1 < imax; +} +#define vec8_store_nta_partial(p, x) \ + vec8_store_nta_partial_(v8stp_lo, v8stp_hi, p, x) #if VECTORISE_STREAMING_STORES && defined(__SSE4A__) -# define vec8_store_nta_partial(p_,x_) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ - vec8_store_nta(p,x); \ - } else if (v8stp_lo) { \ - _mm_stream_sd(&p,x); \ - } else if (v8stp_hi) { \ - _mm_stream_sd(&p+1, vec8_swap10(x)); \ - } \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_(bool const lo, bool const hi, + CCTK_REAL8& p, + CCTK_REAL8_VEC const x) +{ + if (CCTK_BUILTIN_EXPECT(lo and hi, true)) { + vec8_store_nta(p, x); + } else if (lo) { + _mm_stream_sd(&p, x); + } else if (hi) { + _mm_stream_sd(&p+1, vec8_swap10(x)); + } +} #else -# define vec8_store_nta_partial(p_,x_) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \ - vec8_store_nta(p,x); \ - } else if (v8stp_lo) { \ - _mm_storel_pd(&p,x); \ - } else if (v8stp_hi) { \ - _mm_storeh_pd(&p+1,x); \ - } \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_(bool const lo, bool const hi, + CCTK_REAL8& p, + CCTK_REAL8_VEC const x) +{ + if (CCTK_BUILTIN_EXPECT(lo and hi, true)) { + vec8_store_nta(p, x); + } else if (lo) { + _mm_storel_pd(&p, x); + } else if (hi) { + _mm_storeh_pd(&p+1, x); + } +} #endif // Store a lower or higher partial vector (aligned and non-temporal) #if ! VECTORISE_STREAMING_STORES -# define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) -# define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storel_pd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storeh_pd(&p+1, x); +} #else # if defined(__SSE4A__) -# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x)) -# define vec8_store_nta_partial_hi(p,x,n) \ - (_mm_stream_sd(&(p)+1, vec8_swap10(x))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_stream_sd(&p, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_stream_sd(&p+1, vec8_swap10(x)); +} # else // TODO: use clflush once a whole cache line has been written (cache // lines are usually larger than the CPU vector size) -# define vec8_store_nta_partial_lo(p_,x,n) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - _mm_storel_pd(&p,x); \ - /* _mm_clflush(&p); */ \ - }) -# define vec8_store_nta_partial_hi(p_,x,n) \ - ({ \ - CCTK_REAL8& p__=(p_); \ - CCTK_REAL8& p=p__; \ - _mm_storeh_pd(&p+1,x); \ - /* _mm_clflush(&p+1); */ \ - }) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storel_pd(&p, x); + // _mm_clflush(&p); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + _mm_storeh_pd(&p+1, x); + // _mm_clflush(&p+1); +} # endif #endif #if 0 // This is slower; we would need a non-temporal read -#define vec8_store_nta_partial_lo(p,x,n) \ - vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1)) -#define vec8_store_nta_partial_hi(p,x,n) \ - vec8_store_nta(p, _mm_loadl_pd(x,&(p))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_lo(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + vec8_store_nta(p, _mm_loadh_pd(x, &p+1)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_hi(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const n) +{ + vec8_store_nta(p, _mm_loadl_pd(x, &p)); +} #endif -#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +void vec8_store_nta_partial_mid(CCTK_REAL8& p, + CCTK_REAL8_VEC const x, + ptrdiff_t const nlo, + ptrdiff_t const nhi) +{ + assert(0); +} // Functions and operators -static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }}; - // Operators // #define k8inot(x) (_mm_xor_si128(k8all_mask,x)) @@ -254,176 +389,320 @@ static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }}; // #define k8or(x,y) (_mm_or_pd(x,y)) // #define k8xor(x,y) (_mm_xor_pd(x,y)) -#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x)) - -#define k8add(x,y) (_mm_add_pd(x,y)) -#define k8sub(x,y) (_mm_sub_pd(x,y)) -#define k8mul(x,y) (_mm_mul_pd(x,y)) -#define k8div(x,y) (_mm_div_pd(x,y)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x) +{ + return _mm_xor_pd(k8sign, x); +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_add_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_sub_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_mul_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_div_pd(x, y); +} // Fused multiply-add, defined as [+-]x*y[+-]z #ifdef __FMA4__ -# define k8madd(x,y,z) (_mm_macc_pd(x,y,z)) -# define k8msub(x,y,z) (_mm_msub_pd(x,y,z)) -# define k8nmadd(x,y,z) (_mm_nmsub_pd(x,y,z)) -# define k8nmsub(x,y,z) (_mm_nmacc_pd(x,y,z)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_macc_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_msub_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_nmsub_pd(x, y, z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return _mm_nmacc_pd(x, y, z); +} #else -# define k8madd(x,y,z) (k8add(k8mul(x,y),z)) -# define k8msub(x,y,z) (k8sub(k8mul(x,y),z)) -# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y))) -# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8add(k8mul(x, y), z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8sub(k8mul(x, y), z); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8sub(k8neg(z), k8mul(x, y)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ + return k8sub(z, k8mul(x, y)); +} #endif // Cheap functions -#define k8copysign(x,y) \ - (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \ - _mm_and_pd(k8sign_mask.vf,y))) -#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x)) -#define k8fmax(x,y) (_mm_max_pd(x,y)) -#define k8fmin(x,y) (_mm_min_pd(x,y)) -#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x)) -static const k8const_t k8zero = { f: { 0.0, 0.0, }}; -static const k8const_t k8one = { f: { 1.0, 1.0, }}; -#define k8sgn(x_) \ - ({ \ - CCTK_REAL_VEC const x__=(x_); \ - CCTK_REAL_VEC const x=x__; \ - CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(k8zero.vf, x); \ - CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \ - CCTK_REAL_VEC const signedone = _mm_or_pd(k8one.vf, sign); \ - k8ifthen(iszero, k8zero.vf, signedone); \ - }) -#define k8sqrt(x) (_mm_sqrt_pd(x)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_or_pd(_mm_and_pd(k8notsign, x), + _mm_and_pd(k8sign , y)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x) +{ + return _mm_and_pd(k8notsign, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_max_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_min_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x) +{ + return _mm_or_pd(k8sign, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) +{ + return _mm_sqrt_pd(x); +} // Expensive functions -#define K8REPL(f,x_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const x=x__; \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x))); \ - }) -#define K8REPL2S(f,x_,a_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8 const a__=(a_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8 const a=a__; \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a)); \ - }) -#define K8REPL2(f,x_,y_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y))); \ - }) - -#define k8acos(x) K8REPL(acos,x) -#define k8acosh(x) K8REPL(acosh,x) -#define k8asin(x) K8REPL(asin,x) -#define k8asinh(x) K8REPL(asinh,x) -#define k8atan(x) K8REPL(atan,x) -#define k8atan2(x,y) K8REPL2(atan2,x,y) -#define k8atanh(x) K8REPL(atanh,x) -#define k8cos(x) K8REPL(cos,x) -#define k8cosh(x) K8REPL(cosh,x) -#define k8exp(x) K8REPL(exp,x) -#define k8log(x) K8REPL(log,x) -#define k8pow(x,a) K8REPL2S(pow,x,a) -#define k8sin(x) K8REPL(sin,x) -#define k8sinh(x) K8REPL(sinh,x) -#define k8tan(x) K8REPL(tan,x) -#define k8tanh(x) K8REPL(tanh,x) - -static const k8const_t k8lfalse_ = {{ +0LL, +0LL, }}; -static const k8const_t k8ltrue_ = {{ -1LL, -1LL, }}; -#define k8lfalse (k8lfalse_.vf) -#define k8ltrue (k8ltrue_.vf) -#define k8lnot(x) (_mm_xor_pd(k8ltrue,x)) -#define k8land(x,y) (_mm_and_pd(x,y)) -#define k8lor(x,y) (_mm_or_pd(x,y)) -#define k8lxor(x,y) (_mm_xor_pd(x,y)) - +#define K8REPL(f,x) \ + vec8_set(f(vec8_elt0(x)), \ + f(vec8_elt1(x))); +#define K8REPL2S(f,x,a) \ + vec8_set(f(vec8_elt0(x),a), \ + f(vec8_elt1(x),a)); +#define K8REPL2(f,x,y) \ + vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ + f(vec8_elt1(x),vec8_elt1(y))); + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return K8REPL(acos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return K8REPL(acosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return K8REPL(asin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return K8REPL(asinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return K8REPL(atan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return K8REPL2(atan2,x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return K8REPL(atanh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return K8REPL(cos,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return K8REPL(cosh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return K8REPL(exp,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return K8REPL(log,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return K8REPL2S(pow,x,a); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return K8REPL(sin,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return K8REPL(sinh,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return K8REPL(tan,x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return K8REPL(tanh,x); +} + + + +#define k8lfalse (vec8_set1i( 0)) +#define k8ltrue (vec8_set1i(~0)) +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x) +{ + return _mm_xor_pd(k8ltrue, x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm_and_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm_or_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) +{ + return _mm_xor_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, + CCTK_REAL8_VEC const y, + CCTK_REAL8_VEC const z) +{ #ifdef __SSE4_1__ -# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x)) + return _mm_blendv_pd(z,y,x); #elif 0 -// This is slow (but this is what Intel/PGI produce by themselves) -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - int const m = _mm_movemask_pd(x); \ - CCTK_REAL8_VEC r; \ - switch (m) { \ - case 0: r = y; break; \ - case 1: r = _mm_move_sd(y,z); break; \ - case 2: r = _mm_move_sd(z,y); break; \ - case 3: r = z; break; \ - } \ - r; \ - }) + // This is slow (but this is what Intel/PGI produce by themselves) + int const m = _mm_movemask_pd(x); + switch (m) { + case 0: return y; + case 1: return _mm_move_sd(y,z); + case 2: return _mm_move_sd(z,y); + } + return z; #elif 0 -# ifdef __cplusplus -# define k8signbit(x) ({ using namespace std; signbit(x); }) -# else -# define k8signbit(x) (signbit(x)) -# endif -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ - k8signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \ - }) + return vec8_set(std::signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), + std::signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); #elif 0 -// We don't need to shift -- the condition (mask) will be either all -// zeros or all ones -static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }}; -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \ - __m128i const x_int = *(__m128i const*)&x; \ - __m128i const imask_int = \ - _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \ - CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \ - /* (z & ~mask) | (y & mask) where imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ - }) + // We don't need to shift -- the condition (mask) will be either all + // zeros or all ones + k8const_t const k8ione = { i: { 1, 1, }}; + // there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 + __m128i const x_int = *(__m128i const*)&x; + __m128i const imask_int = _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); + CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; + // (z & ~mask) | (y & mask) where imask = ~mask + return _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); #else -# define k8ifthen(x_,y_,z_) \ - ({ \ - CCTK_REAL8_VEC const x__=(x_); \ - CCTK_REAL8_VEC const y__=(y_); \ - CCTK_REAL8_VEC const z__=(z_); \ - CCTK_REAL8_VEC const x=x__; \ - CCTK_REAL8_VEC const y=y__; \ - CCTK_REAL8_VEC const z=z__; \ - /* (z & ~mask) | (y & mask) where imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \ - }) + // This assumes that all logical operations always return either + // lfalse or ltrue, and nothing "in between" + // (z & ~mask) | (y & mask) where imask = ~mask + return _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); #endif +} + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpeq_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpneq_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpgt_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmpge_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmplt_pd(x, y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_cmple_pd(x, y); +} + + + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) +{ + CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0)); + CCTK_REAL8_VEC const sign = _mm_and_pd(k8sign, x); + CCTK_REAL8_VEC const signedone = _mm_or_pd(sign, vec8_set1(1.0)); + return k8ifthen(iszero, vec8_set1(0.0), signedone); +} -#define k8cmpeq(x,y) (_mm_cmpeq_pd(x,y)) -#define k8cmpne(x,y) (_mm_cmpneq_pd(x,y)) -#define k8cmpgt(x,y) (_mm_cmpgt_pd(x,y)) -#define k8cmpge(x,y) (_mm_cmpge_pd(x,y)) -#define k8cmplt(x,y) (_mm_cmplt_pd(x,y)) -#define k8cmple(x,y) (_mm_cmple_pd(x,y)) +#endif |