diff options
Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r-- | src/vectors-8-SSE2.h | 196 |
1 files changed, 156 insertions, 40 deletions
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index bded69e..b6dcfa6 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -1,3 +1,4 @@ +// -*-C++-*- // Vectorise using Intel's or AMD's SSE2 // Use the type __m128d directly, without introducing a wrapper class @@ -33,6 +34,9 @@ // } #endif +#ifdef __AVX__ +# include <immintrin.h> +#endif #ifdef __FMA4__ # include <x86intrin.h> #endif @@ -59,9 +63,13 @@ // Vector type corresponding to CCTK_REAL +// Note: some boolean masks (e.g. ~0) correspond to nan when +// interpreted as floating point number. gcc 4.8 is clever enough to +// optimize away such constants with fast-math. We therefore need to +// handle this constant as integer number. typedef __m128d CCTK_REAL8_VEC; typedef __m128i CCTK_INTEGER8_VEC; -typedef __m128d CCTK_BOOLEAN8_VEC; +typedef __m128i CCTK_BOOLEAN8_VEC; // Number of vector elements in a CCTK_REAL_VEC #define CCTK_REAL8_VEC_SIZE 2 @@ -70,16 +78,21 @@ vec_static_assert(sizeof(CCTK_REAL8_VEC) == sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE); // Integer and boolean types corresponding to this real type -typedef CCTK_INT8 CCTK_INTEGER8; -typedef CCTK_REAL8 CCTK_BOOLEAN8; +typedef CCTK_INT8 CCTK_INTEGER8; +typedef CCTK_INT8 CCTK_BOOLEAN8; + + + +// These macros are undefined at the end of this file -- use them only +// within functions, not within macros that are exported +#define I2R(x) _mm_castsi128_pd(x) +#define R2I(x) _mm_castpd_si128(x) union k8const_t { CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE]; - CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; CCTK_INTEGER8_VEC vi; - CCTK_REAL8_VEC vf; }; #define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL))) @@ -95,13 +108,13 @@ CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a) return _mm_set1_pd(a); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a) +CCTK_INTEGER8_VEC vec8_set1i(CCTK_INT8 const a) { #if defined(__INTEL_COMPILER) // Intel 11.1 does not support _mm_set1_epi64x - return _mm_set1_pd(*(CCTK_REAL8 const*)&a); + return R2I(_mm_set1_pd(*(CCTK_REAL8 const*)&a)); #else - return _mm_castsi128_pd(_mm_set1_epi64x(a)); + return _mm_set1_epi64x(a); #endif } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -118,19 +131,25 @@ CCTK_REAL8_VEC vec8_swap10(CCTK_REAL8_VEC const x) } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x) +CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[0]; + CCTK_REAL8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x) +CCTK_INTEGER8 vec8_elti(CCTK_INTEGER8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[1]; + CCTK_INTEGER8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE -CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d) +CCTK_BOOLEAN8 vec8_eltb(CCTK_BOOLEAN8_VEC const x, std::ptrdiff_t const d) { - return ((CCTK_REAL8 const*)&x)[d]; + CCTK_BOOLEAN8 e; + std::memcpy(&e, &((char const*)&x)[d*sizeof e], sizeof e); + return e; } @@ -240,6 +259,8 @@ void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x) { + // TODO: requires _mm_sfence() afterwards? requires _mm_lfence() in + // readers afterwards? maybe better just an _mm_mfence() afterwards? _mm_stream_pd(&p, x); } #endif @@ -394,7 +415,7 @@ void vec8_store_nta_partial_mid(CCTK_REAL8& p, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x) { - return _mm_xor_pd(k8sign, x); + return _mm_xor_pd(I2R(k8sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE @@ -483,13 +504,13 @@ CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x, static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_or_pd(_mm_and_pd(k8notsign, x), - _mm_and_pd(k8sign , y)); + return _mm_or_pd(_mm_and_pd(I2R(k8notsign), x), + _mm_and_pd(I2R(k8sign ), y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x) { - return _mm_and_pd(k8notsign, x); + return _mm_and_pd(I2R(k8notsign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) @@ -504,7 +525,7 @@ CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x) { - return _mm_or_pd(k8sign, x); + return _mm_or_pd(I2R(k8sign), x); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) @@ -512,16 +533,104 @@ CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x) return _mm_sqrt_pd(x); } + + // Expensive functions +#if defined __ICC +// The Intel compiler provides intrinsics for these + +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) +{ + return _mm_acos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x) +{ + return _mm_acosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x) +{ + return _mm_asin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x) +{ + return _mm_asinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x) +{ + return _mm_atan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) +{ + return _mm_atan2_pd(x,y); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x) +{ + return _mm_atanh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x) +{ + return _mm_cos_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x) +{ + return _mm_cosh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x) +{ + return _mm_exp_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x) +{ + return _mm_log_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a) +{ + return _mm_pow_pd(x, _mm_set1_pd(a)); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x) +{ + return _mm_sin_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x) +{ + return _mm_sinh_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x) +{ + return _mm_tan_pd(x); +} +static inline CCTK_ATTRIBUTE_ALWAYS_INLINE +CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) +{ + return _mm_tanh_pd(x); +} + +#else + #define K8REPL(f,x) \ - vec8_set(f(vec8_elt0(x)), \ - f(vec8_elt1(x))); + vec8_set(f(vec8_elt(x,0)), \ + f(vec8_elt(x,1))); #define K8REPL2S(f,x,a) \ - vec8_set(f(vec8_elt0(x),a), \ - f(vec8_elt1(x),a)); + vec8_set(f(vec8_elt(x,0),a), \ + f(vec8_elt(x,1),a)); #define K8REPL2(f,x,y) \ - vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ - f(vec8_elt1(x),vec8_elt1(y))); + vec8_set(f(vec8_elt(x,0),vec8_elt(y,0)), \ + f(vec8_elt(x,1),vec8_elt(y,1))); static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x) @@ -604,6 +713,8 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) return K8REPL(tanh,x); } +#endif + #define k8lfalse (vec8_set1i( 0)) @@ -611,22 +722,22 @@ CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x) static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x) { - return _mm_xor_pd(k8ltrue, x); + return R2I(_mm_xor_pd(I2R(k8ltrue), I2R(x))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm_and_pd(x, y); + return R2I(_mm_and_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm_or_pd(x, y); + return R2I(_mm_or_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y) { - return _mm_xor_pd(x, y); + return R2I(_mm_xor_pd(I2R(x), I2R(y))); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, @@ -634,7 +745,7 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, CCTK_REAL8_VEC const z) { #ifdef __SSE4_1__ - return _mm_blendv_pd(z,y,x); + return _mm_blendv_pd(z,y,I2R(x)); #elif 0 // This is slow (but this is what Intel/PGI produce by themselves) int const m = _mm_movemask_pd(x); @@ -645,8 +756,8 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, } return z; #elif 0 - return vec8_set(std::signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), - std::signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); + return vec8_set(std::signbit(vec8_elt(x,0)) ? vec8_elt(y,0) : vec8_elt(z,0), + std::signbit(vec8_elt(x,1)) ? vec8_elt(y,1) : vec8_elt(z,1)); #elif 0 // We don't need to shift -- the condition (mask) will be either all // zeros or all ones @@ -661,39 +772,39 @@ CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x, // This assumes that all logical operations always return either // lfalse or ltrue, and nothing "in between" // (z & ~mask) | (y & mask) where imask = ~mask - return _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); + return _mm_or_pd(_mm_and_pd(I2R(x), y), _mm_andnot_pd(I2R(x), z)); #endif } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpeq_pd(x, y); + return R2I(_mm_cmpeq_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpneq_pd(x, y); + return R2I(_mm_cmpneq_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpgt_pd(x, y); + return R2I(_mm_cmpgt_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmpge_pd(x, y); + return R2I(_mm_cmpge_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmplt_pd(x, y); + return R2I(_mm_cmplt_pd(x, y)); } static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y) { - return _mm_cmple_pd(x, y); + return R2I(_mm_cmple_pd(x, y)); } @@ -702,9 +813,14 @@ static inline CCTK_ATTRIBUTE_ALWAYS_INLINE CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x) { CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0)); - CCTK_REAL8_VEC const sign = _mm_and_pd(k8sign, x); + CCTK_REAL8_VEC const sign = _mm_and_pd(I2R(k8sign), x); CCTK_REAL8_VEC const signedone = _mm_or_pd(sign, vec8_set1(1.0)); return k8ifthen(iszero, vec8_set1(0.0), signedone); } + + +#undef I2R +#undef R2I + #endif |