diff options
author | Ian Hinder <ian.hinder@aei.mpg.de> | 2010-11-22 11:20:48 +0100 |
---|---|---|
committer | Ian Hinder <ian.hinder@aei.mpg.de> | 2010-11-23 00:19:37 +0100 |
commit | d6c4d4c2131107ef3a4004692823e2041394acd6 (patch) | |
tree | bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19 /Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh | |
parent | c9431558d69ee1b8769c918044a45bdd50520b46 (diff) |
Implement vectorisation
This is Erik's vectorisation working tree from 13-Oct-2010
Diffstat (limited to 'Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh')
-rw-r--r-- | Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh new file mode 100644 index 0000000..4a4eea6 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh @@ -0,0 +1,194 @@ +// Vectorise using Intel's or AMD's SSE2 + + + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef __m128d V; + V v; + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +// DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; _mm_store_sd(&a,x); a; })) +// DEFINE_FUNCTION_V_R(vec_elt0,(*(CCTK_REAL const*)&x)) +// This generates the fastest code with PGI compilers +DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; asm ("" : "=x" (a) : "0" (x)); a; })) +#else +DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) // this is a no-op +#endif +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_unpackhi_pd(x,x))) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +#define vec_loadu_maybe(off,p) \ + (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p))) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \ + (off2)&(CCTK_REAL_VEC_SIZE-1), \ + (off3)&(CCTK_REAL_VEC_SIZE-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_storeu,_mm_storeu_pd(&p,x)) +#if defined(KRANC_CACHE) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) +#else +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_store_pd(&p,x)) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: _mm_storel_pd(&p,x); break; + default: assert(0); + } +} +static inline +void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: _mm_storeh_pd((&p)+1,x); break; + default: assert(0); + } +} + + + +// Functions and operators + +// Single-argument operators +#if 0 +DEFINE_FUNCTION_V_V(operator+,x) +static CCTK_REAL_VEC const vec_neg_mask = + (CCTK_REAL_VEC::V)(__m128i) { 0x8000000000000000ULL, 0x8000000000000000ULL }; +DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask)) +#endif +DEFINE_FUNCTION_V_V(operator+,+x.v) +DEFINE_FUNCTION_V_V(operator-,-x.v) + +// Double-argument operators, both vectors +#if 0 +DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) +#endif +DEFINE_FUNCTION_VV_V(operator+,x.v+y.v) +DEFINE_FUNCTION_VV_V(operator-,x.v-y.v) +DEFINE_FUNCTION_VV_V(operator*,x.v*y.v) +DEFINE_FUNCTION_VV_V(operator/,x.v/y.v) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Cheap functions +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + CCTK_REAL_VEC::S s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC::V v; +} vec_fabs_mask_impl = { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }; +# define vec_fabs_mask (vec_fabs_mask_impl.v) +#else +static CCTK_REAL_VEC const vec_fabs_mask = + (CCTK_REAL_VEC::V)(__m128i) { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }; +#endif +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) vec_set1(x) + +#if defined(__PGI) && defined (__amd64__) +// Special case for PGI 9.0.4 to avoid an internal compiler error +#undef IfThen +static inline +CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const x, CCTK_REAL_VEC const y) +{ + union { + __m128i vi; + CCTK_REAL_VEC::V v; + } mask; + mask.vi = _mm_set1_epi64x(-(long long)cond); + return _mm_or_pd(_mm_and_pd(x.v, mask.v), _mm_andnot_pd(mask.v, y.v)); +} +#endif |