diff options
author | Ian Hinder <ian.hinder@aei.mpg.de> | 2010-11-22 11:20:48 +0100 |
---|---|---|
committer | Ian Hinder <ian.hinder@aei.mpg.de> | 2010-11-23 00:19:37 +0100 |
commit | d6c4d4c2131107ef3a4004692823e2041394acd6 (patch) | |
tree | bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19 /Auxiliary/Cactus/KrancNumericalTools | |
parent | c9431558d69ee1b8769c918044a45bdd50520b46 (diff) |
Implement vectorisation
This is Erik's vectorisation working tree from 13-Oct-2010
Diffstat (limited to 'Auxiliary/Cactus/KrancNumericalTools')
11 files changed, 1535 insertions, 226 deletions
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h index 29c89d1..ee6a3b7 100644 --- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h @@ -1,17 +1,17 @@ -#define Power(x, y) (pow((x), (y))) +#define Power(x, y) (pow(x,y)) #define Sqrt(x) (sqrt(x)) #ifdef KRANC_C -#define Abs(x) (fabs(x)) -#define Min(x, y) (fmin((x), (y))) -#define Max(x, y) (fmax((x), (y))) -#define IfThen(x,y,z) ((x) ? (y) : (z)) +# define Abs(x) (fabs(x)) +# define Min(x, y) (fmin(x,y)) +# define Max(x, y) (fmax(x,y)) +# define IfThen(x,y,z) ((x) ? (y) : (z)) #else -#define Abs(x) (abs(x)) -#define Min(x, y) (min((x), (y))) -#define Max(x, y) (max((x), (y))) +# define Abs(x) (abs(x)) +# define Min(x, y) (min(x,y)) +# define Max(x, y) (max(x,y)) /* IfThen cannot be expressed in Fortran */ #endif @@ -31,17 +31,46 @@ #define Tanh(x) (tanh(x)) #ifdef KRANC_C -#define Sign(x) (signbit(x)?-1:+1) +# define Sign(x) ((x)<0?-1:+1) +# define ToReal(x) ((CCTK_REAL)(x)) #else -#define Sign(x) (sgn(x)) +# define Sign(x) (sgn(x)) +# define ToReal(x) (real((x),kind(khalf))) #endif +/* TODO: use fma(x,y,z) to implement fmadd and friends? Note that fma + may be unsupported, or may be slow. */ + +/* #define fmadd(x,y,z) ((x)*(y)+(z)) */ +/* #define fmsub(x,y,z) ((x)*(y)-(z)) */ +/* #define fnmadd(x,y,z) (-(z)-(x)*(y)) */ +/* #define fnmsub(x,y,z) (+(z)-(x)*(y)) */ + +#define fneg(x) (-(x)) +#define fmul(x,y) ((x)*(y)) +#define fdiv(x,y) ((x)/(y)) +#define fadd(x,y) ((x)+(y)) +#define fsub(x,y) ((x)-(y)) + +#define fmadd(x,y,z) (fadd(fmul(x,y),z)) +#define fmsub(x,y,z) (fsub(fmul(x,y),z)) +#define fnmadd(x,y,z) (fsub(fneg(z),fmul(x,y))) +#define fnmsub(x,y,z) (fsub(z,fmul(x,y))) + +#define kexp(x) (exp(x)) +#define kfabs(x) (fabs(x)) +#define kfmax(x,y) (fmax(x,y)) +#define kfmin(x,y) (fmin(x,y)) +#define klog(x) (log(x)) +#define kpow(x,y) (pow(x,y)) +#define ksqrt(x) (sqrt(x)) + #ifdef KRANC_C -#define E M_E -#define Pi M_PI +# define E M_E +# define Pi M_PI #else -#define E 2.71828182845904523536029d0 -#define Pi 3.14159265358979323846264d0 +# define E 2.71828182845904523536029d0 +# define Pi 3.14159265358979323846264d0 #endif -#define UnitStep(x) ( (x) > 0 ? 1 : 0 ) +#define UnitStep(x) ((x)>0) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh new file mode 100644 index 0000000..12cd6e8 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh @@ -0,0 +1,135 @@ +// Vectorise using Intel's or AMD's SSE2 + +// Use the type __m128d directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +typedef __m128d CCTK_REAL_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +#define vec_set1(a) (_mm_set1_pd(a)) +#define vec_set(a,b) (_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +// # define vec_elt0(x) (*(CCTK_REAL const*)&(x)) +# define vec_elt0(x) ({ CCTK_REAL a_elt0; asm ("" : "=x" (a_elt0) : "0" (x)); a_elt0; }) +#else +// this is a no-op +# define vec_elt0(x) (_mm_cvtsd_f64(x)) +#endif +#define vec_elt1(x_) ({ CCTK_REAL_VEC const x_elt1=(x_); vec_elt0(_mm_unpackhi_pd(x_elt1,x_elt1)); }) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec_load(p) (_mm_load_pd(&(p))) +#define vec_loadu(p) (_mm_loadu_pd(&(p))) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: Always use unaligned load +#define vec_loadu_maybe(off,p) (vec_loadu(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) (vec_loadu(p)) +#if 0 +#define vec_loadu_maybe(off,p) \ + (!((off)&(CCTK_REAL_VEC_SIZE-1)) ? \ + vec_load(p) : vec_loadu(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (!((off1)&(CCTK_REAL_VEC_SIZE-1)) && \ + !((off2)&(CCTK_REAL_VEC_SIZE-1)) && \ + !((off3)&(CCTK_REAL_VEC_SIZE-1)) ? \ + vec_load(p) : vec_loadu(p)) +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec_store(p,x) (_mm_store_pd(&(p),x)) +#define vec_storeu(p,x) (_mm_storeu_pd(&(p),x)) +#if defined(KRANC_CACHE) +# define vec_store_nta(p,x) (_mm_stream_pd(&(p),x)) +#else +# define vec_store_nta(p,x) (_mm_store_pd(&(p),x)) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) +#define vec_store_nta_partial_hi(p,x,n) (_mm_storeh_pd((&(p))+1,x)) + + + +// Functions and operators + +// Operators +#undef fneg +#undef fmul +#undef fdiv +#undef fadd +#undef fsub +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + unsigned long long s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC v; +} vec_neg_mask_impl = {0x8000000000000000ULL, 0x8000000000000000ULL}; +# define vec_neg_mask (vec_neg_mask_impl.v) +#else +# define vec_neg_mask ((CCTK_REAL_VEC)(__m128i){0x8000000000000000ULL, 0x8000000000000000ULL}) +#endif +#define fneg(x) (_mm_xor_pd(x,vec_neg_mask)) +#define fmul(x,y) (_mm_mul_pd(x,y)) +#define fdiv(x,y) (_mm_div_pd(x,y)) +#define fadd(x,y) (_mm_add_pd(x,y)) +#define fsub(x,y) (_mm_sub_pd(x,y)) + +// Cheap functions +#undef kfabs +#undef kfmax +#undef kfmin +#undef ksqrt +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + unsigned long long s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC v; +} vec_fabs_mask_impl = {0x7fffffffffffffffULL, 0x7fffffffffffffffULL}; +# define vec_fabs_mask (vec_fabs_mask_impl.v) +#else +# define vec_fabs_mask ((CCTK_REAL_VEC)(__m128i){0x7fffffffffffffffULL, 0x7fffffffffffffffULL}) +#endif +#define kfabs(x) (_mm_and_pd(x,vec_fabs_mask)) +#define kfmax(x,y) (_mm_max_pd(x,y)) +#define kfmin(x,y) (_mm_min_pd(x,y)) +#define ksqrt(x) (_mm_sqrt_pd(x)) + +// Expensive functions +#undef kexp +#undef klog +#undef kpow +#define kexp(x_) ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); }) +#define klog(x_) ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); }) +#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); }) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) (vec_set1(x)) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh new file mode 100644 index 0000000..4a4eea6 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh @@ -0,0 +1,194 @@ +// Vectorise using Intel's or AMD's SSE2 + + + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef __m128d V; + V v; + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +// DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; _mm_store_sd(&a,x); a; })) +// DEFINE_FUNCTION_V_R(vec_elt0,(*(CCTK_REAL const*)&x)) +// This generates the fastest code with PGI compilers +DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; asm ("" : "=x" (a) : "0" (x)); a; })) +#else +DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) // this is a no-op +#endif +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_unpackhi_pd(x,x))) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +#define vec_loadu_maybe(off,p) \ + (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p))) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \ + (off2)&(CCTK_REAL_VEC_SIZE-1), \ + (off3)&(CCTK_REAL_VEC_SIZE-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_storeu,_mm_storeu_pd(&p,x)) +#if defined(KRANC_CACHE) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) +#else +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_store_pd(&p,x)) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: _mm_storel_pd(&p,x); break; + default: assert(0); + } +} +static inline +void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: _mm_storeh_pd((&p)+1,x); break; + default: assert(0); + } +} + + + +// Functions and operators + +// Single-argument operators +#if 0 +DEFINE_FUNCTION_V_V(operator+,x) +static CCTK_REAL_VEC const vec_neg_mask = + (CCTK_REAL_VEC::V)(__m128i) { 0x8000000000000000ULL, 0x8000000000000000ULL }; +DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask)) +#endif +DEFINE_FUNCTION_V_V(operator+,+x.v) +DEFINE_FUNCTION_V_V(operator-,-x.v) + +// Double-argument operators, both vectors +#if 0 +DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) +#endif +DEFINE_FUNCTION_VV_V(operator+,x.v+y.v) +DEFINE_FUNCTION_VV_V(operator-,x.v-y.v) +DEFINE_FUNCTION_VV_V(operator*,x.v*y.v) +DEFINE_FUNCTION_VV_V(operator/,x.v/y.v) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Cheap functions +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + CCTK_REAL_VEC::S s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC::V v; +} vec_fabs_mask_impl = { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }; +# define vec_fabs_mask (vec_fabs_mask_impl.v) +#else +static CCTK_REAL_VEC const vec_fabs_mask = + (CCTK_REAL_VEC::V)(__m128i) { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }; +#endif +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) vec_set1(x) + +#if defined(__PGI) && defined (__amd64__) +// Special case for PGI 9.0.4 to avoid an internal compiler error +#undef IfThen +static inline +CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const x, CCTK_REAL_VEC const y) +{ + union { + __m128i vi; + CCTK_REAL_VEC::V v; + } mask; + mask.vi = _mm_set1_epi64x(-(long long)cond); + return _mm_or_pd(_mm_and_pd(x.v, mask.v), _mm_andnot_pd(mask.v, y.v)); +} +#endif diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh new file mode 100644 index 0000000..7e06017 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh @@ -0,0 +1,111 @@ +// Vectorise using IBM's Altivec + +// Use the type vector double directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include <altivec.h> + +// Vector type corresponding to CCTK_REAL +typedef vector double CCTK_REAL_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +#define vec_set1(a) (vec_splats(a)) +#if defined(__GNUC__) +// GNU doesn't support array indices on vectors +union vec_mask { + double elts[2]; + vector double v; +}; +# define vec_set(a,b) ({ vec_mask x_set; x_set.elts[0]=(a); x_set.elts[1]=(b); x_set.v; }) +#else +# define vec_set(a,b) ({ CCTK_REAL_VEC x_set; x_set[0]=(a); x_set[1]=(b); x_set; }) +#endif + +// Get a scalar from the vector +#if defined(__GNUC__) +// GNU doesn't support array indices on vectors +# define vec_elt0(x) ({ vec_mask x_elt0; x_elt0.v=(x); x_elt0.elts[0]; }) +# define vec_elt1(x) ({ vec_mask x_elt1; x_elt1.v=(x); x_elt1.elts[1]; }) +#else +# define vec_elt0(x) ((x)[0]) +# define vec_elt1(x) ((x)[1]) +#endif + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec_load(p) (*(CCTK_REAL_VEC const*)&(p)) +#define vec_loadu(p) (vec_load(p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#define vec_loadu_maybe(off,p) (vec_load(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec_store(p,x) (*(CCTK_REAL_VEC*)&(p)=(x)) +#define vec_storeu(p,x) (*(CCTK_REAL_VEC*)&(p)=(x)) +// TODO: Use stvxl instruction? +#define vec_store_nta(p,x) vec_store(p,x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec_store_nta_partial_lo(p,x,n) ((p)=vec_elt0(x)) +#define vec_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec_elt1(x)) + + + +// Functions and operators + +// Other Altivec functions are: +// nabs: -abs a +// madd msub nmadd nmsub: [+-]a*b[+-]c + +// Triple-argument operators, all vectors +#undef fmadd +#undef fmsub +#undef fnmadd +#undef fnmsub +#define fmadd(x,y,z) (vec_madd(x,y,z)) +#define fmsub(x,y,z) (vec_msub(x,y,z)) +#define fnmadd(x,y,z) (vec_nmadd(x,y,z)) +#define fnmsub(x,y,z) (vec_nmsub(x,y,z)) + +// Cheap functions +#undef kfabs +#undef kfmax +#undef kfmin +#define kfabs(x) (vec_abs(x)) +#define kfmax(x,y) (vec_max(x,y)) +#define kfmin(x,y) (vec_min(x,y)) + +// Expensive functions +#undef kexp +#undef klog +#undef kpow +#undef ksqrt +#define kexp(x_) ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); }) +#define klog(x_) ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); }) +#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); }) +#define ksqrt(x_) ({ CCTK_REAL_VEC const x_sqrt=(x_); vec_set(sqrt(vec_elt0(x_sqrt)),sqrt(vec_elt1(x_sqrt))); }) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) (vec_set1((CCTK_REAL)(x))) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh new file mode 100644 index 0000000..f591647 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh @@ -0,0 +1,212 @@ +// Vectorise using IBM's Altivec + + + +#include <altivec.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef vector double V; + V v; + union vec_mask { + S elts[2]; + V v; + }; + + // Set a vector from scalars +#if 0 + // IBM + inline CCTK_REAL_VEC(S const a, S const b) { v[0]=a; v[1]=b; } +#endif +#if 0 + inline CCTK_REAL_VEC(S const a, S const b): + v(vec_mergel(vec_splats(a), vec_splats(b))) { } +#endif + inline CCTK_REAL_VEC(S const a, S const b) + { + vec_mask x; + x.elts[0] = a; + x.elts[1] = b; + v = x.v; + } + + // Set a vector from a scalar, replicating the scalar + // Note: Could also use vec_xlds instead + inline CCTK_REAL_VEC(S const a): v(vec_splats(a)) { } + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements +DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a)) +DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b)) + +// Get a scalar from the vector +#if 0 +// IBM +DEFINE_FUNCTION_V_R(vec_elt0,x.v[0]) +DEFINE_FUNCTION_V_R(vec_elt1,x.v[1]) +#endif +static inline CCTK_REAL vec_elt0(CCTK_REAL_VEC const x) +{ + CCTK_REAL_VEC::vec_mask x1; + x1.v = x; + return x1.elts[0]; +} +static inline CCTK_REAL vec_elt1(CCTK_REAL_VEC const x) +{ + CCTK_REAL_VEC::vec_mask x1; + x1.v = x; + return x1.elts[1]; +} + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,p) +#if 0 +// IBM +DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p))) +#endif +DEFINE_FUNCTION_PR_V(vec_loadu,p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +#define vec_loadu_maybe(off,p) \ + (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p))) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \ + (off2)&(CCTK_REAL_VEC_SIZE-1), \ + (off3)&(CCTK_REAL_VEC_SIZE-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x) +DEFINE_FUNCTION_PRV(vec_storeu,*(CCTK_REAL_VEC::V*)&p=x) +// TODO: Use stvxl instruction? +DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: p=vec_elt0(x); break; + default: assert(0); + } +} +static inline +void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: (&p)[1]=vec_elt1(x); break; + default: assert(0); + } +} + + + +// Functions and operators + +// Other Altivec functions are: +// nabs: -abs a +// madd msub nmadd nmsub: [+-]a*b[+-]c + +// Single-argument operators +#if 0 +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,vec_neg(x)) +#endif +DEFINE_FUNCTION_V_V(operator+,+x.v) +DEFINE_FUNCTION_V_V(operator-,-x.v) + +// Double-argument operators, both vectors +#if 0 +DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y)) +DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y)) +DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y)) +DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y)) +#endif +DEFINE_FUNCTION_VV_V(operator+,x.v+y.v) +DEFINE_FUNCTION_VV_V(operator-,x.v-y.v) +DEFINE_FUNCTION_VV_V(operator*,x.v*y.v) +DEFINE_FUNCTION_VV_V(operator/,x.v/y.v) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Triple-argument operators, all vectors +#undef fmadd +#undef fmsub +#undef fnmadd +#undef fnmsub +DEFINE_FUNCTION_VVV_V(fmadd,vec_madd(x.v,y.v,z.v)) +DEFINE_FUNCTION_VVV_V(fmsub,vec_msub(x.v,y.v,z.v)) +DEFINE_FUNCTION_VVV_V(fnmadd,vec_nmadd(x.v,y.v,z.v)) +DEFINE_FUNCTION_VVV_V(fnmsub,vec_nmsub(x.v,y.v,z.v)) + +// Cheap functions +DEFINE_FUNCTION_V_V(fabs,vec_abs(x.v)) +DEFINE_FUNCTION_VV_V(fmax,vec_max(x.v,y.v)) +DEFINE_FUNCTION_VV_V(fmin,vec_min(x.v,y.v)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) +DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x)))) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) (vec_set1(x)) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh new file mode 100644 index 0000000..f928ed8 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh @@ -0,0 +1,31 @@ +// Fallback vectorisation implementation: Do not vectorise + + + +// Use CCTK_REAL +typedef CCTK_REAL CCTK_REAL_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +static int const CCTK_REAL_VEC_SIZE = 1; + + + +// We use macros here, so that we are not surprised by compilers which +// don't like to inline functions (e.g. PGI). This should also make +// debug builds (which may not inline) more efficient. + +#define vec_load(p) (p) +#define vec_loadu(p) (p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#define vec_loadu_maybe(off,p) (p) +#define vec_loadu_maybe3(off1,off2,off3,p) (p) + +#define vec_store(p,x) ((p)=(x)) +#define vec_store_nta(p,x) ((p)=(x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec_store_nta_partial_lo(p,x,n) (assert(0)) +#define vec_store_nta_partial_hi(p,x,n) (assert(0)) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh new file mode 100644 index 0000000..f5c0b22 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh @@ -0,0 +1,104 @@ +// Define some macros that simplify defining short function that are +// supposed to be inlined + + + +// Letters defining the prototype (argument and return value types): +// I: i,j: integer +// R: a,b: real +// V: x,y: vector (of real) +// P: p,q: pointer (i.e. const reference) to something +// L: l,m: L-value (i.e. non-const reference) to something + + + +// Load and store + +#define DEFINE_FUNCTION_PR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& p) \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_PRV(name,expr) \ +static inline \ +void name (CCTK_REAL& p, CCTK_REAL_VEC const x) \ +{ \ + expr; \ +} + +#define DEFINE_FUNCTION_PVR(name,expr) \ +static inline \ +void name (CCTK_REAL_VEC& p, CCTK_REAL const a) \ +{ \ + expr; \ +} + + + +// Functions and operators + +#define DEFINE_FUNCTION_V_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_V_R(name,expr) \ +static inline \ +CCTK_REAL name (CCTK_REAL_VEC const x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_R_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL const a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL_VEC const x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL const b) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VVV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y, CCTK_REAL_VEC const z) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh new file mode 100644 index 0000000..df83b3a --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh @@ -0,0 +1,591 @@ +#ifndef VECTORS_HH +#define VECTORS_HH + + + +// Vectorisation + +#include <assert.h> +#include <math.h> +#include <stdlib.h> + +#include <cctk.h> + + + +// I: i,j: integer +// R: a,b: real +// V: x,y: vector (of real) +// P: p,q: pointer (i.e. const reference) to something +// L: l,m: L-value (i.e. non-const reference) to something + +#define DEFINE_FUNCTION_PR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& p) \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_PRV(name,expr) \ +static inline \ +void name (CCTK_REAL& p, CCTK_REAL_VEC const& x) \ +{ \ + expr; \ +} + +#define DEFINE_FUNCTION_PVR(name,expr) \ +static inline \ +void name (CCTK_REAL_VEC& p, CCTK_REAL const& a) \ +{ \ + expr; \ +} + +#define DEFINE_FUNCTION_V_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const& x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_V_R(name,expr) \ +static inline \ +CCTK_REAL name (CCTK_REAL_VEC const& x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_R_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + + + +// Intel, double +#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8) + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef __m128d V; + V v; + + // Set a vector from scalars + inline CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(b,a)) { } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { } + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const& v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +union vec_mask { + unsigned long long bits[2]; + CCTK_REAL_VEC::V v; +}; + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +static inline +CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x) +{ + CCTK_REAL a; _mm_store_sd(&a,x); return a; +} +#else +DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) //this is a no-op +#endif + +#if 0 +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1)))) +#endif +static inline +CCTK_REAL vec_elt1 (CCTK_REAL_VEC const& x) +{ + CCTK_REAL a; _mm_storeh_pd(&a,x); return a; +} + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +#if 0 +// Load a partial vector (duplicating the last loaded element to fill +// the remaining elements) +// TODO: Should this be aligned or unaligned? +static inline +CCTK_REAL_VEC vec_load_partial (CCTK_REAL const& p, int const n) +{ + switch (n) { + case 1: return _mm_load1_pd(p); + default: assert(0); + } +} +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +template<int off> +static inline +CCTK_REAL_VEC vec_loadu_maybe (CCTK_REAL const& p) +{ + return vec_loadu_maybe_impl<off&(2-1)>(p); +} +template<int off1,int off2, int off3> +static inline +CCTK_REAL_VEC vec_loadu_maybe3 (CCTK_REAL const& p) +{ + return vec_loadu_maybe_impl3<off1&(2-1),off2&(2-1),off3&(2-1)>(p); +} + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: _mm_storel_pd(&p,x); break; + default: assert(0); + } +} +static inline +void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: _mm_storeh_pd((&p)+1,x); break; + default: assert(0); + } +} + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +#if 0 +DEFINE_FUNCTION_V_V(operator-,vec_set(0.0,0.0)-x) +#endif +static vec_mask const vec_neg_mask = +{ { 0x8000000000000000ULL, 0x8000000000000000ULL } }; +DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask.v)) + +// Cheap functions +static vec_mask const vec_fabs_mask = +{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } }; +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + +// Special case for PGI to avoid internal compiler error +#if defined(__PGI) && defined (__amd64__) +#undef IfThen +CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const& x, CCTK_REAL_VEC co\ +nst& y) +{ + return cond*x + (not cond)*y; +} +#endif + + + +#if 0 +// Try to use the __m128d type directly. + +// This does not really work, because it is not possible to define +// automatic conversion operators from double to __m128d, so that +// explicit conversions are required. This makes the code look more +// clumsy. + +// Vector type corresponding to CCTK_REAL +typedef __m128d CCTK_REAL_VEC; + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +static inline +CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x) +{ +#if 0 + // _mm_cvtsd_f64 does not exist on PGI compilers + return _mm_cvtsd_f64(x); // this is a no-op +#endif + CCTK_REAL a; _mm_store_sd(&a,x); return a; +} + +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1)))) + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) + +// Cheap functions +static vec_mask const vec_fabs_mask = +{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } }; +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + +#endif + + + +// Intel, float +#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4) + +#include <xmmintrin.h> + +// A vector type corresponding to CCTK_REAL +typedef __m128 CCTK_REAL_VEC; + + + +// Power, double +#elif defined(KRANC_VECTORS) && defined(__ALTIVEC__) && defined(_ARCH_PWR7) && defined(CCTK_REAL_PRECISION_8) + +#include <altivec.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef vector double V; + V v; + + // vec_insert, vec_extract, vec_splat + + // Set a vector from scalars + inline CCTK_REAL_VEC(S const& a, S const& b) { v[0]=a; v[1]=b; } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(S const& a): v(vec_splats(a)) { } + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const& v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a)) +DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b)) + +// Get a scalar from the vector +DEFINE_FUNCTION_V_R(vec_elt0,x.v[0]) +DEFINE_FUNCTION_V_R(vec_elt1,x.v[1]) + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,p) +DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p))) +// vec_xlds + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +#define static +template<> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl<0>,vec_load(p)) +#undef static +// Call the implementation with the modulus +template<int off> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe,vec_loadu_maybe_impl<off&(2-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: p=x.v[0]; break; + default: assert(0); + } +} +static inline +void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: (&p)[1]=x.v[1]; break; + default: assert(0); + } +} + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y)) +DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y)) +DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y)) +DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y)) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,vec_neg(x)) + +// Cheap functions +DEFINE_FUNCTION_V_V(fabs,vec_abs(x)) +DEFINE_FUNCTION_VV_V(fmax,vec_max(x,y)) +DEFINE_FUNCTION_VV_V(fmin,vec_min(x,y)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) +DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x)))) + + + +// Fallback: pseudo-vectorisation +#elif 0 + +// There is no vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + CCTK_REAL v, w; + + // Set a vector from scalars + inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x.v), w(x.w) { } +}; + + + +DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p) +DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p)) +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +template<int off> +DEFINE_FUNCTION_PR_V(vec_loadm,vec_load(p)) + +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x)) + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v+y.v,x.w+y.w)) +DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v-y.v,x.w-y.w)) +DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v*y.v,x.w*y.w)) +DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v/y.v,x.w/y.w)) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v+a,x.w+a)) +DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v-a,x.w-a)) +DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v*a,x.w*a)) +DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v/a,x.w/a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v,a+x.w)) +DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v,a-x.w)) +DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v,a*x.w)) +DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v,a/x.w)) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v,-x.w)) + +// Cheap functions +DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v),fabs(x.w))) +DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v,y.v),fmax(x.w,y.w))) +DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v,y.v),fmin(x.w,y.w))) +DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v),sqrt(x.w))) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v),exp(x.w))) +DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v),log(x.w))) +DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v,a),pow(x.w,a))) + + + +// Fallback: no vectorisation +#else + +// There is no vector type corresponding to CCTK_REAL +typedef CCTK_REAL CCTK_REAL_VEC; + + + +DEFINE_FUNCTION_PR_V(vec_load,p) +DEFINE_FUNCTION_PR_V(vec_loadu,p) +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int off> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe,p) +template<int off1,int off2, int off3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe3,p) + +DEFINE_FUNCTION_PRV(vec_store,p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,p=x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + assert(0); +} +static inline +void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + assert(0); +} + + + +#endif + + + +#undef DEFINE_FUNCTION_PR_V +#undef DEFINE_FUNCTION_PRV +#undef DEFINE_FUNCTION_V_V +#undef DEFINE_FUNCTION_R_V +#undef DEFINE_FUNCTION_VV_V +#undef DEFINE_FUNCTION_VR_V +#undef DEFINE_FUNCTION_RV_V +#undef DEFINE_FUNCTION_RR_V + + + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +#endif // #ifndef VECTORS_HH diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh new file mode 100644 index 0000000..f439c9b --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh @@ -0,0 +1,72 @@ +// Pseudo vectorisation using scalar operations + + + +// Number of vector elements in a CCTK_REAL_VEC +static int const CCTK_REAL_VEC_SIZE = 2; + +// There is no vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + CCTK_REAL v[CCTK_REAL_VEC_SIZE]; + + // Set a vector from scalars + inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x) { v[0]=x.v[0]; v[1]=x.v[1]; } +}; + + + +// Load and store vectors + +DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p) +DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p)) +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#define vec_loadu_maybe(off,p) (vec_load(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p)) + +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x)) + + + +// Functions and operators + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v[0]+y.v[0],x.v[1]+y.v[1])) +DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v[0]-y.v[0],x.v[1]-y.v[1])) +DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v[0]*y.v[0],x.v[1]*y.v[1])) +DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v[0]/y.v[0],x.v[1]/y.v[1])) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v[0]+a,x.v[1]+a)) +DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v[0]-a,x.v[1]-a)) +DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v[0]*a,x.v[1]*a)) +DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v[0]/a,x.v[1]/a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v[0],a+x.v[1])) +DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v[0],a-x.v[1])) +DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v[0],a*x.v[1])) +DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v[0],a/x.v[1])) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v[0],-x.v[1])) + +// Functions +DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v[0]),exp(x.v[1]))) +DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v[0]),fabs(x.v[1]))) +DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v[0],y.v[0]),fmax(x.v[1],y.v[1]))) +DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v[0],y.v[0]),fmin(x.v[1],y.v[1]))) +DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v[0]),log(x.v[1]))) +DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v[0],a),pow(x.v[1],a))) +DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v[0]),sqrt(x.v[1]))) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh new file mode 100644 index 0000000..0d950c7 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh @@ -0,0 +1,14 @@ +// Undefine all macros defined in "Vectors-define.hh", so that we +// leave a clean namespace + + + +#undef DEFINE_FUNCTION_PR_V +#undef DEFINE_FUNCTION_PRV +#undef DEFINE_FUNCTION_V_V +#undef DEFINE_FUNCTION_R_V +#undef DEFINE_FUNCTION_VV_V +#undef DEFINE_FUNCTION_VR_V +#undef DEFINE_FUNCTION_RV_V +#undef DEFINE_FUNCTION_RR_V +#undef DEFINE_FUNCTION_VVV_V diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh index 3fb77e1..d32afb2 100644 --- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh @@ -5,231 +5,47 @@ // Vectorisation +#include <assert.h> #include <math.h> #include <stdlib.h> -#include <cctk.h> - - - -// I: i,j: integer -// R: a,b: real -// V: x,y: vector (of real) -// P: p,q: pointer (i.e. const reference) to something -// L: l,m: L-value (i.e. non-const reference) to something - -#define DEFINE_FUNCTION_PR_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& p) \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_PRV(name,expr) \ -inline \ -void name (CCTK_REAL& p, CCTK_REAL_VEC const& x) \ -{ \ - expr; \ -} - -#define DEFINE_FUNCTION_PVR(name,expr) \ -inline \ -void name (CCTK_REAL_VEC& p, CCTK_REAL const& a) \ -{ \ - expr; \ -} - -#define DEFINE_FUNCTION_V_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL_VEC const& x) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return CCTK_REAL_VEC(expr); \ -} - -#define DEFINE_FUNCTION_V_R(name,expr) \ -inline \ -CCTK_REAL name (CCTK_REAL_VEC const& x) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_R_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& a) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_VV_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_VR_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_RV_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_RR_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - - - -// Intel, double -#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8) - -#include <emmintrin.h> - -// Vector type corresponding to CCTK_REAL -struct CCTK_REAL_VEC { - // The underlying scalar and vector types - typedef double S; - typedef __m128d V; - static int const n = sizeof(V)/sizeof(S); - V v; - - // Set a vector from scalars - CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(a,b)) { }; - - // Get a scalar from the vector - S elt0() const { return _mm_cvtsd_f64(v); /* this is a no-op */ } - S elt1() const { return _mm_cvtsd_f64(_mm_shuffle_pd(v,v,_MM_SHUFFLE2(1,1))); } - - // Set a vector from a scalar, replicating the scalar - CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { }; - - // Convert from and to the underlying vector type - CCTK_REAL_VEC(V const& v_): v(v_) { }; - operator V const() const { return v; } - - CCTK_REAL_VEC() { }; - - // Copy constructor - CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }; -}; - -// Load a vector from memory (aligned and unaligned); this loads from -// a reference to a scalar -DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)); -DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)); - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) -DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) - -// Double-argument operators, both vectors -DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) -DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) -DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) -DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) - -// Double-argument operators, vector and scalar -DEFINE_FUNCTION_VR_V(operator+,x+CCTK_REAL_VEC(a)) -DEFINE_FUNCTION_VR_V(operator-,x-CCTK_REAL_VEC(a)) -DEFINE_FUNCTION_VR_V(operator*,x*CCTK_REAL_VEC(a)) -DEFINE_FUNCTION_VR_V(operator/,x/CCTK_REAL_VEC(a)) - -// Double-argument operators, scalar and vector -DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a)+x) -DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a)-x) -DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a)*x) -DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a)/x) - -// Single-argument operators -DEFINE_FUNCTION_V_V(operator+,x) -DEFINE_FUNCTION_V_V(operator-,0.0-x) - -// Cheap functions -static union { - unsigned long long const bits[2]; - CCTK_REAL_VEC::V v; -} const fabs_mask = - { { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } }; -DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,fabs_mask.v)) -DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) -DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) -DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) - -// Expensive functions -DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.elt0()),exp(x.elt1()))) -DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.elt0()),log(x.elt1()))) -DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.elt0(),a),pow(x.elt1(),a))) - -// Un-implemented functions -DEFINE_FUNCTION_V_R(signbit,0) - - - -#if 0 -// Intel, float -#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4) - -#include <xmmintrin.h> - -// A vector type corresponding to CCTK_REAL -typedef __m128 CCTK_REAL_VEC; -#endif - +#include <cctk.h> -// Fallback: no vectorisation -#else - -// There is no vector type corresponding to CCTK_REAL -typedef CCTK_REAL CCTK_REAL_VEC; +#include "Vectors-define.hh" -DEFINE_FUNCTION_PR_V(vec_load,p) -DEFINE_FUNCTION_PR_V(vec_loadu,p) +#if defined(KRANC_VECTORS) +// Vectorise -DEFINE_FUNCTION_PRV(vec_store,p=x) -DEFINE_FUNCTION_PRV(vec_store_nta,p=x) +# if ! defined(CCTK_REAL_PRECISION_8) +# error "Vectorisation is currently only supported for double precision" +# endif -DEFINE_FUNCTION_V_R(signbit,x<0) +# if defined(__SSE2__) // SSE2 (Intel) +# if defined(KRANC_DIRECT) +# include "Vectors-SSE2-direct.hh" +# else +# include "Vectors-SSE2.hh" +# endif +# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power) +# if defined(KRANC_DIRECT) +# include "Vectors-VSX-direct.hh" +# else +# include "Vectors-VSX.hh" +# endif +# else +# include "Vectors-pseudo.hh" +# endif +#else +// Don't vectorise +# include "Vectors-default.hh" #endif - - -#undef DEFINE_FUNCTION_PR_V -#undef DEFINE_FUNCTION_PRV -#undef DEFINE_FUNCTION_V_V -#undef DEFINE_FUNCTION_R_V -#undef DEFINE_FUNCTION_VV_V -#undef DEFINE_FUNCTION_VR_V -#undef DEFINE_FUNCTION_RV_V -#undef DEFINE_FUNCTION_RR_V - - - -// Number of vector elements in a CCTK_REAL_VEC -static -size_t const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); +#include "Vectors-undefine.hh" |