diff options
author | Ian Hinder <ian.hinder@aei.mpg.de> | 2010-11-22 11:20:48 +0100 |
---|---|---|
committer | Ian Hinder <ian.hinder@aei.mpg.de> | 2010-11-23 00:19:37 +0100 |
commit | d6c4d4c2131107ef3a4004692823e2041394acd6 (patch) | |
tree | bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19 | |
parent | c9431558d69ee1b8769c918044a45bdd50520b46 (diff) |
Implement vectorisation
This is Erik's vectorisation working tree from 13-Oct-2010
16 files changed, 1767 insertions, 406 deletions
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h index 29c89d1..ee6a3b7 100644 --- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h @@ -1,17 +1,17 @@ -#define Power(x, y) (pow((x), (y))) +#define Power(x, y) (pow(x,y)) #define Sqrt(x) (sqrt(x)) #ifdef KRANC_C -#define Abs(x) (fabs(x)) -#define Min(x, y) (fmin((x), (y))) -#define Max(x, y) (fmax((x), (y))) -#define IfThen(x,y,z) ((x) ? (y) : (z)) +# define Abs(x) (fabs(x)) +# define Min(x, y) (fmin(x,y)) +# define Max(x, y) (fmax(x,y)) +# define IfThen(x,y,z) ((x) ? (y) : (z)) #else -#define Abs(x) (abs(x)) -#define Min(x, y) (min((x), (y))) -#define Max(x, y) (max((x), (y))) +# define Abs(x) (abs(x)) +# define Min(x, y) (min(x,y)) +# define Max(x, y) (max(x,y)) /* IfThen cannot be expressed in Fortran */ #endif @@ -31,17 +31,46 @@ #define Tanh(x) (tanh(x)) #ifdef KRANC_C -#define Sign(x) (signbit(x)?-1:+1) +# define Sign(x) ((x)<0?-1:+1) +# define ToReal(x) ((CCTK_REAL)(x)) #else -#define Sign(x) (sgn(x)) +# define Sign(x) (sgn(x)) +# define ToReal(x) (real((x),kind(khalf))) #endif +/* TODO: use fma(x,y,z) to implement fmadd and friends? Note that fma + may be unsupported, or may be slow. */ + +/* #define fmadd(x,y,z) ((x)*(y)+(z)) */ +/* #define fmsub(x,y,z) ((x)*(y)-(z)) */ +/* #define fnmadd(x,y,z) (-(z)-(x)*(y)) */ +/* #define fnmsub(x,y,z) (+(z)-(x)*(y)) */ + +#define fneg(x) (-(x)) +#define fmul(x,y) ((x)*(y)) +#define fdiv(x,y) ((x)/(y)) +#define fadd(x,y) ((x)+(y)) +#define fsub(x,y) ((x)-(y)) + +#define fmadd(x,y,z) (fadd(fmul(x,y),z)) +#define fmsub(x,y,z) (fsub(fmul(x,y),z)) +#define fnmadd(x,y,z) (fsub(fneg(z),fmul(x,y))) +#define fnmsub(x,y,z) (fsub(z,fmul(x,y))) + +#define kexp(x) (exp(x)) +#define kfabs(x) (fabs(x)) +#define kfmax(x,y) (fmax(x,y)) +#define kfmin(x,y) (fmin(x,y)) +#define klog(x) (log(x)) +#define kpow(x,y) (pow(x,y)) +#define ksqrt(x) (sqrt(x)) + #ifdef KRANC_C -#define E M_E -#define Pi M_PI +# define E M_E +# define Pi M_PI #else -#define E 2.71828182845904523536029d0 -#define Pi 3.14159265358979323846264d0 +# define E 2.71828182845904523536029d0 +# define Pi 3.14159265358979323846264d0 #endif -#define UnitStep(x) ( (x) > 0 ? 1 : 0 ) +#define UnitStep(x) ((x)>0) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh new file mode 100644 index 0000000..12cd6e8 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh @@ -0,0 +1,135 @@ +// Vectorise using Intel's or AMD's SSE2 + +// Use the type __m128d directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +typedef __m128d CCTK_REAL_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +#define vec_set1(a) (_mm_set1_pd(a)) +#define vec_set(a,b) (_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +// # define vec_elt0(x) (*(CCTK_REAL const*)&(x)) +# define vec_elt0(x) ({ CCTK_REAL a_elt0; asm ("" : "=x" (a_elt0) : "0" (x)); a_elt0; }) +#else +// this is a no-op +# define vec_elt0(x) (_mm_cvtsd_f64(x)) +#endif +#define vec_elt1(x_) ({ CCTK_REAL_VEC const x_elt1=(x_); vec_elt0(_mm_unpackhi_pd(x_elt1,x_elt1)); }) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec_load(p) (_mm_load_pd(&(p))) +#define vec_loadu(p) (_mm_loadu_pd(&(p))) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: Always use unaligned load +#define vec_loadu_maybe(off,p) (vec_loadu(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) (vec_loadu(p)) +#if 0 +#define vec_loadu_maybe(off,p) \ + (!((off)&(CCTK_REAL_VEC_SIZE-1)) ? \ + vec_load(p) : vec_loadu(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (!((off1)&(CCTK_REAL_VEC_SIZE-1)) && \ + !((off2)&(CCTK_REAL_VEC_SIZE-1)) && \ + !((off3)&(CCTK_REAL_VEC_SIZE-1)) ? \ + vec_load(p) : vec_loadu(p)) +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec_store(p,x) (_mm_store_pd(&(p),x)) +#define vec_storeu(p,x) (_mm_storeu_pd(&(p),x)) +#if defined(KRANC_CACHE) +# define vec_store_nta(p,x) (_mm_stream_pd(&(p),x)) +#else +# define vec_store_nta(p,x) (_mm_store_pd(&(p),x)) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x)) +#define vec_store_nta_partial_hi(p,x,n) (_mm_storeh_pd((&(p))+1,x)) + + + +// Functions and operators + +// Operators +#undef fneg +#undef fmul +#undef fdiv +#undef fadd +#undef fsub +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + unsigned long long s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC v; +} vec_neg_mask_impl = {0x8000000000000000ULL, 0x8000000000000000ULL}; +# define vec_neg_mask (vec_neg_mask_impl.v) +#else +# define vec_neg_mask ((CCTK_REAL_VEC)(__m128i){0x8000000000000000ULL, 0x8000000000000000ULL}) +#endif +#define fneg(x) (_mm_xor_pd(x,vec_neg_mask)) +#define fmul(x,y) (_mm_mul_pd(x,y)) +#define fdiv(x,y) (_mm_div_pd(x,y)) +#define fadd(x,y) (_mm_add_pd(x,y)) +#define fsub(x,y) (_mm_sub_pd(x,y)) + +// Cheap functions +#undef kfabs +#undef kfmax +#undef kfmin +#undef ksqrt +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + unsigned long long s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC v; +} vec_fabs_mask_impl = {0x7fffffffffffffffULL, 0x7fffffffffffffffULL}; +# define vec_fabs_mask (vec_fabs_mask_impl.v) +#else +# define vec_fabs_mask ((CCTK_REAL_VEC)(__m128i){0x7fffffffffffffffULL, 0x7fffffffffffffffULL}) +#endif +#define kfabs(x) (_mm_and_pd(x,vec_fabs_mask)) +#define kfmax(x,y) (_mm_max_pd(x,y)) +#define kfmin(x,y) (_mm_min_pd(x,y)) +#define ksqrt(x) (_mm_sqrt_pd(x)) + +// Expensive functions +#undef kexp +#undef klog +#undef kpow +#define kexp(x_) ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); }) +#define klog(x_) ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); }) +#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); }) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) (vec_set1(x)) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh new file mode 100644 index 0000000..4a4eea6 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh @@ -0,0 +1,194 @@ +// Vectorise using Intel's or AMD's SSE2 + + + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef __m128d V; + V v; + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +// DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; _mm_store_sd(&a,x); a; })) +// DEFINE_FUNCTION_V_R(vec_elt0,(*(CCTK_REAL const*)&x)) +// This generates the fastest code with PGI compilers +DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; asm ("" : "=x" (a) : "0" (x)); a; })) +#else +DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) // this is a no-op +#endif +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_unpackhi_pd(x,x))) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +#define vec_loadu_maybe(off,p) \ + (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p))) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \ + (off2)&(CCTK_REAL_VEC_SIZE-1), \ + (off3)&(CCTK_REAL_VEC_SIZE-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_storeu,_mm_storeu_pd(&p,x)) +#if defined(KRANC_CACHE) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) +#else +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_store_pd(&p,x)) +#endif + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: _mm_storel_pd(&p,x); break; + default: assert(0); + } +} +static inline +void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: _mm_storeh_pd((&p)+1,x); break; + default: assert(0); + } +} + + + +// Functions and operators + +// Single-argument operators +#if 0 +DEFINE_FUNCTION_V_V(operator+,x) +static CCTK_REAL_VEC const vec_neg_mask = + (CCTK_REAL_VEC::V)(__m128i) { 0x8000000000000000ULL, 0x8000000000000000ULL }; +DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask)) +#endif +DEFINE_FUNCTION_V_V(operator+,+x.v) +DEFINE_FUNCTION_V_V(operator-,-x.v) + +// Double-argument operators, both vectors +#if 0 +DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) +#endif +DEFINE_FUNCTION_VV_V(operator+,x.v+y.v) +DEFINE_FUNCTION_VV_V(operator-,x.v-y.v) +DEFINE_FUNCTION_VV_V(operator*,x.v*y.v) +DEFINE_FUNCTION_VV_V(operator/,x.v/y.v) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Cheap functions +#if defined(__PGI) && defined (__amd64__) +// The PGI compiler does not understand __m128d literals +static union { + CCTK_REAL_VEC::S s[CCTK_REAL_VEC_SIZE]; + CCTK_REAL_VEC::V v; +} vec_fabs_mask_impl = { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }; +# define vec_fabs_mask (vec_fabs_mask_impl.v) +#else +static CCTK_REAL_VEC const vec_fabs_mask = + (CCTK_REAL_VEC::V)(__m128i) { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }; +#endif +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) vec_set1(x) + +#if defined(__PGI) && defined (__amd64__) +// Special case for PGI 9.0.4 to avoid an internal compiler error +#undef IfThen +static inline +CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const x, CCTK_REAL_VEC const y) +{ + union { + __m128i vi; + CCTK_REAL_VEC::V v; + } mask; + mask.vi = _mm_set1_epi64x(-(long long)cond); + return _mm_or_pd(_mm_and_pd(x.v, mask.v), _mm_andnot_pd(mask.v, y.v)); +} +#endif diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh new file mode 100644 index 0000000..7e06017 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh @@ -0,0 +1,111 @@ +// Vectorise using IBM's Altivec + +// Use the type vector double directly, without introducing a wrapper class +// Use macros instead of inline functions + + + +#include <altivec.h> + +// Vector type corresponding to CCTK_REAL +typedef vector double CCTK_REAL_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements + +#define vec_set1(a) (vec_splats(a)) +#if defined(__GNUC__) +// GNU doesn't support array indices on vectors +union vec_mask { + double elts[2]; + vector double v; +}; +# define vec_set(a,b) ({ vec_mask x_set; x_set.elts[0]=(a); x_set.elts[1]=(b); x_set.v; }) +#else +# define vec_set(a,b) ({ CCTK_REAL_VEC x_set; x_set[0]=(a); x_set[1]=(b); x_set; }) +#endif + +// Get a scalar from the vector +#if defined(__GNUC__) +// GNU doesn't support array indices on vectors +# define vec_elt0(x) ({ vec_mask x_elt0; x_elt0.v=(x); x_elt0.elts[0]; }) +# define vec_elt1(x) ({ vec_mask x_elt1; x_elt1.v=(x); x_elt1.elts[1]; }) +#else +# define vec_elt0(x) ((x)[0]) +# define vec_elt1(x) ((x)[1]) +#endif + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec_load(p) (*(CCTK_REAL_VEC const*)&(p)) +#define vec_loadu(p) (vec_load(p)) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#define vec_loadu_maybe(off,p) (vec_load(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec_store(p,x) (*(CCTK_REAL_VEC*)&(p)=(x)) +#define vec_storeu(p,x) (*(CCTK_REAL_VEC*)&(p)=(x)) +// TODO: Use stvxl instruction? +#define vec_store_nta(p,x) vec_store(p,x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec_store_nta_partial_lo(p,x,n) ((p)=vec_elt0(x)) +#define vec_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec_elt1(x)) + + + +// Functions and operators + +// Other Altivec functions are: +// nabs: -abs a +// madd msub nmadd nmsub: [+-]a*b[+-]c + +// Triple-argument operators, all vectors +#undef fmadd +#undef fmsub +#undef fnmadd +#undef fnmsub +#define fmadd(x,y,z) (vec_madd(x,y,z)) +#define fmsub(x,y,z) (vec_msub(x,y,z)) +#define fnmadd(x,y,z) (vec_nmadd(x,y,z)) +#define fnmsub(x,y,z) (vec_nmsub(x,y,z)) + +// Cheap functions +#undef kfabs +#undef kfmax +#undef kfmin +#define kfabs(x) (vec_abs(x)) +#define kfmax(x,y) (vec_max(x,y)) +#define kfmin(x,y) (vec_min(x,y)) + +// Expensive functions +#undef kexp +#undef klog +#undef kpow +#undef ksqrt +#define kexp(x_) ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); }) +#define klog(x_) ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); }) +#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); }) +#define ksqrt(x_) ({ CCTK_REAL_VEC const x_sqrt=(x_); vec_set(sqrt(vec_elt0(x_sqrt)),sqrt(vec_elt1(x_sqrt))); }) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) (vec_set1((CCTK_REAL)(x))) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh new file mode 100644 index 0000000..f591647 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh @@ -0,0 +1,212 @@ +// Vectorise using IBM's Altivec + + + +#include <altivec.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef vector double V; + V v; + union vec_mask { + S elts[2]; + V v; + }; + + // Set a vector from scalars +#if 0 + // IBM + inline CCTK_REAL_VEC(S const a, S const b) { v[0]=a; v[1]=b; } +#endif +#if 0 + inline CCTK_REAL_VEC(S const a, S const b): + v(vec_mergel(vec_splats(a), vec_splats(b))) { } +#endif + inline CCTK_REAL_VEC(S const a, S const b) + { + vec_mask x; + x.elts[0] = a; + x.elts[1] = b; + v = x.v; + } + + // Set a vector from a scalar, replicating the scalar + // Note: Could also use vec_xlds instead + inline CCTK_REAL_VEC(S const a): v(vec_splats(a)) { } + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +// Create vectors, extract vector elements +DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a)) +DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b)) + +// Get a scalar from the vector +#if 0 +// IBM +DEFINE_FUNCTION_V_R(vec_elt0,x.v[0]) +DEFINE_FUNCTION_V_R(vec_elt1,x.v[1]) +#endif +static inline CCTK_REAL vec_elt0(CCTK_REAL_VEC const x) +{ + CCTK_REAL_VEC::vec_mask x1; + x1.v = x; + return x1.elts[0]; +} +static inline CCTK_REAL vec_elt1(CCTK_REAL_VEC const x) +{ + CCTK_REAL_VEC::vec_mask x1; + x1.v = x; + return x1.elts[1]; +} + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,p) +#if 0 +// IBM +DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p))) +#endif +DEFINE_FUNCTION_PR_V(vec_loadu,p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +#define vec_loadu_maybe(off,p) \ + (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p))) +#define vec_loadu_maybe3(off1,off2,off3,p) \ + (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \ + (off2)&(CCTK_REAL_VEC_SIZE-1), \ + (off3)&(CCTK_REAL_VEC_SIZE-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x) +DEFINE_FUNCTION_PRV(vec_storeu,*(CCTK_REAL_VEC::V*)&p=x) +// TODO: Use stvxl instruction? +DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: p=vec_elt0(x); break; + default: assert(0); + } +} +static inline +void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n) +{ + switch (n) { + case 1: (&p)[1]=vec_elt1(x); break; + default: assert(0); + } +} + + + +// Functions and operators + +// Other Altivec functions are: +// nabs: -abs a +// madd msub nmadd nmsub: [+-]a*b[+-]c + +// Single-argument operators +#if 0 +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,vec_neg(x)) +#endif +DEFINE_FUNCTION_V_V(operator+,+x.v) +DEFINE_FUNCTION_V_V(operator-,-x.v) + +// Double-argument operators, both vectors +#if 0 +DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y)) +DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y)) +DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y)) +DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y)) +#endif +DEFINE_FUNCTION_VV_V(operator+,x.v+y.v) +DEFINE_FUNCTION_VV_V(operator-,x.v-y.v) +DEFINE_FUNCTION_VV_V(operator*,x.v*y.v) +DEFINE_FUNCTION_VV_V(operator/,x.v/y.v) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Triple-argument operators, all vectors +#undef fmadd +#undef fmsub +#undef fnmadd +#undef fnmsub +DEFINE_FUNCTION_VVV_V(fmadd,vec_madd(x.v,y.v,z.v)) +DEFINE_FUNCTION_VVV_V(fmsub,vec_msub(x.v,y.v,z.v)) +DEFINE_FUNCTION_VVV_V(fnmadd,vec_nmadd(x.v,y.v,z.v)) +DEFINE_FUNCTION_VVV_V(fnmsub,vec_nmsub(x.v,y.v,z.v)) + +// Cheap functions +DEFINE_FUNCTION_V_V(fabs,vec_abs(x.v)) +DEFINE_FUNCTION_VV_V(fmax,vec_max(x.v,y.v)) +DEFINE_FUNCTION_VV_V(fmin,vec_min(x.v,y.v)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) +DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x)))) + + + +#undef Sign +#define Sign(x) (42) + +#undef ToReal +#define ToReal(x) (vec_set1(x)) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh new file mode 100644 index 0000000..f928ed8 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh @@ -0,0 +1,31 @@ +// Fallback vectorisation implementation: Do not vectorise + + + +// Use CCTK_REAL +typedef CCTK_REAL CCTK_REAL_VEC; + +// Number of vector elements in a CCTK_REAL_VEC +static int const CCTK_REAL_VEC_SIZE = 1; + + + +// We use macros here, so that we are not surprised by compilers which +// don't like to inline functions (e.g. PGI). This should also make +// debug builds (which may not inline) more efficient. + +#define vec_load(p) (p) +#define vec_loadu(p) (p) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#define vec_loadu_maybe(off,p) (p) +#define vec_loadu_maybe3(off1,off2,off3,p) (p) + +#define vec_store(p,x) ((p)=(x)) +#define vec_store_nta(p,x) ((p)=(x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec_store_nta_partial_lo(p,x,n) (assert(0)) +#define vec_store_nta_partial_hi(p,x,n) (assert(0)) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh new file mode 100644 index 0000000..f5c0b22 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh @@ -0,0 +1,104 @@ +// Define some macros that simplify defining short function that are +// supposed to be inlined + + + +// Letters defining the prototype (argument and return value types): +// I: i,j: integer +// R: a,b: real +// V: x,y: vector (of real) +// P: p,q: pointer (i.e. const reference) to something +// L: l,m: L-value (i.e. non-const reference) to something + + + +// Load and store + +#define DEFINE_FUNCTION_PR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& p) \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_PRV(name,expr) \ +static inline \ +void name (CCTK_REAL& p, CCTK_REAL_VEC const x) \ +{ \ + expr; \ +} + +#define DEFINE_FUNCTION_PVR(name,expr) \ +static inline \ +void name (CCTK_REAL_VEC& p, CCTK_REAL const a) \ +{ \ + expr; \ +} + + + +// Functions and operators + +#define DEFINE_FUNCTION_V_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_V_R(name,expr) \ +static inline \ +CCTK_REAL name (CCTK_REAL_VEC const x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_R_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL const a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL_VEC const x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL const b) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VVV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y, CCTK_REAL_VEC const z) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh new file mode 100644 index 0000000..df83b3a --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh @@ -0,0 +1,591 @@ +#ifndef VECTORS_HH +#define VECTORS_HH + + + +// Vectorisation + +#include <assert.h> +#include <math.h> +#include <stdlib.h> + +#include <cctk.h> + + + +// I: i,j: integer +// R: a,b: real +// V: x,y: vector (of real) +// P: p,q: pointer (i.e. const reference) to something +// L: l,m: L-value (i.e. non-const reference) to something + +#define DEFINE_FUNCTION_PR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& p) \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_PRV(name,expr) \ +static inline \ +void name (CCTK_REAL& p, CCTK_REAL_VEC const& x) \ +{ \ + expr; \ +} + +#define DEFINE_FUNCTION_PVR(name,expr) \ +static inline \ +void name (CCTK_REAL_VEC& p, CCTK_REAL const& a) \ +{ \ + expr; \ +} + +#define DEFINE_FUNCTION_V_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const& x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_V_R(name,expr) \ +static inline \ +CCTK_REAL name (CCTK_REAL_VEC const& x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_R_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_VR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RV_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + +#define DEFINE_FUNCTION_RR_V(name,expr) \ +static inline \ +CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b) \ + CCTK_ATTRIBUTE_PURE \ +{ \ + return expr; \ +} + + + +// Intel, double +#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8) + +#include <emmintrin.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef __m128d V; + V v; + + // Set a vector from scalars + inline CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(b,a)) { } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { } + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const& v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +union vec_mask { + unsigned long long bits[2]; + CCTK_REAL_VEC::V v; +}; + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +#if defined(__PGI) && defined (__amd64__) +// _mm_cvtsd_f64 does not exist on PGI compilers +static inline +CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x) +{ + CCTK_REAL a; _mm_store_sd(&a,x); return a; +} +#else +DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) //this is a no-op +#endif + +#if 0 +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1)))) +#endif +static inline +CCTK_REAL vec_elt1 (CCTK_REAL_VEC const& x) +{ + CCTK_REAL a; _mm_storeh_pd(&a,x); return a; +} + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +#if 0 +// Load a partial vector (duplicating the last loaded element to fill +// the remaining elements) +// TODO: Should this be aligned or unaligned? +static inline +CCTK_REAL_VEC vec_load_partial (CCTK_REAL const& p, int const n) +{ + switch (n) { + case 1: return _mm_load1_pd(p); + default: assert(0); + } +} +#endif + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +template<int mod1,int mod2,int mod3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +template<> +inline +CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p) +{ + return vec_load(p); +} +// Call the implementation with the modulus +template<int off> +static inline +CCTK_REAL_VEC vec_loadu_maybe (CCTK_REAL const& p) +{ + return vec_loadu_maybe_impl<off&(2-1)>(p); +} +template<int off1,int off2, int off3> +static inline +CCTK_REAL_VEC vec_loadu_maybe3 (CCTK_REAL const& p) +{ + return vec_loadu_maybe_impl3<off1&(2-1),off2&(2-1),off3&(2-1)>(p); +} + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: _mm_storel_pd(&p,x); break; + default: assert(0); + } +} +static inline +void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: _mm_storeh_pd((&p)+1,x); break; + default: assert(0); + } +} + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) +DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +#if 0 +DEFINE_FUNCTION_V_V(operator-,vec_set(0.0,0.0)-x) +#endif +static vec_mask const vec_neg_mask = +{ { 0x8000000000000000ULL, 0x8000000000000000ULL } }; +DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask.v)) + +// Cheap functions +static vec_mask const vec_fabs_mask = +{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } }; +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + +// Special case for PGI to avoid internal compiler error +#if defined(__PGI) && defined (__amd64__) +#undef IfThen +CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const& x, CCTK_REAL_VEC co\ +nst& y) +{ + return cond*x + (not cond)*y; +} +#endif + + + +#if 0 +// Try to use the __m128d type directly. + +// This does not really work, because it is not possible to define +// automatic conversion operators from double to __m128d, so that +// explicit conversions are required. This makes the code look more +// clumsy. + +// Vector type corresponding to CCTK_REAL +typedef __m128d CCTK_REAL_VEC; + +DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a)) +DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a)) + +// Get a scalar from the vector +static inline +CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x) +{ +#if 0 + // _mm_cvtsd_f64 does not exist on PGI compilers + return _mm_cvtsd_f64(x); // this is a no-op +#endif + CCTK_REAL a; _mm_store_sd(&a,x); return a; +} + +DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1)))) + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)) +DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) +DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) + +// Cheap functions +static vec_mask const vec_fabs_mask = +{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } }; +DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v)) +DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) +DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) +DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) + +#endif + + + +// Intel, float +#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4) + +#include <xmmintrin.h> + +// A vector type corresponding to CCTK_REAL +typedef __m128 CCTK_REAL_VEC; + + + +// Power, double +#elif defined(KRANC_VECTORS) && defined(__ALTIVEC__) && defined(_ARCH_PWR7) && defined(CCTK_REAL_PRECISION_8) + +#include <altivec.h> + +// Vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + typedef double S; + typedef vector double V; + V v; + + // vec_insert, vec_extract, vec_splat + + // Set a vector from scalars + inline CCTK_REAL_VEC(S const& a, S const& b) { v[0]=a; v[1]=b; } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(S const& a): v(vec_splats(a)) { } + + // Convert from and to the underlying vector type + inline CCTK_REAL_VEC(V const& v_): v(v_) { } + inline operator V const() const { return v; } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { } +}; + +DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a)) +DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b)) + +// Get a scalar from the vector +DEFINE_FUNCTION_V_R(vec_elt0,x.v[0]) +DEFINE_FUNCTION_V_R(vec_elt1,x.v[1]) + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +DEFINE_FUNCTION_PR_V(vec_load,p) +DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p))) +// vec_xlds + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int mod> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p)) +// Implementation: load aligned if the modulus is zero +#define static +template<> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl<0>,vec_load(p)) +#undef static +// Call the implementation with the modulus +template<int off> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe,vec_loadu_maybe_impl<off&(2-1)>(p)) + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: p=x.v[0]; break; + default: assert(0); + } +} +static inline +void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + switch (n) { + case 1: (&p)[1]=x.v[1]; break; + default: assert(0); + } +} + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y)) +DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y)) +DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y)) +DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y)) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a)) +DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x) +DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x) +DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x) +DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,vec_neg(x)) + +// Cheap functions +DEFINE_FUNCTION_V_V(fabs,vec_abs(x)) +DEFINE_FUNCTION_VV_V(fmax,vec_max(x,y)) +DEFINE_FUNCTION_VV_V(fmin,vec_min(x,y)) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x)))) +DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x)))) +DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a))) +DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x)))) + + + +// Fallback: pseudo-vectorisation +#elif 0 + +// There is no vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + CCTK_REAL v, w; + + // Set a vector from scalars + inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x.v), w(x.w) { } +}; + + + +DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p) +DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p)) +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +template<int off> +DEFINE_FUNCTION_PR_V(vec_loadm,vec_load(p)) + +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x)) + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v+y.v,x.w+y.w)) +DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v-y.v,x.w-y.w)) +DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v*y.v,x.w*y.w)) +DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v/y.v,x.w/y.w)) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v+a,x.w+a)) +DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v-a,x.w-a)) +DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v*a,x.w*a)) +DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v/a,x.w/a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v,a+x.w)) +DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v,a-x.w)) +DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v,a*x.w)) +DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v,a/x.w)) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v,-x.w)) + +// Cheap functions +DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v),fabs(x.w))) +DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v,y.v),fmax(x.w,y.w))) +DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v,y.v),fmin(x.w,y.w))) +DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v),sqrt(x.w))) + +// Expensive functions +DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v),exp(x.w))) +DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v),log(x.w))) +DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v,a),pow(x.w,a))) + + + +// Fallback: no vectorisation +#else + +// There is no vector type corresponding to CCTK_REAL +typedef CCTK_REAL CCTK_REAL_VEC; + + + +DEFINE_FUNCTION_PR_V(vec_load,p) +DEFINE_FUNCTION_PR_V(vec_loadu,p) +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +// Implementation: default to unaligned load +template<int off> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe,p) +template<int off1,int off2, int off3> +DEFINE_FUNCTION_PR_V(vec_loadu_maybe3,p) + +DEFINE_FUNCTION_PRV(vec_store,p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,p=x) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +static inline +void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + assert(0); +} +static inline +void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n) +{ + assert(0); +} + + + +#endif + + + +#undef DEFINE_FUNCTION_PR_V +#undef DEFINE_FUNCTION_PRV +#undef DEFINE_FUNCTION_V_V +#undef DEFINE_FUNCTION_R_V +#undef DEFINE_FUNCTION_VV_V +#undef DEFINE_FUNCTION_VR_V +#undef DEFINE_FUNCTION_RV_V +#undef DEFINE_FUNCTION_RR_V + + + +// Number of vector elements in a CCTK_REAL_VEC +static +int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); + + + +#endif // #ifndef VECTORS_HH diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh new file mode 100644 index 0000000..f439c9b --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh @@ -0,0 +1,72 @@ +// Pseudo vectorisation using scalar operations + + + +// Number of vector elements in a CCTK_REAL_VEC +static int const CCTK_REAL_VEC_SIZE = 2; + +// There is no vector type corresponding to CCTK_REAL +struct CCTK_REAL_VEC { + // The underlying scalar and vector types + CCTK_REAL v[CCTK_REAL_VEC_SIZE]; + + // Set a vector from scalars + inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { } + + // Set a vector from a scalar, replicating the scalar + inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { } + + inline CCTK_REAL_VEC() { } + + // Copy constructor + inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x) { v[0]=x.v[0]; v[1]=x.v[1]; } +}; + + + +// Load and store vectors + +DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p) +DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p)) +// Load a vector from memory that may or may not be aligned, as +// decided by the offset off and the vector size +#define vec_loadu_maybe(off,p) (vec_load(p)) +#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p)) + +DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x) +DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x)) + + + +// Functions and operators + +// Double-argument operators, both vectors +DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v[0]+y.v[0],x.v[1]+y.v[1])) +DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v[0]-y.v[0],x.v[1]-y.v[1])) +DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v[0]*y.v[0],x.v[1]*y.v[1])) +DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v[0]/y.v[0],x.v[1]/y.v[1])) + +// Double-argument operators, vector and scalar +DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v[0]+a,x.v[1]+a)) +DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v[0]-a,x.v[1]-a)) +DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v[0]*a,x.v[1]*a)) +DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v[0]/a,x.v[1]/a)) + +// Double-argument operators, scalar and vector +DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v[0],a+x.v[1])) +DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v[0],a-x.v[1])) +DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v[0],a*x.v[1])) +DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v[0],a/x.v[1])) + +// Single-argument operators +DEFINE_FUNCTION_V_V(operator+,x) +DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v[0],-x.v[1])) + +// Functions +DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v[0]),exp(x.v[1]))) +DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v[0]),fabs(x.v[1]))) +DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v[0],y.v[0]),fmax(x.v[1],y.v[1]))) +DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v[0],y.v[0]),fmin(x.v[1],y.v[1]))) +DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v[0]),log(x.v[1]))) +DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v[0],a),pow(x.v[1],a))) +DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v[0]),sqrt(x.v[1]))) diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh new file mode 100644 index 0000000..0d950c7 --- /dev/null +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh @@ -0,0 +1,14 @@ +// Undefine all macros defined in "Vectors-define.hh", so that we +// leave a clean namespace + + + +#undef DEFINE_FUNCTION_PR_V +#undef DEFINE_FUNCTION_PRV +#undef DEFINE_FUNCTION_V_V +#undef DEFINE_FUNCTION_R_V +#undef DEFINE_FUNCTION_VV_V +#undef DEFINE_FUNCTION_VR_V +#undef DEFINE_FUNCTION_RV_V +#undef DEFINE_FUNCTION_RR_V +#undef DEFINE_FUNCTION_VVV_V diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh index 3fb77e1..d32afb2 100644 --- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh +++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh @@ -5,231 +5,47 @@ // Vectorisation +#include <assert.h> #include <math.h> #include <stdlib.h> -#include <cctk.h> - - - -// I: i,j: integer -// R: a,b: real -// V: x,y: vector (of real) -// P: p,q: pointer (i.e. const reference) to something -// L: l,m: L-value (i.e. non-const reference) to something - -#define DEFINE_FUNCTION_PR_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& p) \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_PRV(name,expr) \ -inline \ -void name (CCTK_REAL& p, CCTK_REAL_VEC const& x) \ -{ \ - expr; \ -} - -#define DEFINE_FUNCTION_PVR(name,expr) \ -inline \ -void name (CCTK_REAL_VEC& p, CCTK_REAL const& a) \ -{ \ - expr; \ -} - -#define DEFINE_FUNCTION_V_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL_VEC const& x) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return CCTK_REAL_VEC(expr); \ -} - -#define DEFINE_FUNCTION_V_R(name,expr) \ -inline \ -CCTK_REAL name (CCTK_REAL_VEC const& x) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_R_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& a) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_VV_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_VR_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_RV_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - -#define DEFINE_FUNCTION_RR_V(name,expr) \ -inline \ -CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b) \ - CCTK_ATTRIBUTE_PURE \ -{ \ - return expr; \ -} - - - -// Intel, double -#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8) - -#include <emmintrin.h> - -// Vector type corresponding to CCTK_REAL -struct CCTK_REAL_VEC { - // The underlying scalar and vector types - typedef double S; - typedef __m128d V; - static int const n = sizeof(V)/sizeof(S); - V v; - - // Set a vector from scalars - CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(a,b)) { }; - - // Get a scalar from the vector - S elt0() const { return _mm_cvtsd_f64(v); /* this is a no-op */ } - S elt1() const { return _mm_cvtsd_f64(_mm_shuffle_pd(v,v,_MM_SHUFFLE2(1,1))); } - - // Set a vector from a scalar, replicating the scalar - CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { }; - - // Convert from and to the underlying vector type - CCTK_REAL_VEC(V const& v_): v(v_) { }; - operator V const() const { return v; } - - CCTK_REAL_VEC() { }; - - // Copy constructor - CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }; -}; - -// Load a vector from memory (aligned and unaligned); this loads from -// a reference to a scalar -DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p)); -DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p)); - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x)) -DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x)) - -// Double-argument operators, both vectors -DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y)) -DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y)) -DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y)) -DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y)) - -// Double-argument operators, vector and scalar -DEFINE_FUNCTION_VR_V(operator+,x+CCTK_REAL_VEC(a)) -DEFINE_FUNCTION_VR_V(operator-,x-CCTK_REAL_VEC(a)) -DEFINE_FUNCTION_VR_V(operator*,x*CCTK_REAL_VEC(a)) -DEFINE_FUNCTION_VR_V(operator/,x/CCTK_REAL_VEC(a)) - -// Double-argument operators, scalar and vector -DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a)+x) -DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a)-x) -DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a)*x) -DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a)/x) - -// Single-argument operators -DEFINE_FUNCTION_V_V(operator+,x) -DEFINE_FUNCTION_V_V(operator-,0.0-x) - -// Cheap functions -static union { - unsigned long long const bits[2]; - CCTK_REAL_VEC::V v; -} const fabs_mask = - { { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } }; -DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,fabs_mask.v)) -DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y)) -DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y)) -DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x)) - -// Expensive functions -DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.elt0()),exp(x.elt1()))) -DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.elt0()),log(x.elt1()))) -DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.elt0(),a),pow(x.elt1(),a))) - -// Un-implemented functions -DEFINE_FUNCTION_V_R(signbit,0) - - - -#if 0 -// Intel, float -#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4) - -#include <xmmintrin.h> - -// A vector type corresponding to CCTK_REAL -typedef __m128 CCTK_REAL_VEC; -#endif - +#include <cctk.h> -// Fallback: no vectorisation -#else - -// There is no vector type corresponding to CCTK_REAL -typedef CCTK_REAL CCTK_REAL_VEC; +#include "Vectors-define.hh" -DEFINE_FUNCTION_PR_V(vec_load,p) -DEFINE_FUNCTION_PR_V(vec_loadu,p) +#if defined(KRANC_VECTORS) +// Vectorise -DEFINE_FUNCTION_PRV(vec_store,p=x) -DEFINE_FUNCTION_PRV(vec_store_nta,p=x) +# if ! defined(CCTK_REAL_PRECISION_8) +# error "Vectorisation is currently only supported for double precision" +# endif -DEFINE_FUNCTION_V_R(signbit,x<0) +# if defined(__SSE2__) // SSE2 (Intel) +# if defined(KRANC_DIRECT) +# include "Vectors-SSE2-direct.hh" +# else +# include "Vectors-SSE2.hh" +# endif +# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power) +# if defined(KRANC_DIRECT) +# include "Vectors-VSX-direct.hh" +# else +# include "Vectors-VSX.hh" +# endif +# else +# include "Vectors-pseudo.hh" +# endif +#else +// Don't vectorise +# include "Vectors-default.hh" #endif - - -#undef DEFINE_FUNCTION_PR_V -#undef DEFINE_FUNCTION_PRV -#undef DEFINE_FUNCTION_V_V -#undef DEFINE_FUNCTION_R_V -#undef DEFINE_FUNCTION_VV_V -#undef DEFINE_FUNCTION_VR_V -#undef DEFINE_FUNCTION_RV_V -#undef DEFINE_FUNCTION_RR_V - - - -// Number of vector elements in a CCTK_REAL_VEC -static -size_t const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL); +#include "Vectors-undefine.hh" diff --git a/Tools/CodeGen/CalculationFunction.m b/Tools/CodeGen/CalculationFunction.m index 3d86503..df79553 100644 --- a/Tools/CodeGen/CalculationFunction.m +++ b/Tools/CodeGen/CalculationFunction.m @@ -178,7 +178,7 @@ localName[x_] := definePreDefinitions[pDefs_] := CommentedBlock["Initialize predefined quantities", - Map[DeclareAssignVariable["CCTK_REAL", #[[1]], #[[2]]] &, pDefs]]; + Map[DeclareAssignVariable["CCTK_REAL_VEC", #[[1]], #[[2]]] &, pDefs]]; (* -------------------------------------------------------------------------- Equations @@ -326,7 +326,7 @@ Options[CreateCalculationFunction] = ThornOptions; CreateCalculationFunction[calc_, debug_, useCSE_, opts:OptionsPattern[]] := Module[{gfs, allSymbols, knownSymbols, - shorts, eqs, parameters, + shorts, eqs, parameters, parameterRules, functionName, dsUsed, groups, pddefs, cleancalc, eqLoop, where, addToStencilWidth, pDefs, haveCondTextuals, condTextuals}, @@ -366,10 +366,14 @@ CreateCalculationFunction[calc_, debug_, useCSE_, opts:OptionsPattern[]] := If[!lookupDefault[cleancalc, NoSimplify, False], InfoMessage[InfoFull, "Simplifying equations", eqs]; - eqs = Simplify[eqs, {r>0}]]]; + eqs = Simplify[eqs, {r>=0}]]]; InfoMessage[InfoFull, "Equations:"]; + (* Wrap parameters with ToReal *) + parameterRules = Map[(#->ToReal[#])&, parameters]; + eqs = eqs /. parameterRules; + Map[printEq, eqs]; (* Check all the function names *) @@ -537,7 +541,7 @@ equationLoop[eqs_, cleancalc_, gfs_, shorts_, incs_, groups_, pddefs_, GenericGridLoop[functionName, { - DeclareDerivatives[defsWithoutShorts, eqsOrdered], + (* DeclareDerivatives[defsWithoutShorts, eqsOrdered], *) CommentedBlock["Assign local copies of grid functions", Map[DeclareMaybeAssignVariableInLoop[ @@ -558,6 +562,22 @@ equationLoop[eqs_, cleancalc_, gfs_, shorts_, incs_, groups_, pddefs_, Map[InfoVariable[#[[1]]] &, (eqs2 /. localMap)], ""], + CommentedBlock["If necessary, store only partial vectors after the first iteration", + ConditionalOnParameterTextual["CCTK_REAL_VEC_SIZE > 1 && i<lc_imin", + { + DeclareAssignVariable["int", "elt_count", "lc_imin-i"], + Map[StoreHighPartialVariableInLoop[GridName[#], localName[#], "elt_count"] &, + gfsInLHS], + "continue;\n" + }]], + CommentedBlock["If necessary, store only partial vectors after the last iteration", + ConditionalOnParameterTextual["CCTK_REAL_VEC_SIZE > 1 && i+CCTK_REAL_VEC_SIZE > lc_imax", + { + DeclareAssignVariable["int", "elt_count", "lc_imax-i"], + Map[StoreLowPartialVariableInLoop[GridName[#], localName[#], "elt_count"] &, + gfsInLHS], + "break;\n" + }]], CommentedBlock["Copy local copies back to grid functions", Map[StoreVariableInLoop[GridName[#], localName[#]] &, gfsInLHS]], diff --git a/Tools/CodeGen/CodeGen.m b/Tools/CodeGen/CodeGen.m index de8dbd2..09e7fe1 100644 --- a/Tools/CodeGen/CodeGen.m +++ b/Tools/CodeGen/CodeGen.m @@ -62,14 +62,16 @@ AssignVariableInLoop::usage = "AssignVariableInLoop[dest_, src_] returns a block "that assigns 'src' to 'dest'."; StoreVariableInLoop::usage = "StoreVariableInLoop[dest_, src_] returns a block of code " <> "that assigns 'src' to 'dest'."; +StoreLowPartialVariableInLoop::usage = "StoreLowPartialVariableInLoop[dest_, src_, count_] returns a block of code " <> + "that assigns 'src' to 'dest'."; +StoreHighPartialVariableInLoop::usage = "StoreHighPartialVariableInLoop[dest_, src_, count_] returns a block of code " <> + "that assigns 'src' to 'dest'."; DeclareAssignVariableInLoop::usage = "DeclareAssignVariableInLoop[type_, dest_, src_] returns a block of code " <> "that assigns 'src' to 'dest'."; MaybeAssignVariableInLoop::usage = "MaybeAssignVariableInLoop[dest_, src_, cond_] returns a block of code " <> "that assigns 'src' to 'dest'."; DeclareMaybeAssignVariableInLoop::usage = "DeclareMaybeAssignVariableInLoop[type_, dest_, src_, cond_] returns a block of code " <> "that assigns 'src' to 'dest'."; -UNUSEDDeclareVariablesInLoopVectorised::usage = ""; -UNUSEDAssignVariablesInLoopVectorised::usage = ""; TestForNaN::usage = "TestForNaN[expr_] returns a block of code " <> "that tests 'expr' for nan."; CommentedBlock::usage = "CommentedBlock[comment, block] returns a block consisting " <> @@ -285,62 +287,24 @@ AssignVariableInLoop[dest_, src_] := StoreVariableInLoop[dest_, src_] := {"vec_store_nta(", dest, ",", src, ")", EOL[]}; +StoreLowPartialVariableInLoop[dest_, src_, count_] := + {"vec_store_nta_partial_lo(", dest, ",", src, ",", count, ")", EOL[]}; + +StoreHighPartialVariableInLoop[dest_, src_, count_] := + {"vec_store_nta_partial_hi(", dest, ",", src, ",", count, ")", EOL[]}; + DeclareAssignVariableInLoop[type_, dest_, src_] := {type, " const ", dest, " = vec_load(", src, ")", EOL[]}; MaybeAssignVariableInLoop[dest_, src_, cond_] := If [cond, - {dest, " = useMatter ? vec_load(", src, ") : 0.0", EOL[]}, + {dest, " = useMatter ? vec_load(", src, ") : ToReal(0.0)", EOL[]}, {dest, " = vec_load(", src, ")", EOL[]}]; DeclareMaybeAssignVariableInLoop[type_, dest_, src_, mmaCond_, codeCond_] := If [mmaCond, - {type, " ", dest, " = (", codeCond, ") ? vec_load(", src, ") : 0.0", EOL[]}, - {type, " ", dest, " = vec_load(", src, ")", EOL[]}]; - -(* TODO: move these into OpenMP loop *) -UNUSEDDeclareVariablesInLoopVectorised[dests_, temps_, srcs_] := - { - {"#undef LC_PRELOOP_STATEMENTS", "\n"}, - {"#define LC_PRELOOP_STATEMENTS", " \\\n"}, - {"int const GFD_imin = lc_imin + ((lc_imin + cctk_lsh[0] * (j + cctk_lsh[1] * k)) & (CCTK_REAL_VEC_SIZE-1))", "; \\\n"}, - {"int const GFD_imax = lc_imax + ((lc_imax + cctk_lsh[0] * (j + cctk_lsh[1] * k)) & (CCTK_REAL_VEC_SIZE-1)) - CCTK_REAL_VEC_SIZE", "; \\\n"}, - Map[Function[x, Module[{dest, temp, src}, - {dest, temp, src} = x; - {"CCTK_REAL_VEC ", temp, "; \\\n"}]], - Transpose[{dests, temps, srcs}]], - {"\n"} - }; - -UNUSEDAssignVariablesInLoopVectorised[dests_, temps_, srcs_] := - { - {"{\n"}, - {" if (i < GFD_imin || i >= GFD_imax) {\n"}, - Map[Function[x, Module[{dest, temp, src}, - {dest, temp, src} = x; - {" ", dest, "[index] = ", src, EOL[]}]], - Transpose[{dests, temps, srcs}]], - {" } else {\n"}, - {" size_t const index0 = index & (CCTK_REAL_VEC_SIZE-1)", EOL[]}, - Map[Function[x, Module[{dest, temp, src}, - {dest, temp, src} = x; - {" ((CCTK_REAL*)&", temp, ")[index0] = ", - src, EOL[]}]], - Transpose[{dests, temps, srcs}]], - {" if (index0 == CCTK_REAL_VEC_SIZE-1) {\n"}, - {" size_t const index1 = index - (CCTK_REAL_VEC_SIZE-1)", EOL[]}, - Map[Function[x, Module[{dest, temp, src}, - {dest, temp, src} = x; - {" _mm_stream_pd (&", dest, "[index1], ", - temp, ")", EOL[]}]], - Transpose[{dests, temps, srcs}]], - {" }\n"}, - {" }\n"}, - {"}\n"} - }; - -UNUSEDAssignVariableInLoopsVectorised[dest_, temp_, src_] := - {"GFD_save_and_store(", dest, ",", "index", ",", "&", temp, ",", src, ")", EOL[]}; + {type, " ", dest, " = (", codeCond, ") ? vec_load(", src, ") : ToReal(0.0)", EOL[]}, + {type, " ", dest, " = vec_load(", src, ")", EOL[]}]; TestForNaN[expr_] := {"if (isnan(", expr, ")) {\n", @@ -463,13 +427,13 @@ DeclareFDVariables[] := InitialiseFDSpacingVariablesC[] := { - DeclareAssignVariable["CCTK_REAL", "dx", "CCTK_DELTA_SPACE(0)"], - DeclareAssignVariable["CCTK_REAL", "dy", "CCTK_DELTA_SPACE(1)"], - DeclareAssignVariable["CCTK_REAL", "dz", "CCTK_DELTA_SPACE(2)"], (* DeclareAssignVariable["int", "di", "CCTK_GFINDEX3D(cctkGH,1,0,0) - CCTK_GFINDEX3D(cctkGH,0,0,0)"], *) DeclareAssignVariable["int", "di", "1"], DeclareAssignVariable["int", "dj", "CCTK_GFINDEX3D(cctkGH,0,1,0) - CCTK_GFINDEX3D(cctkGH,0,0,0)"], - DeclareAssignVariable["int", "dk", "CCTK_GFINDEX3D(cctkGH,0,0,1) - CCTK_GFINDEX3D(cctkGH,0,0,0)"] + DeclareAssignVariable["int", "dk", "CCTK_GFINDEX3D(cctkGH,0,0,1) - CCTK_GFINDEX3D(cctkGH,0,0,0)"], + DeclareAssignVariable["CCTK_REAL_VEC", "dx", "ToReal(CCTK_DELTA_SPACE(0))"], + DeclareAssignVariable["CCTK_REAL_VEC", "dy", "ToReal(CCTK_DELTA_SPACE(1))"], + DeclareAssignVariable["CCTK_REAL_VEC", "dz", "ToReal(CCTK_DELTA_SPACE(2))"] }; InitialiseFDSpacingVariablesFortran[] := @@ -486,17 +450,17 @@ InitialiseFDVariables[] := InitialiseFDSpacingVariablesFortran[], InitialiseFDSpacingVariablesC[]], - DeclareAssignVariable["CCTK_REAL", "dxi", "1.0 / dx"], - DeclareAssignVariable["CCTK_REAL", "dyi", "1.0 / dy"], - DeclareAssignVariable["CCTK_REAL", "dzi", "1.0 / dz"], - DeclareAssignVariable["CCTK_REAL", "khalf", "0.5"], - DeclareAssignVariable["CCTK_REAL", "kthird", "1/3.0"], - DeclareAssignVariable["CCTK_REAL", "ktwothird", "2.0/3.0"], - DeclareAssignVariable["CCTK_REAL", "kfourthird", "4.0/3.0"], - DeclareAssignVariable["CCTK_REAL", "keightthird", "8.0/3.0"], - DeclareAssignVariable["CCTK_REAL", "hdxi", "0.5 * dxi"], - DeclareAssignVariable["CCTK_REAL", "hdyi", "0.5 * dyi"], - DeclareAssignVariable["CCTK_REAL", "hdzi", "0.5 * dzi"]}]; + DeclareAssignVariable["CCTK_REAL_VEC", "dxi", "INV(dx)"], + DeclareAssignVariable["CCTK_REAL_VEC", "dyi", "INV(dy)"], + DeclareAssignVariable["CCTK_REAL_VEC", "dzi", "INV(dz)"], + DeclareAssignVariable["CCTK_REAL_VEC", "khalf", "ToReal(0.5)"], + DeclareAssignVariable["CCTK_REAL_VEC", "kthird", "ToReal(1.0/3.0)"], + DeclareAssignVariable["CCTK_REAL_VEC", "ktwothird", "ToReal(2.0/3.0)"], + DeclareAssignVariable["CCTK_REAL_VEC", "kfourthird", "ToReal(4.0/3.0)"], + DeclareAssignVariable["CCTK_REAL_VEC", "keightthird", "ToReal(8.0/3.0)"], + DeclareAssignVariable["CCTK_REAL_VEC", "hdxi", "fmul(ToReal(0.5), dxi)"], + DeclareAssignVariable["CCTK_REAL_VEC", "hdyi", "fmul(ToReal(0.5), dyi)"], + DeclareAssignVariable["CCTK_REAL_VEC", "hdzi", "fmul(ToReal(0.5), dzi)"]}]; GridName[x_] := If[SOURCELANGUAGE == "C", ToExpression[ToString[x] <> "[index]"], @@ -657,20 +621,21 @@ GenericGridLoopUsingLoopControl[functionName_, block_] := CommentedBlock["Loop over the grid points", { "#pragma omp parallel\n", - "LC_LOOP3 (", functionName, ",\n", - " i,j,k, min[0],min[1],min[2], max[0],max[1],max[2],\n", - " cctk_lsh[0],cctk_lsh[1],cctk_lsh[2])\n", + "LC_LOOP3VEC (", functionName, ",\n", + " i,j,k, min[0],min[1],min[2], max[0],max[1],max[2],\n", + " cctk_lsh[0],cctk_lsh[1],cctk_lsh[2],\n", + " CCTK_REAL_VEC_SIZE)\n", "{\n", indentBlock[ { - DeclareVariable["index", "// int"], - DeclareAssignVariable["int", "index", "CCTK_GFINDEX3D(cctkGH,i,j,k)"], + (* DeclareVariable["index", "// int"], *) + (* DeclareAssignVariable["int", "index", "CCTK_GFINDEX3D(cctkGH,i,j,k)"], *) + DeclareAssignVariable["int", "index", "di*i + dj*j + dk*k"], block } ], - "i += CCTK_REAL_VEC_SIZE-1;\n", "}\n", - "LC_ENDLOOP3 (", functionName, ");\n" + "LC_ENDLOOP3VEC (", functionName, ");\n" } ], "" @@ -797,45 +762,117 @@ insertFile[name_] := (* Take an expression x and replace occurrences of Powers with the C macros SQR, CUB, QAD *) -ReplacePowers[x_] := - Module[{rhs}, - rhs = x /. Power[xx_, -1] -> INV[xx]; +ReplacePowers[expr_] := + Module[{rhs, fmaRules, arithRules}, + rhs = expr /. Power[xx_, -1] -> INV[xx]; If[SOURCELANGUAGE == "C", Module[{}, - rhs = rhs //. Power[xx_, 2] -> SQR[xx]; - rhs = rhs //. Power[xx_, 3] -> CUB[xx]; - rhs = rhs //. Power[xx_, 4] -> QAD[xx]; + rhs = rhs /. Power[xx_, 2 ] -> SQR[xx]; + rhs = rhs /. Power[xx_, 3 ] -> CUB[xx]; + rhs = rhs /. Power[xx_, 4 ] -> QAD[xx]; + rhs = rhs /. Power[xx_, -2 ] -> INV[SQR[xx]]; + rhs = rhs /. Power[xx_, 1/2] -> sqrt[xx]; + rhs = rhs /. Power[xx_, -1/2] -> INV[sqrt[xx]]; + rhs = rhs /. Power[xx_, 0.5] -> sqrt[xx]; + rhs = rhs /. Power[xx_, -0.5] -> INV[sqrt[xx]]; - rhs = rhs //. xx_/2 -> khalf xx; - rhs = rhs //. (-1/2) -> -khalf; + (* + rhs = rhs /. 1/2 -> khalf + rhs = rhs /. -1/2 -> -khalf; - rhs = rhs //. xx_/3 -> kthird xx; - rhs = rhs //. (-1/3) -> -kthird; + rhs = rhs /. 1/3 -> kthird; + rhs = rhs /. -1/3 -> -kthird; - rhs = rhs //. 2/3 -> ktwothird; - rhs = rhs //. (-2/3) -> -ktwothird; + rhs = rhs /. 2/3 -> ktwothird; + rhs = rhs /. -2/3 -> -ktwothird; - rhs = rhs //. 4/3 -> kfourthird; - rhs = rhs //. (-4/3) -> -kfourthird; + rhs = rhs /. 4/3 -> kfourthird; + rhs = rhs /. -4/3 -> -kfourthird; - rhs = rhs //. 8/3 -> keightthird; - rhs = rhs //. (-8/3) -> -keightthird; + rhs = rhs /. 8/3 -> keightthird; + rhs = rhs /. -8/3 -> -keightthird; + *) - rhs = rhs //. xx_ y_ + xx_ z_ -> xx(y+z); + (* Avoid rational numbers *) + rhs = rhs /. Rational[xx_,yy_] :> N[xx/yy, 30]; - rhs = rhs //. Power[E, power_] -> exp[power]; - rhs = rhs //. Power[xx_, 0.5] -> sqrt[xx]; + rhs = rhs //. IfThen[cond1_,xx1_,yy1_] + IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[ xx1 + xx2], Simplify[ yy1 + yy2]]; + rhs = rhs //. ff1_ IfThen[cond1_,xx1_,yy1_] + IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[ff1 xx1 + xx2], Simplify[ff1 yy1 + yy2]]; + rhs = rhs //. IfThen[cond1_,xx1_,yy1_] + ff2_ IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[ xx1 + ff2 xx2], Simplify[ yy1 + ff2 yy2]]; + rhs = rhs //. ff1_ IfThen[cond1_,xx1_,yy1_] + ff2_ IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[ff1 xx1 + ff2 xx2], Simplify[ff1 yy1 + ff2 yy2]]; + + (* Is this still a good idea when FMA instructions are used? *) + rhs = rhs //. xx_ yy_ + xx_ zz_ -> xx (yy+zz); + rhs = rhs //. xx_ yy_ - xx_ zz_ -> xx (yy-zz); + + rhs = rhs /. Power[E, power_] -> exp[power]; (* there have been some problems doing the Max/Min replacement via the preprocessor for C, so we do it here *) - rhs = rhs //. Max[xx_, yy_] -> fmax[xx, yy]; - rhs = rhs //. Min[xx_, yy_] -> fmin[xx, yy]; - - rhs = rhs //. Power[xx_, power_] -> pow[xx, power]], - - rhs = rhs //. Power[xx_, power_] -> xx^power + rhs = rhs /. Max[xx_, yy_] -> fmax[xx, yy]; + rhs = rhs /. Min[xx_, yy_] -> fmin[xx, yy]; + + rhs = rhs /. Power[xx_, power_] -> pow[xx, power]; + + (* FMA (fused multiply-add) instructions *) + (* Note that -x is represented as Times[-1, x] *) + isNotMinusOneQ[n_] := ! (IntegerQ[n] && n == -1); + isNotTimesMinusOneQ[n_] := ! MatchQ[n,- _]; + fmaRules = { + + (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) + (zz_? isNotTimesMinusOneQ) :> fmadd [xx,yy,zz], + + (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) - (zz_? isNotTimesMinusOneQ) :> fmsub [xx,yy,zz], + - (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) + (zz_? isNotTimesMinusOneQ) :> fnmadd[xx,yy,zz], + - (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) - (zz_? isNotTimesMinusOneQ) :> fnmsub[xx,yy,zz], + + (xx_? isNotMinusOneQ) (yy_ + 1) -> fmadd [xx, yy, xx], + + (xx_? isNotMinusOneQ) (yy_ - 1) -> fmsub [xx, yy, xx], + - (xx_? isNotMinusOneQ) (yy_ + 1) -> fnmadd[xx, yy, xx], + - (xx_? isNotMinusOneQ) (yy_ - 1) -> fnmsub[xx, yy, xx], + fmadd[xx_, - yy_, zz_] -> fnmsub[xx,yy,zz], + fmsub[xx_, - yy_, zz_] -> fnmadd[xx,yy,zz] + }; + rhs = rhs //. fmaRules; + + (* Constants *) + rhs = rhs /. xx_Integer/; xx!=-1 :> ToReal[xx]; + rhs = rhs /. xx_Real -> ToReal[xx]; + rhs = rhs /. - ToReal[xx_] -> ToReal[- xx]; + rhs = rhs /. ToReal[xx_] + ToReal[yy_] -> ToReal[xx + yy]; + rhs = rhs /. ToReal[xx_] * ToReal[yy_] -> ToReal[xx * yy]; + rhs = rhs /. pow[xx_, ToReal[power_]] -> pow[xx, power]; + rhs = rhs /. IfThen[ToReal[xx_], yy_, zz_] -> IfThen[xx, yy, zz]; + + (* Replace all operators and functions *) + (* fadd, fsub, fmul, fdiv, fneg *) + isNotFneg[n_] := ! MatchQ[n,fneg[_]]; + arithRules = { + - xx_ -> fneg[xx], + xx_ * yy_ -> fmul[xx,yy], + xx_ / yy_ -> fdiv[xx,yy], + xx_ + yy_ -> fadd[xx,yy], + xx_ - yy_ -> fsub[xx,yy], + fmul[-1,xx_] -> fneg[xx], + fadd[xx_,fneg[yy_]] -> fsub[xx,yy], + fadd[fneg[xx_],(yy_? isNotFneg)] :> fsub[yy,xx], + Abs[xx_] -> kfabs[xx], + Log[xx_] -> klog[xx], + fabs[xx_] -> kfabs[xx], + fmax[xx_,yy_] -> kfmax[xx,yy], + fmin[xx_,yy_] -> kfmin[xx,yy], + sqrt[xx_] -> ksqrt[xx], + exp[xx_] -> kexp[xx], + log[xx_] -> klog[xx], + pow[xx_,yy_] -> kpow[xx,yy] + }; + rhs = rhs //. arithRules; + rhs = rhs /. IfThen[fmul[xx_, yy_], aa_, bb_] -> IfThen[xx*yy, aa, bb]; + rhs = rhs /. ToReal[fneg[xx_]] -> ToReal[-xx]; + rhs = rhs /. ToReal[fmul[xx_, yy_]] -> ToReal[xx*yy]; + rhs = rhs /. kpow[xx_, fneg[power_]] -> kpow[xx, -power]; + ], + + rhs = rhs /. Power[xx_, power_] -> xx^power ]; (* Print[rhs//FullForm];*) rhs diff --git a/Tools/CodeGen/Differencing.m b/Tools/CodeGen/Differencing.m index 609aa9d..1ac5ba8 100644 --- a/Tools/CodeGen/Differencing.m +++ b/Tools/CodeGen/Differencing.m @@ -168,8 +168,6 @@ CreateDifferencingHeader[derivOps_, zeroDims_] := pDefs = Union[Flatten[Map[First, mDefPairs]]]; expressions = Flatten[Map[#[[2]]&, mDefPairs]]; -(* expressions = Flatten[Map[ComponentDerivativeOperatorInlineDefinition, dupsRemoved]];*) - {pDefs,Map[{#, "\n"} &, expressions]}]; ordergfds[_[v1_,___], _[v2_,___]] := @@ -213,7 +211,10 @@ PrecomputeDerivative[d:pd_[gf_, inds___]] := evaluateDerivative[d:pd_[gf_, inds___]] := Module[{macroname}, macroName = ComponentDerivativeOperatorMacroName[pd[inds] -> expr]; - Return[ToString[macroName] <> "(" <> ToString[gf] <> ", i, j, k)"]]; + (* Return[ToString[macroName] <> "(" <> ToString[gf] <> ", i, j, k)"] *) + (* Return[ToString[macroName] <> "(" <> ToString[gf] <> ")"] *) + Return[ToString[macroName] <> "(&" <> ToString[gf] <> "[index])"] + ]; DeclareDerivative[d:pd_[gf_, inds___]] := DeclareVariable[GridFunctionDerivativeName[d], "// CCTK_REAL_VEC"]; @@ -248,7 +249,7 @@ sbpMacroDefinition[macroName_, d_] := <> "(i,j,k,sbp_" <> l <> "min,sbp_" <> l <> "max,d" <> ds <> ",u,q" <> ds <> ",cctkGH))"}] ]; ComponentDerivativeOperatorMacroDefinition[componentDerivOp:(name_[inds___] -> expr_)] := - Module[{macroName, rhs, rhs2, i = "i", j = "j", k = "k", spacings, spacings2, pat, ss, num, den, newnum, signModifier, quotient, liName, rhs3, rhs4}, + Module[{macroName, rhs, i = "i", j = "j", k = "k", spacings, spacings2, pat, ss, num, den, newnum, signModifier, quotient, liName}, macroName = ComponentDerivativeOperatorMacroName[componentDerivOp]; @@ -262,22 +263,23 @@ ComponentDerivativeOperatorMacroDefinition[componentDerivOp:(name_[inds___] -> e Return[sbpMacroDefinition[macroName, 3]]]; rhs = DifferenceGF[expr, i, j, k]; +(* Print["rhs1 == ", FullForm[rhs]];*) spacings = {spacing[1] -> 1/"dxi", spacing[2] -> 1/"dyi", spacing[3] -> 1/"dzi"}; spacings2 = {spacing[1] -> "dx", spacing[2] -> "dy", spacing[3] -> "dz"}; - rhs2 = FullSimplify[rhs]; + rhs = FullSimplify[rhs]; -(* Print["rhs2 == ", FullForm[rhs2]];*) +(* Print["rhs2 == ", FullForm[rhs]];*) pat = Times[spInExpr:(Power[spacing[_],_]..), (Rational[x_,y_])..., rest__]; (* Print["pat == ", pat//FullForm];*) - If[MatchQ[rhs2, pat], + If[MatchQ[rhs, pat], (* Print["matches!"];*) - ss = Times[rhs2 /. pat -> spInExpr]; + ss = Times[rhs /. pat -> spInExpr]; (* Print["ss == ", ss];*) - num = rhs2 /. pat -> x; - den = rhs2 /. pat -> y; + num = rhs /. pat -> x; + den = rhs /. pat -> y; (* Print["num == ", num]; Print["den == ", den];*) If[{num, 1, 2} === {1, 2},(* Print["SEQ!"]; *) newnum = 1; den=1; signModifier = "", @@ -303,39 +305,35 @@ ComponentDerivativeOperatorMacroDefinition[componentDerivOp:(name_[inds___] -> e liName = "p" <> signModifier <> quotient <> ToString[Apply[SequenceForm,Simplify[1/(ss /. spacings2)],{0,Infinity}]]; (* Print["liName == ", liName];*) - rhs3 = rhs2 /. pat -> Times[liName, rest], + (* rhs = rhs /. pat -> Times[liName, rest], *) + rhs = (rhs /. pat -> Times[liName, rest]) / liName, (* Print["!!!!!!!!DOES NOT MATCH!!!!!!!!!"];*) - rhs3 = rhs2]; + rhs = rhs]; -(* Print["rhs3 == ", rhs3];*) +(* Print["rhs3 == ", FullForm[rhs]];*) pDefs = {{liName -> CFormHideStrings[ReplacePowers[num / den ss /. spacings2]]}}; -(* rhs4 = Factor[rhs3];*) - - rhs4 = rhs3 //. (x_ a_ + x_ b_) -> x(a+b); - rhs5 = rhs4 //. (x_ a_ - x_ b_) -> x(a-b); +(* rhs = Factor[rhs];*) + rhs = rhs //. (x_ a_ + x_ b_) -> x (a+b); + rhs = rhs //. (x_ a_ - x_ b_) -> x (a-b); (* Print[componentDerivOp, ": "]; - Print[FullForm[rhs5]]; + Print[FullForm[rhs]]; Print[""];*) - rhs6 = CFormHideStrings[ReplacePowers[rhs5 /. spacings]]; - {pDefs, FlattenBlock[{"#define ", macroName, "(u,i,j,k) ", "(", rhs6, ")"}]}]; - -ComponentDerivativeOperatorInlineDefinition[componentDerivOp:(name_[inds___] -> expr_)] := - Module[{inlineName, rhs, rhs2, i = "i", j = "j", k = "k", spacings}, - - inlineName = ComponentDerivativeOperatorMacroName[componentDerivOp]; - - rhs = DifferenceGF[expr, i, j, k]; -(* rhs = DifferenceGFInline[expr, i, j, k];*) - spacings = {spacing[1] -> 1/"dxi", spacing[2] -> 1/"dyi", spacing[3] -> 1/"dzi"}; - rhs2 = CFormHideStrings[FullSimplify[ReplacePowers[rhs /. spacings]]]; - - DefineFunction[inlineName, "static inline CCTK_REAL", - "CCTK_REAL *u, int i, int j, int k", - {"return ", rhs2, ";\n"}]]; + rhs = CFormHideStrings[ReplacePowers[rhs /. spacings]]; + (* {pDefs, FlattenBlock[{"#define ", macroName, "(u,i,j,k) ", "(", rhs, ")"}]} *) + {pDefs, FlattenBlock[{ + "#ifndef KRANC_DIFF_FUNCTIONS\n", + (* default, differencing operators are macros *) + "# define ", macroName, "(u) ", "(fmul(", liName, ",", rhs, "))\n", + "#else\n", + (* new, differencing operators are static functions *) + "# define ", macroName, "(u) ", "(", liName, "*", macroName, "_impl((u),dj,dk))\n", + "static CCTK_REAL_VEC ", macroName, "_impl(CCTK_REAL const* restrict const u, int const dj, int const dk) ", "{ return ", rhs, "; }\n", + "#endif\n" + }]}]; ComponentDerivativeOperatorMacroName[componentDerivOp:(name_[inds___] -> expr_)] := Module[{stringName}, @@ -368,14 +366,6 @@ DifferenceGF[op_, i_, j_, k_] := Apply[Plus, Map[DifferenceGFTerm[#, i, j, k] &, expanded]], DifferenceGFTerm[expanded, i, j, k]]]; -DifferenceGFInline[op_, i_, j_, k_] := - Module[{expanded}, - expanded = Expand[op]; - - If[Head[expanded] === Plus, - Apply[Plus, Map[DifferenceGFTermInline[#, i, j, k] &, expanded]], - DifferenceGFTerm[expanded, i, j, k]]]; - (* Return the fragment of a macro definition for defining a derivative operator *) @@ -404,10 +394,31 @@ DifferenceGFTerm[op_, i_, j_, k_] := "(int)(" <> ToString[CFormHideStrings[j+ny]] <> ")," <> "(int)(" <> ToString[CFormHideStrings[k+nz]] <> "))]", *) - remaining "vec_loadu((u)[index" <> - "+di*(" <> ToString[CFormHideStrings[nx]] <> ")" <> +(* + remaining "vec_loadu_maybe(" <> ToString[CFormHideStrings[nx]] <> "," <> + "(u)[index" <> + "+di*(" <> ToString[CFormHideStrings[nx]] <> ")" <> + "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <> + "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])", +*) +(* + remaining "vec_loadu_maybe(" <> ToString[CFormHideStrings[nx]] <> "," <> + "(u)[(" <> ToString[CFormHideStrings[nx]] <> ")" <> "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <> "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])", +*) + remaining "vec_loadu_maybe3" <> + "(" <> ToString[CFormHideStrings[nx /. {dir1->1, dir2->1, dir3->1}]] <> "," <> + ToString[CFormHideStrings[ny /. {dir1->1, dir2->1, dir3->1}]] <> "," <> + ToString[CFormHideStrings[nz /. {dir1->1, dir2->1, dir3->1}]] <> "," <> + "(u)[(" <> ToString[CFormHideStrings[nx]] <> ")" <> + "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <> + "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])", +(* + remaining "vec_loadu(u[(" <> ToString[CFormHideStrings[nx]] <> ")" <> + "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <> + "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])", +*) (* remaining "(u)[CCTK_GFINDEX3D(cctkGH,floor((" <> ToString[CFormHideStrings[i+nx]] <> ")+0.5),floor((" <> @@ -417,27 +428,6 @@ DifferenceGFTerm[op_, i_, j_, k_] := remaining "u(" <> ToString[FortranForm[i+nx]] <> "," <> ToString[FortranForm[j+ny]] <> "," <> ToString[FortranForm[k+nz]] <> ")"] ]; -(* Return the fragment of a function definition for defining a derivative - operator *) -DifferenceGFTermInline[op_, i_, j_, k_] := - Module[{nx, ny, nz, remaining}, - - If[op === 0, - Return[0]]; - - nx = Exponent[op, shift[1]]; - ny = Exponent[op, shift[2]]; - nz = Exponent[op, shift[3]]; - - remaining = op / (shift[1]^nx) / (shift[2]^ny) / (shift[3]^nz); - - If[Cases[{remaining}, shift[_], Infinity] != {}, - ThrowError["Could not parse difference operator:", op]]; - - remaining "(u)[CCTK_GFINDEX3D(cctkGH," <> ToString[CFormHideStrings[i+nx]] <> "," <> - ToString[CFormHideStrings[j+ny]] <> "," <> ToString[CFormHideStrings[k+nz]] <> ")]" - ]; - DerivativeOperatorGFDs[gf_]; diff --git a/Tools/CodeGen/Kranc.m b/Tools/CodeGen/Kranc.m index fd07c53..21acd21 100644 --- a/Tools/CodeGen/Kranc.m +++ b/Tools/CodeGen/Kranc.m @@ -22,7 +22,11 @@ BeginPackage["Kranc`"]; (* CodeGen.m *) -{INV, SQR, CUB, QAD, exp, pow, fmax, fmin, dx, dy, dz, khalf, kthird, ktwothird, kfourthird, keightthird}; +{INV, SQR, CUB, QAD, IfThen, ToReal, sqrt, exp, pow, fmax, fmin, + fmadd, fmsub, fnmadd, fnmsub, fneg, fadd, fsub, fmul, fdiv, + kfabs, kfmax, kfmin, ksqrt, kexp, klog, kpow, + dir1, dir2, dir3, dx, dy, dz, + khalf, kthird, ktwothird, kfourthird, keightthird}; (* Helpers.m *) diff --git a/Tools/CodeGen/Thorn.m b/Tools/CodeGen/Thorn.m index 353be9c..4ada28a 100644 --- a/Tools/CodeGen/Thorn.m +++ b/Tools/CodeGen/Thorn.m @@ -476,11 +476,12 @@ CreateSchedule[globalStorageGroups_, scheduledGroups_, scheduledFunctions_] := calculationMacros[] := CommentedBlock["Define macros used in calculations", Map[{"#define ", #, "\n"} &, - {"INITVALUE (42)", - "INV(x) ((1.0) / (x))" , - "SQR(x) ((x) * (x))" , - "CUB(x) ((x) * (x) * (x))" , - "QAD(x) ((x) * (x) * (x) * (x))"}]]; + {"INITVALUE (42)", + "INV(x) (fdiv(ToReal(1.0),x))", + "SQR(x) (fmul(x,x))", + "CUB(x) (x*SQR(x))", + "QAD(x) (SQR(SQR(x)))" + }]]; (* Given a list of Calculation structures as defined above, create a CodeGen representation of a source file that defines a function for @@ -508,7 +509,7 @@ CreateSetterSource[calcs_, debug_, useCSE_, include_, imp_, ], Map[IncludeFile, Join[{"cctk.h", "cctk_Arguments.h", "cctk_Parameters.h", - (*"precomputations.h",*) "GenericFD.h", "Differencing.h", "Vectors.hh"}, include, + (*"precomputations.h",*) "GenericFD.h", "Vectors.hh", "Differencing.h"}, include, If[OptionValue[UseLoopControl], {"loopcontrol.h"}, {}]]], calculationMacros[], @@ -738,10 +739,10 @@ CreateMoLBoundariesSource[spec_] := "if (CCTK_EQUALS(" <> boundpar <> ", \"none\" ) ||\n", " CCTK_EQUALS(" <> boundpar <> ", \"static\") ||\n", " CCTK_EQUALS(" <> boundpar <> ", \"flat\" ) ||\n", - " CCTK_EQUALS(" <> boundpar <> ", \"zero\" ) ) \n", + " CCTK_EQUALS(" <> boundpar <> ", \"zero\" ) )\n", "{\n", - " ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, 1, -1, \n", + " ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, boundary_width, -1,\n", " \"" <> fullgroupname <> "\", " <> boundpar <> ");\n", " if (ierr < 0)\n", @@ -760,10 +761,10 @@ CreateMoLBoundariesSource[spec_] := "if (CCTK_EQUALS(" <> boundpar <> ", \"none\" ) ||\n", " CCTK_EQUALS(" <> boundpar <> ", \"static\") ||\n", " CCTK_EQUALS(" <> boundpar <> ", \"flat\" ) ||\n", - " CCTK_EQUALS(" <> boundpar <> ", \"zero\" ) ) \n", + " CCTK_EQUALS(" <> boundpar <> ", \"zero\" ) )\n", "{\n", - " ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, -1, \n", + " ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, boundary_width, -1,\n", " \"" <> fullgfname <> "\", " <> boundpar <> ");\n", " if (ierr < 0)\n", @@ -796,7 +797,7 @@ CreateMoLBoundariesSource[spec_] := " CCTK_WARN(0, \"could not set SPEED value in table!\");\n", "\n", - " ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n", + " ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n", " \"" <> fullgroupname <> "\", \"Radiation\");\n\n", " if (ierr < 0)\n", @@ -830,7 +831,7 @@ CreateMoLBoundariesSource[spec_] := " CCTK_WARN(0, \"could not set SPEED value in table!\");\n", "\n", - " ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n", + " ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n", " \"" <> fullgfname <> "\", \"Radiation\");\n\n", " if (ierr < 0)\n", @@ -859,7 +860,7 @@ CreateMoLBoundariesSource[spec_] := " CCTK_WARN(0, \"could not set SCALAR value in table!\");\n", "\n", - " ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n", + " ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n", " \"" <> fullgroupname <> "\", \"scalar\");\n\n", " if (ierr < 0)\n", @@ -889,7 +890,7 @@ CreateMoLBoundariesSource[spec_] := " CCTK_WARN(0, \"could not set SCALAR value in table!\");\n", "\n", - " ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n", + " ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n", " \"" <> fullgfname <> "\", \"scalar\");\n\n", " if (ierr < 0)\n", |