aboutsummaryrefslogtreecommitdiff
path: root/Auxiliary/Cactus/KrancNumericalTools
diff options
context:
space:
mode:
authorIan Hinder <ian.hinder@aei.mpg.de>2010-11-22 11:20:48 +0100
committerIan Hinder <ian.hinder@aei.mpg.de>2010-11-23 00:19:37 +0100
commitd6c4d4c2131107ef3a4004692823e2041394acd6 (patch)
treebbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19 /Auxiliary/Cactus/KrancNumericalTools
parentc9431558d69ee1b8769c918044a45bdd50520b46 (diff)
Implement vectorisation
This is Erik's vectorisation working tree from 13-Oct-2010
Diffstat (limited to 'Auxiliary/Cactus/KrancNumericalTools')
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h59
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh135
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh194
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh111
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh212
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh31
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh104
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh591
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh72
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh14
-rw-r--r--Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh238
11 files changed, 1535 insertions, 226 deletions
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
index 29c89d1..ee6a3b7 100644
--- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
@@ -1,17 +1,17 @@
-#define Power(x, y) (pow((x), (y)))
+#define Power(x, y) (pow(x,y))
#define Sqrt(x) (sqrt(x))
#ifdef KRANC_C
-#define Abs(x) (fabs(x))
-#define Min(x, y) (fmin((x), (y)))
-#define Max(x, y) (fmax((x), (y)))
-#define IfThen(x,y,z) ((x) ? (y) : (z))
+# define Abs(x) (fabs(x))
+# define Min(x, y) (fmin(x,y))
+# define Max(x, y) (fmax(x,y))
+# define IfThen(x,y,z) ((x) ? (y) : (z))
#else
-#define Abs(x) (abs(x))
-#define Min(x, y) (min((x), (y)))
-#define Max(x, y) (max((x), (y)))
+# define Abs(x) (abs(x))
+# define Min(x, y) (min(x,y))
+# define Max(x, y) (max(x,y))
/* IfThen cannot be expressed in Fortran */
#endif
@@ -31,17 +31,46 @@
#define Tanh(x) (tanh(x))
#ifdef KRANC_C
-#define Sign(x) (signbit(x)?-1:+1)
+# define Sign(x) ((x)<0?-1:+1)
+# define ToReal(x) ((CCTK_REAL)(x))
#else
-#define Sign(x) (sgn(x))
+# define Sign(x) (sgn(x))
+# define ToReal(x) (real((x),kind(khalf)))
#endif
+/* TODO: use fma(x,y,z) to implement fmadd and friends? Note that fma
+ may be unsupported, or may be slow. */
+
+/* #define fmadd(x,y,z) ((x)*(y)+(z)) */
+/* #define fmsub(x,y,z) ((x)*(y)-(z)) */
+/* #define fnmadd(x,y,z) (-(z)-(x)*(y)) */
+/* #define fnmsub(x,y,z) (+(z)-(x)*(y)) */
+
+#define fneg(x) (-(x))
+#define fmul(x,y) ((x)*(y))
+#define fdiv(x,y) ((x)/(y))
+#define fadd(x,y) ((x)+(y))
+#define fsub(x,y) ((x)-(y))
+
+#define fmadd(x,y,z) (fadd(fmul(x,y),z))
+#define fmsub(x,y,z) (fsub(fmul(x,y),z))
+#define fnmadd(x,y,z) (fsub(fneg(z),fmul(x,y)))
+#define fnmsub(x,y,z) (fsub(z,fmul(x,y)))
+
+#define kexp(x) (exp(x))
+#define kfabs(x) (fabs(x))
+#define kfmax(x,y) (fmax(x,y))
+#define kfmin(x,y) (fmin(x,y))
+#define klog(x) (log(x))
+#define kpow(x,y) (pow(x,y))
+#define ksqrt(x) (sqrt(x))
+
#ifdef KRANC_C
-#define E M_E
-#define Pi M_PI
+# define E M_E
+# define Pi M_PI
#else
-#define E 2.71828182845904523536029d0
-#define Pi 3.14159265358979323846264d0
+# define E 2.71828182845904523536029d0
+# define Pi 3.14159265358979323846264d0
#endif
-#define UnitStep(x) ( (x) > 0 ? 1 : 0 )
+#define UnitStep(x) ((x)>0)
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh
new file mode 100644
index 0000000..12cd6e8
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh
@@ -0,0 +1,135 @@
+// Vectorise using Intel's or AMD's SSE2
+
+// Use the type __m128d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+typedef __m128d CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+#define vec_set1(a) (_mm_set1_pd(a))
+#define vec_set(a,b) (_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+// # define vec_elt0(x) (*(CCTK_REAL const*)&(x))
+# define vec_elt0(x) ({ CCTK_REAL a_elt0; asm ("" : "=x" (a_elt0) : "0" (x)); a_elt0; })
+#else
+// this is a no-op
+# define vec_elt0(x) (_mm_cvtsd_f64(x))
+#endif
+#define vec_elt1(x_) ({ CCTK_REAL_VEC const x_elt1=(x_); vec_elt0(_mm_unpackhi_pd(x_elt1,x_elt1)); })
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec_load(p) (_mm_load_pd(&(p)))
+#define vec_loadu(p) (_mm_loadu_pd(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec_loadu_maybe(off,p) (vec_loadu(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_loadu(p))
+#if 0
+#define vec_loadu_maybe(off,p) \
+ (!((off)&(CCTK_REAL_VEC_SIZE-1)) ? \
+ vec_load(p) : vec_loadu(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) \
+ (!((off1)&(CCTK_REAL_VEC_SIZE-1)) && \
+ !((off2)&(CCTK_REAL_VEC_SIZE-1)) && \
+ !((off3)&(CCTK_REAL_VEC_SIZE-1)) ? \
+ vec_load(p) : vec_loadu(p))
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec_store(p,x) (_mm_store_pd(&(p),x))
+#define vec_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if defined(KRANC_CACHE)
+# define vec_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#else
+# define vec_store_nta(p,x) (_mm_store_pd(&(p),x))
+#endif
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+#define vec_store_nta_partial_hi(p,x,n) (_mm_storeh_pd((&(p))+1,x))
+
+
+
+// Functions and operators
+
+// Operators
+#undef fneg
+#undef fmul
+#undef fdiv
+#undef fadd
+#undef fsub
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+ unsigned long long s[CCTK_REAL_VEC_SIZE];
+ CCTK_REAL_VEC v;
+} vec_neg_mask_impl = {0x8000000000000000ULL, 0x8000000000000000ULL};
+# define vec_neg_mask (vec_neg_mask_impl.v)
+#else
+# define vec_neg_mask ((CCTK_REAL_VEC)(__m128i){0x8000000000000000ULL, 0x8000000000000000ULL})
+#endif
+#define fneg(x) (_mm_xor_pd(x,vec_neg_mask))
+#define fmul(x,y) (_mm_mul_pd(x,y))
+#define fdiv(x,y) (_mm_div_pd(x,y))
+#define fadd(x,y) (_mm_add_pd(x,y))
+#define fsub(x,y) (_mm_sub_pd(x,y))
+
+// Cheap functions
+#undef kfabs
+#undef kfmax
+#undef kfmin
+#undef ksqrt
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+ unsigned long long s[CCTK_REAL_VEC_SIZE];
+ CCTK_REAL_VEC v;
+} vec_fabs_mask_impl = {0x7fffffffffffffffULL, 0x7fffffffffffffffULL};
+# define vec_fabs_mask (vec_fabs_mask_impl.v)
+#else
+# define vec_fabs_mask ((CCTK_REAL_VEC)(__m128i){0x7fffffffffffffffULL, 0x7fffffffffffffffULL})
+#endif
+#define kfabs(x) (_mm_and_pd(x,vec_fabs_mask))
+#define kfmax(x,y) (_mm_max_pd(x,y))
+#define kfmin(x,y) (_mm_min_pd(x,y))
+#define ksqrt(x) (_mm_sqrt_pd(x))
+
+// Expensive functions
+#undef kexp
+#undef klog
+#undef kpow
+#define kexp(x_) ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); })
+#define klog(x_) ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); })
+#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); })
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1(x))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh
new file mode 100644
index 0000000..4a4eea6
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh
@@ -0,0 +1,194 @@
+// Vectorise using Intel's or AMD's SSE2
+
+
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+ // The underlying scalar and vector types
+ typedef double S;
+ typedef __m128d V;
+ V v;
+
+ // Convert from and to the underlying vector type
+ inline CCTK_REAL_VEC(V const v_): v(v_) { }
+ inline operator V const() const { return v; }
+
+ inline CCTK_REAL_VEC() { }
+
+ // Copy constructor
+ inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+// DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; _mm_store_sd(&a,x); a; }))
+// DEFINE_FUNCTION_V_R(vec_elt0,(*(CCTK_REAL const*)&x))
+// This generates the fastest code with PGI compilers
+DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; asm ("" : "=x" (a) : "0" (x)); a; }))
+#else
+DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) // this is a no-op
+#endif
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_unpackhi_pd(x,x)))
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+ return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+ return vec_load(p);
+}
+// Call the implementation with the modulus
+#define vec_loadu_maybe(off,p) \
+ (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p)))
+#define vec_loadu_maybe3(off1,off2,off3,p) \
+ (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \
+ (off2)&(CCTK_REAL_VEC_SIZE-1), \
+ (off3)&(CCTK_REAL_VEC_SIZE-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_storeu,_mm_storeu_pd(&p,x))
+#if defined(KRANC_CACHE)
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+#else
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_store_pd(&p,x))
+#endif
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+ switch (n) {
+ case 1: _mm_storel_pd(&p,x); break;
+ default: assert(0);
+ }
+}
+static inline
+void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+ switch (n) {
+ case 1: _mm_storeh_pd((&p)+1,x); break;
+ default: assert(0);
+ }
+}
+
+
+
+// Functions and operators
+
+// Single-argument operators
+#if 0
+DEFINE_FUNCTION_V_V(operator+,x)
+static CCTK_REAL_VEC const vec_neg_mask =
+ (CCTK_REAL_VEC::V)(__m128i) { 0x8000000000000000ULL, 0x8000000000000000ULL };
+DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask))
+#endif
+DEFINE_FUNCTION_V_V(operator+,+x.v)
+DEFINE_FUNCTION_V_V(operator-,-x.v)
+
+// Double-argument operators, both vectors
+#if 0
+DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
+#endif
+DEFINE_FUNCTION_VV_V(operator+,x.v+y.v)
+DEFINE_FUNCTION_VV_V(operator-,x.v-y.v)
+DEFINE_FUNCTION_VV_V(operator*,x.v*y.v)
+DEFINE_FUNCTION_VV_V(operator/,x.v/y.v)
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Cheap functions
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+ CCTK_REAL_VEC::S s[CCTK_REAL_VEC_SIZE];
+ CCTK_REAL_VEC::V v;
+} vec_fabs_mask_impl = { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL };
+# define vec_fabs_mask (vec_fabs_mask_impl.v)
+#else
+static CCTK_REAL_VEC const vec_fabs_mask =
+ (CCTK_REAL_VEC::V)(__m128i) { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL };
+#endif
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) vec_set1(x)
+
+#if defined(__PGI) && defined (__amd64__)
+// Special case for PGI 9.0.4 to avoid an internal compiler error
+#undef IfThen
+static inline
+CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const x, CCTK_REAL_VEC const y)
+{
+ union {
+ __m128i vi;
+ CCTK_REAL_VEC::V v;
+ } mask;
+ mask.vi = _mm_set1_epi64x(-(long long)cond);
+ return _mm_or_pd(_mm_and_pd(x.v, mask.v), _mm_andnot_pd(mask.v, y.v));
+}
+#endif
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh
new file mode 100644
index 0000000..7e06017
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh
@@ -0,0 +1,111 @@
+// Vectorise using IBM's Altivec
+
+// Use the type vector double directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+typedef vector double CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+#define vec_set1(a) (vec_splats(a))
+#if defined(__GNUC__)
+// GNU doesn't support array indices on vectors
+union vec_mask {
+ double elts[2];
+ vector double v;
+};
+# define vec_set(a,b) ({ vec_mask x_set; x_set.elts[0]=(a); x_set.elts[1]=(b); x_set.v; })
+#else
+# define vec_set(a,b) ({ CCTK_REAL_VEC x_set; x_set[0]=(a); x_set[1]=(b); x_set; })
+#endif
+
+// Get a scalar from the vector
+#if defined(__GNUC__)
+// GNU doesn't support array indices on vectors
+# define vec_elt0(x) ({ vec_mask x_elt0; x_elt0.v=(x); x_elt0.elts[0]; })
+# define vec_elt1(x) ({ vec_mask x_elt1; x_elt1.v=(x); x_elt1.elts[1]; })
+#else
+# define vec_elt0(x) ((x)[0])
+# define vec_elt1(x) ((x)[1])
+#endif
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec_load(p) (*(CCTK_REAL_VEC const*)&(p))
+#define vec_loadu(p) (vec_load(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (vec_load(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec_store(p,x) (*(CCTK_REAL_VEC*)&(p)=(x))
+#define vec_storeu(p,x) (*(CCTK_REAL_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec_store_nta(p,x) vec_store(p,x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) ((p)=vec_elt0(x))
+#define vec_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec_elt1(x))
+
+
+
+// Functions and operators
+
+// Other Altivec functions are:
+// nabs: -abs a
+// madd msub nmadd nmsub: [+-]a*b[+-]c
+
+// Triple-argument operators, all vectors
+#undef fmadd
+#undef fmsub
+#undef fnmadd
+#undef fnmsub
+#define fmadd(x,y,z) (vec_madd(x,y,z))
+#define fmsub(x,y,z) (vec_msub(x,y,z))
+#define fnmadd(x,y,z) (vec_nmadd(x,y,z))
+#define fnmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#undef kfabs
+#undef kfmax
+#undef kfmin
+#define kfabs(x) (vec_abs(x))
+#define kfmax(x,y) (vec_max(x,y))
+#define kfmin(x,y) (vec_min(x,y))
+
+// Expensive functions
+#undef kexp
+#undef klog
+#undef kpow
+#undef ksqrt
+#define kexp(x_) ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); })
+#define klog(x_) ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); })
+#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); })
+#define ksqrt(x_) ({ CCTK_REAL_VEC const x_sqrt=(x_); vec_set(sqrt(vec_elt0(x_sqrt)),sqrt(vec_elt1(x_sqrt))); })
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1((CCTK_REAL)(x)))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh
new file mode 100644
index 0000000..f591647
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh
@@ -0,0 +1,212 @@
+// Vectorise using IBM's Altivec
+
+
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+ // The underlying scalar and vector types
+ typedef double S;
+ typedef vector double V;
+ V v;
+ union vec_mask {
+ S elts[2];
+ V v;
+ };
+
+ // Set a vector from scalars
+#if 0
+ // IBM
+ inline CCTK_REAL_VEC(S const a, S const b) { v[0]=a; v[1]=b; }
+#endif
+#if 0
+ inline CCTK_REAL_VEC(S const a, S const b):
+ v(vec_mergel(vec_splats(a), vec_splats(b))) { }
+#endif
+ inline CCTK_REAL_VEC(S const a, S const b)
+ {
+ vec_mask x;
+ x.elts[0] = a;
+ x.elts[1] = b;
+ v = x.v;
+ }
+
+ // Set a vector from a scalar, replicating the scalar
+ // Note: Could also use vec_xlds instead
+ inline CCTK_REAL_VEC(S const a): v(vec_splats(a)) { }
+
+ // Convert from and to the underlying vector type
+ inline CCTK_REAL_VEC(V const v_): v(v_) { }
+ inline operator V const() const { return v; }
+
+ inline CCTK_REAL_VEC() { }
+
+ // Copy constructor
+ inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a))
+DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b))
+
+// Get a scalar from the vector
+#if 0
+// IBM
+DEFINE_FUNCTION_V_R(vec_elt0,x.v[0])
+DEFINE_FUNCTION_V_R(vec_elt1,x.v[1])
+#endif
+static inline CCTK_REAL vec_elt0(CCTK_REAL_VEC const x)
+{
+ CCTK_REAL_VEC::vec_mask x1;
+ x1.v = x;
+ return x1.elts[0];
+}
+static inline CCTK_REAL vec_elt1(CCTK_REAL_VEC const x)
+{
+ CCTK_REAL_VEC::vec_mask x1;
+ x1.v = x;
+ return x1.elts[1];
+}
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,p)
+#if 0
+// IBM
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p)))
+#endif
+DEFINE_FUNCTION_PR_V(vec_loadu,p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+ return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+ return vec_load(p);
+}
+// Call the implementation with the modulus
+#define vec_loadu_maybe(off,p) \
+ (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p)))
+#define vec_loadu_maybe3(off1,off2,off3,p) \
+ (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1), \
+ (off2)&(CCTK_REAL_VEC_SIZE-1), \
+ (off3)&(CCTK_REAL_VEC_SIZE-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x)
+DEFINE_FUNCTION_PRV(vec_storeu,*(CCTK_REAL_VEC::V*)&p=x)
+// TODO: Use stvxl instruction?
+DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+ switch (n) {
+ case 1: p=vec_elt0(x); break;
+ default: assert(0);
+ }
+}
+static inline
+void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+ switch (n) {
+ case 1: (&p)[1]=vec_elt1(x); break;
+ default: assert(0);
+ }
+}
+
+
+
+// Functions and operators
+
+// Other Altivec functions are:
+// nabs: -abs a
+// madd msub nmadd nmsub: [+-]a*b[+-]c
+
+// Single-argument operators
+#if 0
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,vec_neg(x))
+#endif
+DEFINE_FUNCTION_V_V(operator+,+x.v)
+DEFINE_FUNCTION_V_V(operator-,-x.v)
+
+// Double-argument operators, both vectors
+#if 0
+DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y))
+DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y))
+DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y))
+DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y))
+#endif
+DEFINE_FUNCTION_VV_V(operator+,x.v+y.v)
+DEFINE_FUNCTION_VV_V(operator-,x.v-y.v)
+DEFINE_FUNCTION_VV_V(operator*,x.v*y.v)
+DEFINE_FUNCTION_VV_V(operator/,x.v/y.v)
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Triple-argument operators, all vectors
+#undef fmadd
+#undef fmsub
+#undef fnmadd
+#undef fnmsub
+DEFINE_FUNCTION_VVV_V(fmadd,vec_madd(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fmsub,vec_msub(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fnmadd,vec_nmadd(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fnmsub,vec_nmsub(x.v,y.v,z.v))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,vec_abs(x.v))
+DEFINE_FUNCTION_VV_V(fmax,vec_max(x.v,y.v))
+DEFINE_FUNCTION_VV_V(fmin,vec_min(x.v,y.v))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x))))
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1(x))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh
new file mode 100644
index 0000000..f928ed8
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh
@@ -0,0 +1,31 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+
+
+// Use CCTK_REAL
+typedef CCTK_REAL CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static int const CCTK_REAL_VEC_SIZE = 1;
+
+
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions (e.g. PGI). This should also make
+// debug builds (which may not inline) more efficient.
+
+#define vec_load(p) (p)
+#define vec_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (p)
+#define vec_loadu_maybe3(off1,off2,off3,p) (p)
+
+#define vec_store(p,x) ((p)=(x))
+#define vec_store_nta(p,x) ((p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) (assert(0))
+#define vec_store_nta_partial_hi(p,x,n) (assert(0))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh
new file mode 100644
index 0000000..f5c0b22
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh
@@ -0,0 +1,104 @@
+// Define some macros that simplify defining short function that are
+// supposed to be inlined
+
+
+
+// Letters defining the prototype (argument and return value types):
+// I: i,j: integer
+// R: a,b: real
+// V: x,y: vector (of real)
+// P: p,q: pointer (i.e. const reference) to something
+// L: l,m: L-value (i.e. non-const reference) to something
+
+
+
+// Load and store
+
+#define DEFINE_FUNCTION_PR_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const& p) \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_PRV(name,expr) \
+static inline \
+void name (CCTK_REAL& p, CCTK_REAL_VEC const x) \
+{ \
+ expr; \
+}
+
+#define DEFINE_FUNCTION_PVR(name,expr) \
+static inline \
+void name (CCTK_REAL_VEC& p, CCTK_REAL const a) \
+{ \
+ expr; \
+}
+
+
+
+// Functions and operators
+
+#define DEFINE_FUNCTION_V_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_V_R(name,expr) \
+static inline \
+CCTK_REAL name (CCTK_REAL_VEC const x) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_R_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const a) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_VV_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_VR_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL const a) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_RV_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL_VEC const x) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_RR_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL const b) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_VVV_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y, CCTK_REAL_VEC const z) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh
new file mode 100644
index 0000000..df83b3a
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh
@@ -0,0 +1,591 @@
+#ifndef VECTORS_HH
+#define VECTORS_HH
+
+
+
+// Vectorisation
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include <cctk.h>
+
+
+
+// I: i,j: integer
+// R: a,b: real
+// V: x,y: vector (of real)
+// P: p,q: pointer (i.e. const reference) to something
+// L: l,m: L-value (i.e. non-const reference) to something
+
+#define DEFINE_FUNCTION_PR_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const& p) \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_PRV(name,expr) \
+static inline \
+void name (CCTK_REAL& p, CCTK_REAL_VEC const& x) \
+{ \
+ expr; \
+}
+
+#define DEFINE_FUNCTION_PVR(name,expr) \
+static inline \
+void name (CCTK_REAL_VEC& p, CCTK_REAL const& a) \
+{ \
+ expr; \
+}
+
+#define DEFINE_FUNCTION_V_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_V_R(name,expr) \
+static inline \
+CCTK_REAL name (CCTK_REAL_VEC const& x) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_R_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const& a) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_VV_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_VR_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_RV_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+#define DEFINE_FUNCTION_RR_V(name,expr) \
+static inline \
+CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b) \
+ CCTK_ATTRIBUTE_PURE \
+{ \
+ return expr; \
+}
+
+
+
+// Intel, double
+#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8)
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+ // The underlying scalar and vector types
+ typedef double S;
+ typedef __m128d V;
+ V v;
+
+ // Set a vector from scalars
+ inline CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(b,a)) { }
+
+ // Set a vector from a scalar, replicating the scalar
+ inline CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { }
+
+ // Convert from and to the underlying vector type
+ inline CCTK_REAL_VEC(V const& v_): v(v_) { }
+ inline operator V const() const { return v; }
+
+ inline CCTK_REAL_VEC() { }
+
+ // Copy constructor
+ inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+union vec_mask {
+ unsigned long long bits[2];
+ CCTK_REAL_VEC::V v;
+};
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+static inline
+CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x)
+{
+ CCTK_REAL a; _mm_store_sd(&a,x); return a;
+}
+#else
+DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) //this is a no-op
+#endif
+
+#if 0
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1))))
+#endif
+static inline
+CCTK_REAL vec_elt1 (CCTK_REAL_VEC const& x)
+{
+ CCTK_REAL a; _mm_storeh_pd(&a,x); return a;
+}
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+#if 0
+// Load a partial vector (duplicating the last loaded element to fill
+// the remaining elements)
+// TODO: Should this be aligned or unaligned?
+static inline
+CCTK_REAL_VEC vec_load_partial (CCTK_REAL const& p, int const n)
+{
+ switch (n) {
+ case 1: return _mm_load1_pd(p);
+ default: assert(0);
+ }
+}
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+ return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+ return vec_load(p);
+}
+// Call the implementation with the modulus
+template<int off>
+static inline
+CCTK_REAL_VEC vec_loadu_maybe (CCTK_REAL const& p)
+{
+ return vec_loadu_maybe_impl<off&(2-1)>(p);
+}
+template<int off1,int off2, int off3>
+static inline
+CCTK_REAL_VEC vec_loadu_maybe3 (CCTK_REAL const& p)
+{
+ return vec_loadu_maybe_impl3<off1&(2-1),off2&(2-1),off3&(2-1)>(p);
+}
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+ switch (n) {
+ case 1: _mm_storel_pd(&p,x); break;
+ default: assert(0);
+ }
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+ switch (n) {
+ case 1: _mm_storeh_pd((&p)+1,x); break;
+ default: assert(0);
+ }
+}
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+#if 0
+DEFINE_FUNCTION_V_V(operator-,vec_set(0.0,0.0)-x)
+#endif
+static vec_mask const vec_neg_mask =
+{ { 0x8000000000000000ULL, 0x8000000000000000ULL } };
+DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask.v))
+
+// Cheap functions
+static vec_mask const vec_fabs_mask =
+{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+// Special case for PGI to avoid internal compiler error
+#if defined(__PGI) && defined (__amd64__)
+#undef IfThen
+CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const& x, CCTK_REAL_VEC co\
+nst& y)
+{
+ return cond*x + (not cond)*y;
+}
+#endif
+
+
+
+#if 0
+// Try to use the __m128d type directly.
+
+// This does not really work, because it is not possible to define
+// automatic conversion operators from double to __m128d, so that
+// explicit conversions are required. This makes the code look more
+// clumsy.
+
+// Vector type corresponding to CCTK_REAL
+typedef __m128d CCTK_REAL_VEC;
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+static inline
+CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x)
+{
+#if 0
+ // _mm_cvtsd_f64 does not exist on PGI compilers
+ return _mm_cvtsd_f64(x); // this is a no-op
+#endif
+ CCTK_REAL a; _mm_store_sd(&a,x); return a;
+}
+
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1))))
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+
+// Cheap functions
+static vec_mask const vec_fabs_mask =
+{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+#endif
+
+
+
+// Intel, float
+#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4)
+
+#include <xmmintrin.h>
+
+// A vector type corresponding to CCTK_REAL
+typedef __m128 CCTK_REAL_VEC;
+
+
+
+// Power, double
+#elif defined(KRANC_VECTORS) && defined(__ALTIVEC__) && defined(_ARCH_PWR7) && defined(CCTK_REAL_PRECISION_8)
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+ // The underlying scalar and vector types
+ typedef double S;
+ typedef vector double V;
+ V v;
+
+ // vec_insert, vec_extract, vec_splat
+
+ // Set a vector from scalars
+ inline CCTK_REAL_VEC(S const& a, S const& b) { v[0]=a; v[1]=b; }
+
+ // Set a vector from a scalar, replicating the scalar
+ inline CCTK_REAL_VEC(S const& a): v(vec_splats(a)) { }
+
+ // Convert from and to the underlying vector type
+ inline CCTK_REAL_VEC(V const& v_): v(v_) { }
+ inline operator V const() const { return v; }
+
+ inline CCTK_REAL_VEC() { }
+
+ // Copy constructor
+ inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a))
+DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b))
+
+// Get a scalar from the vector
+DEFINE_FUNCTION_V_R(vec_elt0,x.v[0])
+DEFINE_FUNCTION_V_R(vec_elt1,x.v[1])
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p)))
+// vec_xlds
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+#define static
+template<>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl<0>,vec_load(p))
+#undef static
+// Call the implementation with the modulus
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe,vec_loadu_maybe_impl<off&(2-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+ switch (n) {
+ case 1: p=x.v[0]; break;
+ default: assert(0);
+ }
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+ switch (n) {
+ case 1: (&p)[1]=x.v[1]; break;
+ default: assert(0);
+ }
+}
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y))
+DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y))
+DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y))
+DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,vec_neg(x))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,vec_abs(x))
+DEFINE_FUNCTION_VV_V(fmax,vec_max(x,y))
+DEFINE_FUNCTION_VV_V(fmin,vec_min(x,y))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x))))
+
+
+
+// Fallback: pseudo-vectorisation
+#elif 0
+
+// There is no vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+ // The underlying scalar and vector types
+ CCTK_REAL v, w;
+
+ // Set a vector from scalars
+ inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { }
+
+ // Set a vector from a scalar, replicating the scalar
+ inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { }
+
+ inline CCTK_REAL_VEC() { }
+
+ // Copy constructor
+ inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x.v), w(x.w) { }
+};
+
+
+
+DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p))
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadm,vec_load(p))
+
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x))
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v+y.v,x.w+y.w))
+DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v-y.v,x.w-y.w))
+DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v*y.v,x.w*y.w))
+DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v/y.v,x.w/y.w))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v+a,x.w+a))
+DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v-a,x.w-a))
+DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v*a,x.w*a))
+DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v/a,x.w/a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v,a+x.w))
+DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v,a-x.w))
+DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v,a*x.w))
+DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v,a/x.w))
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v,-x.w))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v),fabs(x.w)))
+DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v,y.v),fmax(x.w,y.w)))
+DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v,y.v),fmin(x.w,y.w)))
+DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v),sqrt(x.w)))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v),exp(x.w)))
+DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v),log(x.w)))
+DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v,a),pow(x.w,a)))
+
+
+
+// Fallback: no vectorisation
+#else
+
+// There is no vector type corresponding to CCTK_REAL
+typedef CCTK_REAL CCTK_REAL_VEC;
+
+
+
+DEFINE_FUNCTION_PR_V(vec_load,p)
+DEFINE_FUNCTION_PR_V(vec_loadu,p)
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe,p)
+template<int off1,int off2, int off3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe3,p)
+
+DEFINE_FUNCTION_PRV(vec_store,p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+ assert(0);
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+ assert(0);
+}
+
+
+
+#endif
+
+
+
+#undef DEFINE_FUNCTION_PR_V
+#undef DEFINE_FUNCTION_PRV
+#undef DEFINE_FUNCTION_V_V
+#undef DEFINE_FUNCTION_R_V
+#undef DEFINE_FUNCTION_VV_V
+#undef DEFINE_FUNCTION_VR_V
+#undef DEFINE_FUNCTION_RV_V
+#undef DEFINE_FUNCTION_RR_V
+
+
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+#endif // #ifndef VECTORS_HH
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh
new file mode 100644
index 0000000..f439c9b
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh
@@ -0,0 +1,72 @@
+// Pseudo vectorisation using scalar operations
+
+
+
+// Number of vector elements in a CCTK_REAL_VEC
+static int const CCTK_REAL_VEC_SIZE = 2;
+
+// There is no vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+ // The underlying scalar and vector types
+ CCTK_REAL v[CCTK_REAL_VEC_SIZE];
+
+ // Set a vector from scalars
+ inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { }
+
+ // Set a vector from a scalar, replicating the scalar
+ inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { }
+
+ inline CCTK_REAL_VEC() { }
+
+ // Copy constructor
+ inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x) { v[0]=x.v[0]; v[1]=x.v[1]; }
+};
+
+
+
+// Load and store vectors
+
+DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p))
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (vec_load(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p))
+
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x))
+
+
+
+// Functions and operators
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v[0]+y.v[0],x.v[1]+y.v[1]))
+DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v[0]-y.v[0],x.v[1]-y.v[1]))
+DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v[0]*y.v[0],x.v[1]*y.v[1]))
+DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v[0]/y.v[0],x.v[1]/y.v[1]))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v[0]+a,x.v[1]+a))
+DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v[0]-a,x.v[1]-a))
+DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v[0]*a,x.v[1]*a))
+DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v[0]/a,x.v[1]/a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v[0],a+x.v[1]))
+DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v[0],a-x.v[1]))
+DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v[0],a*x.v[1]))
+DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v[0],a/x.v[1]))
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v[0],-x.v[1]))
+
+// Functions
+DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v[0]),exp(x.v[1])))
+DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v[0]),fabs(x.v[1])))
+DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v[0],y.v[0]),fmax(x.v[1],y.v[1])))
+DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v[0],y.v[0]),fmin(x.v[1],y.v[1])))
+DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v[0]),log(x.v[1])))
+DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v[0],a),pow(x.v[1],a)))
+DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v[0]),sqrt(x.v[1])))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh
new file mode 100644
index 0000000..0d950c7
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh
@@ -0,0 +1,14 @@
+// Undefine all macros defined in "Vectors-define.hh", so that we
+// leave a clean namespace
+
+
+
+#undef DEFINE_FUNCTION_PR_V
+#undef DEFINE_FUNCTION_PRV
+#undef DEFINE_FUNCTION_V_V
+#undef DEFINE_FUNCTION_R_V
+#undef DEFINE_FUNCTION_VV_V
+#undef DEFINE_FUNCTION_VR_V
+#undef DEFINE_FUNCTION_RV_V
+#undef DEFINE_FUNCTION_RR_V
+#undef DEFINE_FUNCTION_VVV_V
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
index 3fb77e1..d32afb2 100644
--- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
@@ -5,231 +5,47 @@
// Vectorisation
+#include <assert.h>
#include <math.h>
#include <stdlib.h>
-#include <cctk.h>
-
-
-
-// I: i,j: integer
-// R: a,b: real
-// V: x,y: vector (of real)
-// P: p,q: pointer (i.e. const reference) to something
-// L: l,m: L-value (i.e. non-const reference) to something
-
-#define DEFINE_FUNCTION_PR_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL const& p) \
-{ \
- return expr; \
-}
-
-#define DEFINE_FUNCTION_PRV(name,expr) \
-inline \
-void name (CCTK_REAL& p, CCTK_REAL_VEC const& x) \
-{ \
- expr; \
-}
-
-#define DEFINE_FUNCTION_PVR(name,expr) \
-inline \
-void name (CCTK_REAL_VEC& p, CCTK_REAL const& a) \
-{ \
- expr; \
-}
-
-#define DEFINE_FUNCTION_V_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return CCTK_REAL_VEC(expr); \
-}
-
-#define DEFINE_FUNCTION_V_R(name,expr) \
-inline \
-CCTK_REAL name (CCTK_REAL_VEC const& x) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return expr; \
-}
-
-#define DEFINE_FUNCTION_R_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL const& a) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return expr; \
-}
-
-#define DEFINE_FUNCTION_VV_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return expr; \
-}
-
-#define DEFINE_FUNCTION_VR_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return expr; \
-}
-
-#define DEFINE_FUNCTION_RV_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return expr; \
-}
-
-#define DEFINE_FUNCTION_RR_V(name,expr) \
-inline \
-CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b) \
- CCTK_ATTRIBUTE_PURE \
-{ \
- return expr; \
-}
-
-
-
-// Intel, double
-#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8)
-
-#include <emmintrin.h>
-
-// Vector type corresponding to CCTK_REAL
-struct CCTK_REAL_VEC {
- // The underlying scalar and vector types
- typedef double S;
- typedef __m128d V;
- static int const n = sizeof(V)/sizeof(S);
- V v;
-
- // Set a vector from scalars
- CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(a,b)) { };
-
- // Get a scalar from the vector
- S elt0() const { return _mm_cvtsd_f64(v); /* this is a no-op */ }
- S elt1() const { return _mm_cvtsd_f64(_mm_shuffle_pd(v,v,_MM_SHUFFLE2(1,1))); }
-
- // Set a vector from a scalar, replicating the scalar
- CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { };
-
- // Convert from and to the underlying vector type
- CCTK_REAL_VEC(V const& v_): v(v_) { };
- operator V const() const { return v; }
-
- CCTK_REAL_VEC() { };
-
- // Copy constructor
- CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { };
-};
-
-// Load a vector from memory (aligned and unaligned); this loads from
-// a reference to a scalar
-DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p));
-DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p));
-
-// Store a vector to memory (aligned and non-temporal); this stores to
-// a reference to a scalar
-DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
-DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
-
-// Double-argument operators, both vectors
-DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
-
-// Double-argument operators, vector and scalar
-DEFINE_FUNCTION_VR_V(operator+,x+CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator-,x-CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator*,x*CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator/,x/CCTK_REAL_VEC(a))
-
-// Double-argument operators, scalar and vector
-DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a)+x)
-DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a)-x)
-DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a)*x)
-DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a)/x)
-
-// Single-argument operators
-DEFINE_FUNCTION_V_V(operator+,x)
-DEFINE_FUNCTION_V_V(operator-,0.0-x)
-
-// Cheap functions
-static union {
- unsigned long long const bits[2];
- CCTK_REAL_VEC::V v;
-} const fabs_mask =
- { { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
-DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,fabs_mask.v))
-DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
-DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
-DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
-
-// Expensive functions
-DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.elt0()),exp(x.elt1())))
-DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.elt0()),log(x.elt1())))
-DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.elt0(),a),pow(x.elt1(),a)))
-
-// Un-implemented functions
-DEFINE_FUNCTION_V_R(signbit,0)
-
-
-
-#if 0
-// Intel, float
-#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4)
-
-#include <xmmintrin.h>
-
-// A vector type corresponding to CCTK_REAL
-typedef __m128 CCTK_REAL_VEC;
-#endif
-
+#include <cctk.h>
-// Fallback: no vectorisation
-#else
-
-// There is no vector type corresponding to CCTK_REAL
-typedef CCTK_REAL CCTK_REAL_VEC;
+#include "Vectors-define.hh"
-DEFINE_FUNCTION_PR_V(vec_load,p)
-DEFINE_FUNCTION_PR_V(vec_loadu,p)
+#if defined(KRANC_VECTORS)
+// Vectorise
-DEFINE_FUNCTION_PRV(vec_store,p=x)
-DEFINE_FUNCTION_PRV(vec_store_nta,p=x)
+# if ! defined(CCTK_REAL_PRECISION_8)
+# error "Vectorisation is currently only supported for double precision"
+# endif
-DEFINE_FUNCTION_V_R(signbit,x<0)
+# if defined(__SSE2__) // SSE2 (Intel)
+# if defined(KRANC_DIRECT)
+# include "Vectors-SSE2-direct.hh"
+# else
+# include "Vectors-SSE2.hh"
+# endif
+# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power)
+# if defined(KRANC_DIRECT)
+# include "Vectors-VSX-direct.hh"
+# else
+# include "Vectors-VSX.hh"
+# endif
+# else
+# include "Vectors-pseudo.hh"
+# endif
+#else
+// Don't vectorise
+# include "Vectors-default.hh"
#endif
-
-
-#undef DEFINE_FUNCTION_PR_V
-#undef DEFINE_FUNCTION_PRV
-#undef DEFINE_FUNCTION_V_V
-#undef DEFINE_FUNCTION_R_V
-#undef DEFINE_FUNCTION_VV_V
-#undef DEFINE_FUNCTION_VR_V
-#undef DEFINE_FUNCTION_RV_V
-#undef DEFINE_FUNCTION_RR_V
-
-
-
-// Number of vector elements in a CCTK_REAL_VEC
-static
-size_t const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+#include "Vectors-undefine.hh"