Implement vectorisation

This is Erik's vectorisation working tree from 13-Oct-2010
author: Ian Hinder <ian.hinder@aei.mpg.de> 2010-11-22 11:20:48 +0100
committer: Ian Hinder <ian.hinder@aei.mpg.de> 2010-11-23 00:19:37 +0100
commit: d6c4d4c2131107ef3a4004692823e2041394acd6 (patch)
tree: bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19 /Auxiliary/Cactus/KrancNumericalTools
parent: c9431558d69ee1b8769c918044a45bdd50520b46 (diff)
11 files changed, 1535 insertions, 226 deletions
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
index 29c89d1..ee6a3b7 100644
--- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
@@ -1,17 +1,17 @@
  
-#define Power(x, y)   (pow((x), (y)))
+#define Power(x, y)   (pow(x,y))
 #define Sqrt(x)       (sqrt(x))
 
 
 #ifdef KRANC_C
-#define Abs(x)        (fabs(x))
-#define Min(x, y)     (fmin((x), (y)))
-#define Max(x, y)     (fmax((x), (y)))
-#define IfThen(x,y,z) ((x) ? (y) : (z))
+#  define Abs(x)        (fabs(x))
+#  define Min(x, y)     (fmin(x,y))
+#  define Max(x, y)     (fmax(x,y))
+#  define IfThen(x,y,z) ((x) ? (y) : (z))
 #else
-#define Abs(x)        (abs(x))
-#define Min(x, y)     (min((x), (y)))
-#define Max(x, y)     (max((x), (y)))
+#  define Abs(x)        (abs(x))
+#  define Min(x, y)     (min(x,y))
+#  define Max(x, y)     (max(x,y))
 /* IfThen cannot be expressed in Fortran */
 #endif
 
@@ -31,17 +31,46 @@
 #define Tanh(x)       (tanh(x))
 
 #ifdef KRANC_C
-#define Sign(x)       (signbit(x)?-1:+1)
+#  define Sign(x)     ((x)<0?-1:+1)
+#  define ToReal(x)   ((CCTK_REAL)(x))
 #else
-#define Sign(x)       (sgn(x))
+#  define Sign(x)     (sgn(x))
+#  define ToReal(x)   (real((x),kind(khalf)))
 #endif
 
+/* TODO: use fma(x,y,z) to implement fmadd and friends?  Note that fma
+   may be unsupported, or may be slow.  */
+
+/* #define fmadd(x,y,z)  ((x)*(y)+(z)) */
+/* #define fmsub(x,y,z)  ((x)*(y)-(z)) */
+/* #define fnmadd(x,y,z) (-(z)-(x)*(y)) */
+/* #define fnmsub(x,y,z) (+(z)-(x)*(y)) */
+
+#define fneg(x)   (-(x))
+#define fmul(x,y) ((x)*(y))
+#define fdiv(x,y) ((x)/(y))
+#define fadd(x,y) ((x)+(y))
+#define fsub(x,y) ((x)-(y))
+
+#define fmadd(x,y,z)  (fadd(fmul(x,y),z))
+#define fmsub(x,y,z)  (fsub(fmul(x,y),z))
+#define fnmadd(x,y,z) (fsub(fneg(z),fmul(x,y)))
+#define fnmsub(x,y,z) (fsub(z,fmul(x,y)))
+
+#define kexp(x)    (exp(x))
+#define kfabs(x)   (fabs(x))
+#define kfmax(x,y) (fmax(x,y))
+#define kfmin(x,y) (fmin(x,y))
+#define klog(x)    (log(x))
+#define kpow(x,y)  (pow(x,y))
+#define ksqrt(x)   (sqrt(x))
+
 #ifdef KRANC_C
-#define E               M_E
-#define Pi              M_PI
+#  define E           M_E
+#  define Pi          M_PI
 #else
-#define E               2.71828182845904523536029d0
-#define Pi              3.14159265358979323846264d0
+#  define E           2.71828182845904523536029d0
+#  define Pi          3.14159265358979323846264d0
 #endif
 
-#define UnitStep(x) ( (x) > 0 ? 1 : 0 )
+#define UnitStep(x)   ((x)>0)
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh
new file mode 100644
index 0000000..12cd6e8
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh
@@ -0,0 +1,135 @@
+// Vectorise using Intel's or AMD's SSE2
+
+// Use the type __m128d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+typedef __m128d CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+#define vec_set1(a) (_mm_set1_pd(a))
+#define vec_set(a,b) (_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+// #  define vec_elt0(x) (*(CCTK_REAL const*)&(x))
+#  define vec_elt0(x) ({ CCTK_REAL a_elt0; asm ("" : "=x" (a_elt0) : "0" (x)); a_elt0; })
+#else
+// this is a no-op
+#  define vec_elt0(x) (_mm_cvtsd_f64(x))
+#endif
+#define vec_elt1(x_) ({ CCTK_REAL_VEC const x_elt1=(x_); vec_elt0(_mm_unpackhi_pd(x_elt1,x_elt1)); })
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec_load(p)  (_mm_load_pd(&(p)))
+#define vec_loadu(p) (_mm_loadu_pd(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec_loadu_maybe(off,p) (vec_loadu(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_loadu(p))
+#if 0
+#define vec_loadu_maybe(off,p)                  \
+  (!((off)&(CCTK_REAL_VEC_SIZE-1)) ?            \
+   vec_load(p) : vec_loadu(p))
+#define vec_loadu_maybe3(off1,off2,off3,p)      \
+  (!((off1)&(CCTK_REAL_VEC_SIZE-1)) &&          \
+   !((off2)&(CCTK_REAL_VEC_SIZE-1)) &&          \
+   !((off3)&(CCTK_REAL_VEC_SIZE-1)) ?           \
+   vec_load(p) : vec_loadu(p))
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec_store(p,x) (_mm_store_pd(&(p),x))
+#define vec_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if defined(KRANC_CACHE)
+#  define vec_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#else
+#  define vec_store_nta(p,x) (_mm_store_pd(&(p),x))
+#endif
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+#define vec_store_nta_partial_hi(p,x,n) (_mm_storeh_pd((&(p))+1,x))
+
+
+
+// Functions and operators
+
+// Operators
+#undef fneg
+#undef fmul
+#undef fdiv
+#undef fadd
+#undef fsub
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+  unsigned long long s[CCTK_REAL_VEC_SIZE];
+  CCTK_REAL_VEC v;
+} vec_neg_mask_impl = {0x8000000000000000ULL, 0x8000000000000000ULL};
+#  define vec_neg_mask (vec_neg_mask_impl.v)
+#else
+#  define vec_neg_mask ((CCTK_REAL_VEC)(__m128i){0x8000000000000000ULL, 0x8000000000000000ULL})
+#endif
+#define fneg(x)   (_mm_xor_pd(x,vec_neg_mask))
+#define fmul(x,y) (_mm_mul_pd(x,y))
+#define fdiv(x,y) (_mm_div_pd(x,y))
+#define fadd(x,y) (_mm_add_pd(x,y))
+#define fsub(x,y) (_mm_sub_pd(x,y))
+
+// Cheap functions
+#undef kfabs
+#undef kfmax
+#undef kfmin
+#undef ksqrt
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+  unsigned long long s[CCTK_REAL_VEC_SIZE];
+  CCTK_REAL_VEC v;
+} vec_fabs_mask_impl = {0x7fffffffffffffffULL, 0x7fffffffffffffffULL};
+#  define vec_fabs_mask (vec_fabs_mask_impl.v)
+#else
+#  define vec_fabs_mask ((CCTK_REAL_VEC)(__m128i){0x7fffffffffffffffULL, 0x7fffffffffffffffULL})
+#endif
+#define kfabs(x)   (_mm_and_pd(x,vec_fabs_mask))
+#define kfmax(x,y) (_mm_max_pd(x,y))
+#define kfmin(x,y) (_mm_min_pd(x,y))
+#define ksqrt(x)   (_mm_sqrt_pd(x))
+
+// Expensive functions
+#undef kexp
+#undef klog
+#undef kpow
+#define kexp(x_)    ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); })
+#define klog(x_)    ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); })
+#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); })
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1(x))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh
new file mode 100644
index 0000000..4a4eea6
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh
@@ -0,0 +1,194 @@
+// Vectorise using Intel's or AMD's SSE2
+
+
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef __m128d V;
+  V v;
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+// DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; _mm_store_sd(&a,x); a; }))
+// DEFINE_FUNCTION_V_R(vec_elt0,(*(CCTK_REAL const*)&x))
+// This generates the fastest code with PGI compilers
+DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; asm ("" : "=x" (a) : "0" (x)); a; }))
+#else
+DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) // this is a no-op
+#endif
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_unpackhi_pd(x,x)))
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+// Call the implementation with the modulus
+#define vec_loadu_maybe(off,p)                                  \
+  (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p)))
+#define vec_loadu_maybe3(off1,off2,off3,p)                      \
+  (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off2)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off3)&(CCTK_REAL_VEC_SIZE-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_storeu,_mm_storeu_pd(&p,x))
+#if defined(KRANC_CACHE)
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+#else
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_store_pd(&p,x))
+#endif
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storel_pd(&p,x); break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storeh_pd((&p)+1,x); break;
+  default: assert(0);
+  }
+}
+
+
+
+// Functions and operators
+
+// Single-argument operators
+#if 0
+DEFINE_FUNCTION_V_V(operator+,x)
+static CCTK_REAL_VEC const vec_neg_mask =
+  (CCTK_REAL_VEC::V)(__m128i) { 0x8000000000000000ULL, 0x8000000000000000ULL };
+DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask))
+#endif
+DEFINE_FUNCTION_V_V(operator+,+x.v)
+DEFINE_FUNCTION_V_V(operator-,-x.v)
+
+// Double-argument operators, both vectors
+#if 0
+DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
+#endif
+DEFINE_FUNCTION_VV_V(operator+,x.v+y.v)
+DEFINE_FUNCTION_VV_V(operator-,x.v-y.v)
+DEFINE_FUNCTION_VV_V(operator*,x.v*y.v)
+DEFINE_FUNCTION_VV_V(operator/,x.v/y.v)
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Cheap functions
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+  CCTK_REAL_VEC::S s[CCTK_REAL_VEC_SIZE];
+  CCTK_REAL_VEC::V v;
+} vec_fabs_mask_impl = { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL };
+#  define vec_fabs_mask (vec_fabs_mask_impl.v)
+#else
+static CCTK_REAL_VEC const vec_fabs_mask =
+  (CCTK_REAL_VEC::V)(__m128i) { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL };
+#endif
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) vec_set1(x)
+
+#if defined(__PGI) && defined (__amd64__)
+// Special case for PGI 9.0.4 to avoid an internal compiler error
+#undef IfThen
+static inline
+CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const x, CCTK_REAL_VEC const y)
+{
+  union {
+    __m128i vi;
+    CCTK_REAL_VEC::V v;
+  } mask;
+  mask.vi = _mm_set1_epi64x(-(long long)cond);
+  return _mm_or_pd(_mm_and_pd(x.v, mask.v), _mm_andnot_pd(mask.v, y.v));
+}
+#endif
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh
new file mode 100644
index 0000000..7e06017
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh
@@ -0,0 +1,111 @@
+// Vectorise using IBM's Altivec
+
+// Use the type vector double directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+typedef vector double CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+#define vec_set1(a) (vec_splats(a))
+#if defined(__GNUC__)
+// GNU doesn't support array indices on vectors
+union vec_mask {
+  double elts[2];
+  vector double v;
+};
+#  define vec_set(a,b) ({ vec_mask x_set; x_set.elts[0]=(a); x_set.elts[1]=(b); x_set.v; })
+#else
+#  define vec_set(a,b) ({ CCTK_REAL_VEC x_set; x_set[0]=(a); x_set[1]=(b); x_set; })
+#endif
+
+// Get a scalar from the vector
+#if defined(__GNUC__)
+// GNU doesn't support array indices on vectors
+#  define vec_elt0(x) ({ vec_mask x_elt0; x_elt0.v=(x); x_elt0.elts[0]; })
+#  define vec_elt1(x) ({ vec_mask x_elt1; x_elt1.v=(x); x_elt1.elts[1]; })
+#else
+#  define vec_elt0(x) ((x)[0])
+#  define vec_elt1(x) ((x)[1])
+#endif
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec_load(p)  (*(CCTK_REAL_VEC const*)&(p))
+#define vec_loadu(p) (vec_load(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (vec_load(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec_store(p,x) (*(CCTK_REAL_VEC*)&(p)=(x))
+#define vec_storeu(p,x) (*(CCTK_REAL_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec_store_nta(p,x) vec_store(p,x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) ((p)=vec_elt0(x))
+#define vec_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec_elt1(x))
+
+
+
+// Functions and operators
+
+// Other Altivec functions are:
+//    nabs: -abs a
+//    madd msub nmadd nmsub: [+-]a*b[+-]c
+
+// Triple-argument operators, all vectors
+#undef fmadd
+#undef fmsub
+#undef fnmadd
+#undef fnmsub
+#define fmadd(x,y,z)  (vec_madd(x,y,z))
+#define fmsub(x,y,z)  (vec_msub(x,y,z))
+#define fnmadd(x,y,z) (vec_nmadd(x,y,z))
+#define fnmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#undef kfabs
+#undef kfmax
+#undef kfmin
+#define kfabs(x)   (vec_abs(x))
+#define kfmax(x,y) (vec_max(x,y))
+#define kfmin(x,y) (vec_min(x,y))
+
+// Expensive functions
+#undef kexp
+#undef klog
+#undef kpow
+#undef ksqrt
+#define kexp(x_)    ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); })
+#define klog(x_)    ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); })
+#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); })
+#define ksqrt(x_)   ({ CCTK_REAL_VEC const x_sqrt=(x_); vec_set(sqrt(vec_elt0(x_sqrt)),sqrt(vec_elt1(x_sqrt))); })
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1((CCTK_REAL)(x)))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh
new file mode 100644
index 0000000..f591647
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh
@@ -0,0 +1,212 @@
+// Vectorise using IBM's Altivec
+
+
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef vector double V;
+  V v;
+  union vec_mask {
+    S elts[2];
+    V v;
+  };
+  
+  // Set a vector from scalars
+#if 0
+  // IBM
+  inline CCTK_REAL_VEC(S const a, S const b) { v[0]=a; v[1]=b; }
+#endif
+#if 0
+  inline CCTK_REAL_VEC(S const a, S const b):
+    v(vec_mergel(vec_splats(a), vec_splats(b))) { }
+#endif
+  inline CCTK_REAL_VEC(S const a, S const b)
+  {
+    vec_mask x;
+    x.elts[0] = a;
+    x.elts[1] = b;
+    v = x.v;
+  }
+  
+  // Set a vector from a scalar, replicating the scalar
+  // Note: Could also use vec_xlds instead
+  inline CCTK_REAL_VEC(S const a): v(vec_splats(a)) { }
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a))
+DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b))
+
+// Get a scalar from the vector
+#if 0
+// IBM
+DEFINE_FUNCTION_V_R(vec_elt0,x.v[0])
+DEFINE_FUNCTION_V_R(vec_elt1,x.v[1])
+#endif
+static inline CCTK_REAL vec_elt0(CCTK_REAL_VEC const x)
+{
+  CCTK_REAL_VEC::vec_mask x1;
+  x1.v = x;
+  return x1.elts[0];
+}
+static inline CCTK_REAL vec_elt1(CCTK_REAL_VEC const x)
+{
+  CCTK_REAL_VEC::vec_mask x1;
+  x1.v = x;
+  return x1.elts[1];
+}
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,p)
+#if 0
+// IBM
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p)))
+#endif
+DEFINE_FUNCTION_PR_V(vec_loadu,p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+// Call the implementation with the modulus
+#define vec_loadu_maybe(off,p)                                  \
+  (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p)))
+#define vec_loadu_maybe3(off1,off2,off3,p)                      \
+  (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off2)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off3)&(CCTK_REAL_VEC_SIZE-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x)
+DEFINE_FUNCTION_PRV(vec_storeu,*(CCTK_REAL_VEC::V*)&p=x)
+// TODO: Use stvxl instruction?
+DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: p=vec_elt0(x); break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: (&p)[1]=vec_elt1(x); break;
+  default: assert(0);
+  }
+}
+
+
+
+// Functions and operators
+
+// Other Altivec functions are:
+//    nabs: -abs a
+//    madd msub nmadd nmsub: [+-]a*b[+-]c
+
+// Single-argument operators
+#if 0
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,vec_neg(x))
+#endif
+DEFINE_FUNCTION_V_V(operator+,+x.v)
+DEFINE_FUNCTION_V_V(operator-,-x.v)
+
+// Double-argument operators, both vectors
+#if 0
+DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y))
+DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y))
+DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y))
+DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y))
+#endif
+DEFINE_FUNCTION_VV_V(operator+,x.v+y.v)
+DEFINE_FUNCTION_VV_V(operator-,x.v-y.v)
+DEFINE_FUNCTION_VV_V(operator*,x.v*y.v)
+DEFINE_FUNCTION_VV_V(operator/,x.v/y.v)
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Triple-argument operators, all vectors
+#undef fmadd
+#undef fmsub
+#undef fnmadd
+#undef fnmsub
+DEFINE_FUNCTION_VVV_V(fmadd,vec_madd(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fmsub,vec_msub(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fnmadd,vec_nmadd(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fnmsub,vec_nmsub(x.v,y.v,z.v))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,vec_abs(x.v))
+DEFINE_FUNCTION_VV_V(fmax,vec_max(x.v,y.v))
+DEFINE_FUNCTION_VV_V(fmin,vec_min(x.v,y.v))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x))))
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1(x))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh
new file mode 100644
index 0000000..f928ed8
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh
@@ -0,0 +1,31 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+
+
+// Use CCTK_REAL
+typedef CCTK_REAL CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static int const CCTK_REAL_VEC_SIZE = 1;
+
+
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions (e.g. PGI).  This should also make
+// debug builds (which may not inline) more efficient.
+
+#define vec_load(p) (p)
+#define vec_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (p)
+#define vec_loadu_maybe3(off1,off2,off3,p) (p)
+
+#define vec_store(p,x) ((p)=(x))
+#define vec_store_nta(p,x) ((p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) (assert(0))
+#define vec_store_nta_partial_hi(p,x,n) (assert(0))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh
new file mode 100644
index 0000000..f5c0b22
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh
@@ -0,0 +1,104 @@
+// Define some macros that simplify defining short function that are
+// supposed to be inlined
+
+
+
+// Letters defining the prototype (argument and return value types):
+//    I: i,j: integer
+//    R: a,b: real
+//    V: x,y: vector (of real)
+//    P: p,q: pointer (i.e. const reference) to something
+//    L: l,m: L-value (i.e. non-const reference) to something
+
+
+
+// Load and store
+
+#define DEFINE_FUNCTION_PR_V(name,expr)         \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& p)         \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_PRV(name,expr)          \
+static inline                                   \
+void name (CCTK_REAL& p, CCTK_REAL_VEC const x) \
+{                                               \
+  expr;                                         \
+}
+
+#define DEFINE_FUNCTION_PVR(name,expr)          \
+static inline                                   \
+void name (CCTK_REAL_VEC& p, CCTK_REAL const a) \
+{                                               \
+  expr;                                         \
+}
+
+
+
+// Functions and operators
+
+#define DEFINE_FUNCTION_V_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x)      \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_V_R(name,expr)          \
+static inline                                   \
+CCTK_REAL name (CCTK_REAL_VEC const x)          \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_R_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const a)          \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_VV_V(name,expr)                                 \
+static inline                                                           \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y)       \
+  CCTK_ATTRIBUTE_PURE                                                   \
+{                                                                       \
+  return expr;                                                          \
+}
+
+#define DEFINE_FUNCTION_VR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL const a)   \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RV_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL_VEC const x)   \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL const b)       \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_VVV_V(name,expr)                                \
+static inline                                                           \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y, CCTK_REAL_VEC const z) \
+  CCTK_ATTRIBUTE_PURE                                                   \
+{                                                                       \
+  return expr;                                                          \
+}
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh
new file mode 100644
index 0000000..df83b3a
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh
@@ -0,0 +1,591 @@
+#ifndef VECTORS_HH
+#define VECTORS_HH
+
+
+
+// Vectorisation
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include <cctk.h>
+
+
+
+// I: i,j: integer
+// R: a,b: real
+// V: x,y: vector (of real)
+// P: p,q: pointer (i.e. const reference) to something
+// L: l,m: L-value (i.e. non-const reference) to something
+
+#define DEFINE_FUNCTION_PR_V(name,expr)         \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& p)         \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_PRV(name,expr)                  \
+static inline                                           \
+void name (CCTK_REAL& p, CCTK_REAL_VEC const& x)        \
+{                                                       \
+  expr;                                                 \
+}
+
+#define DEFINE_FUNCTION_PVR(name,expr)                  \
+static inline                                           \
+void name (CCTK_REAL_VEC& p, CCTK_REAL const& a)        \
+{                                                       \
+  expr;                                                 \
+}
+
+#define DEFINE_FUNCTION_V_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x)     \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_V_R(name,expr)          \
+static inline                                   \
+CCTK_REAL name (CCTK_REAL_VEC const& x)         \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_R_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& a)         \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_VV_V(name,expr)                                 \
+static inline                                                           \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y)     \
+  CCTK_ATTRIBUTE_PURE                                                   \
+{                                                                       \
+  return expr;                                                          \
+}
+
+#define DEFINE_FUNCTION_VR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RV_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b)     \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+
+
+// Intel, double
+#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8)
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef __m128d V;
+  V v;
+  
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(b,a)) { }
+
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { }
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const& v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+union vec_mask {
+  unsigned long long bits[2];
+  CCTK_REAL_VEC::V v;
+};
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+static inline
+CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x)
+{
+  CCTK_REAL a; _mm_store_sd(&a,x); return a;
+}
+#else
+DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) //this is a no-op
+#endif
+
+#if 0
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1))))
+#endif
+static inline
+CCTK_REAL vec_elt1 (CCTK_REAL_VEC const& x)
+{
+  CCTK_REAL a; _mm_storeh_pd(&a,x); return a;
+}
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+#if 0
+// Load a partial vector (duplicating the last loaded element to fill
+// the remaining elements)
+// TODO: Should this be aligned or unaligned?
+static inline
+CCTK_REAL_VEC vec_load_partial (CCTK_REAL const& p, int const n)
+{
+  switch (n) {
+  case 1: return _mm_load1_pd(p);
+  default: assert(0);
+  }
+}
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+// Call the implementation with the modulus
+template<int off>
+static inline
+CCTK_REAL_VEC vec_loadu_maybe (CCTK_REAL const& p)
+{
+  return vec_loadu_maybe_impl<off&(2-1)>(p);
+}
+template<int off1,int off2, int off3>
+static inline
+CCTK_REAL_VEC vec_loadu_maybe3 (CCTK_REAL const& p)
+{
+  return vec_loadu_maybe_impl3<off1&(2-1),off2&(2-1),off3&(2-1)>(p);
+}
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storel_pd(&p,x); break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storeh_pd((&p)+1,x); break;
+  default: assert(0);
+  }
+}
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+#if 0
+DEFINE_FUNCTION_V_V(operator-,vec_set(0.0,0.0)-x)
+#endif
+static vec_mask const vec_neg_mask = 
+{ { 0x8000000000000000ULL, 0x8000000000000000ULL } };
+DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask.v))
+
+// Cheap functions
+static vec_mask const vec_fabs_mask =
+{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+// Special case for PGI to avoid internal compiler error
+#if defined(__PGI) && defined (__amd64__)
+#undef IfThen
+CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const& x, CCTK_REAL_VEC co\
+nst& y)
+{
+  return cond*x + (not cond)*y;
+}
+#endif
+
+
+
+#if 0
+// Try to use the __m128d type directly.
+
+// This does not really work, because it is not possible to define
+// automatic conversion operators from double to __m128d, so that
+// explicit conversions are required.  This makes the code look more
+// clumsy.
+
+// Vector type corresponding to CCTK_REAL
+typedef __m128d CCTK_REAL_VEC;
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+static inline
+CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x)
+{
+#if 0
+  // _mm_cvtsd_f64 does not exist on PGI compilers
+  return _mm_cvtsd_f64(x);      // this is a no-op
+#endif
+  CCTK_REAL a; _mm_store_sd(&a,x); return a;
+}
+
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1))))
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+
+// Cheap functions
+static vec_mask const vec_fabs_mask =
+{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+#endif
+
+
+
+// Intel, float
+#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4)
+
+#include <xmmintrin.h>
+
+// A vector type corresponding to CCTK_REAL
+typedef __m128 CCTK_REAL_VEC;
+
+
+
+// Power, double
+#elif defined(KRANC_VECTORS) && defined(__ALTIVEC__) && defined(_ARCH_PWR7) && defined(CCTK_REAL_PRECISION_8)
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef vector double V;
+  V v;
+  
+  // vec_insert, vec_extract, vec_splat
+
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(S const& a, S const& b) { v[0]=a; v[1]=b; }
+  
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(S const& a): v(vec_splats(a)) { }
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const& v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a))
+DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b))
+
+// Get a scalar from the vector
+DEFINE_FUNCTION_V_R(vec_elt0,x.v[0])
+DEFINE_FUNCTION_V_R(vec_elt1,x.v[1])
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p)))
+// vec_xlds
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+#define static
+template<>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl<0>,vec_load(p))
+#undef static
+// Call the implementation with the modulus
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe,vec_loadu_maybe_impl<off&(2-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: p=x.v[0]; break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: (&p)[1]=x.v[1]; break;
+  default: assert(0);
+  }
+}
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y))
+DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y))
+DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y))
+DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,vec_neg(x))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,vec_abs(x))
+DEFINE_FUNCTION_VV_V(fmax,vec_max(x,y))
+DEFINE_FUNCTION_VV_V(fmin,vec_min(x,y))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x))))
+
+
+
+// Fallback: pseudo-vectorisation
+#elif 0
+
+// There is no vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  CCTK_REAL v, w;
+  
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { }
+
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x.v), w(x.w) { }
+};
+
+
+
+DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p))
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadm,vec_load(p))
+
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x))
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v+y.v,x.w+y.w))
+DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v-y.v,x.w-y.w))
+DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v*y.v,x.w*y.w))
+DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v/y.v,x.w/y.w))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v+a,x.w+a))
+DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v-a,x.w-a))
+DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v*a,x.w*a))
+DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v/a,x.w/a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v,a+x.w))
+DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v,a-x.w))
+DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v,a*x.w))
+DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v,a/x.w))
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v,-x.w))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v),fabs(x.w)))
+DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v,y.v),fmax(x.w,y.w)))
+DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v,y.v),fmin(x.w,y.w)))
+DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v),sqrt(x.w)))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v),exp(x.w)))
+DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v),log(x.w)))
+DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v,a),pow(x.w,a)))
+
+
+
+// Fallback: no vectorisation
+#else
+
+// There is no vector type corresponding to CCTK_REAL
+typedef CCTK_REAL CCTK_REAL_VEC;
+
+
+
+DEFINE_FUNCTION_PR_V(vec_load,p)
+DEFINE_FUNCTION_PR_V(vec_loadu,p)
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe,p)
+template<int off1,int off2, int off3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe3,p)
+
+DEFINE_FUNCTION_PRV(vec_store,p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  assert(0);
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  assert(0);
+}
+
+
+
+#endif
+
+
+
+#undef DEFINE_FUNCTION_PR_V
+#undef DEFINE_FUNCTION_PRV
+#undef DEFINE_FUNCTION_V_V
+#undef DEFINE_FUNCTION_R_V
+#undef DEFINE_FUNCTION_VV_V
+#undef DEFINE_FUNCTION_VR_V
+#undef DEFINE_FUNCTION_RV_V
+#undef DEFINE_FUNCTION_RR_V
+
+
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+#endif  // #ifndef VECTORS_HH
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh
new file mode 100644
index 0000000..f439c9b
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh
@@ -0,0 +1,72 @@
+// Pseudo vectorisation using scalar operations
+
+
+
+// Number of vector elements in a CCTK_REAL_VEC
+static int const CCTK_REAL_VEC_SIZE = 2;
+
+// There is no vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  CCTK_REAL v[CCTK_REAL_VEC_SIZE];
+  
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { }
+
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x) { v[0]=x.v[0]; v[1]=x.v[1]; }
+};
+
+
+
+// Load and store vectors
+
+DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p))
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (vec_load(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p))
+
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x))
+
+
+
+// Functions and operators
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v[0]+y.v[0],x.v[1]+y.v[1]))
+DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v[0]-y.v[0],x.v[1]-y.v[1]))
+DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v[0]*y.v[0],x.v[1]*y.v[1]))
+DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v[0]/y.v[0],x.v[1]/y.v[1]))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v[0]+a,x.v[1]+a))
+DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v[0]-a,x.v[1]-a))
+DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v[0]*a,x.v[1]*a))
+DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v[0]/a,x.v[1]/a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v[0],a+x.v[1]))
+DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v[0],a-x.v[1]))
+DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v[0],a*x.v[1]))
+DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v[0],a/x.v[1]))
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v[0],-x.v[1]))
+
+// Functions
+DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v[0]),exp(x.v[1])))
+DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v[0]),fabs(x.v[1])))
+DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v[0],y.v[0]),fmax(x.v[1],y.v[1])))
+DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v[0],y.v[0]),fmin(x.v[1],y.v[1])))
+DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v[0]),log(x.v[1])))
+DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v[0],a),pow(x.v[1],a)))
+DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v[0]),sqrt(x.v[1])))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh
new file mode 100644
index 0000000..0d950c7
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh
@@ -0,0 +1,14 @@
+// Undefine all macros defined in "Vectors-define.hh", so that we
+// leave a clean namespace
+
+
+
+#undef DEFINE_FUNCTION_PR_V
+#undef DEFINE_FUNCTION_PRV
+#undef DEFINE_FUNCTION_V_V
+#undef DEFINE_FUNCTION_R_V
+#undef DEFINE_FUNCTION_VV_V
+#undef DEFINE_FUNCTION_VR_V
+#undef DEFINE_FUNCTION_RV_V
+#undef DEFINE_FUNCTION_RR_V
+#undef DEFINE_FUNCTION_VVV_V
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
index 3fb77e1..d32afb2 100644
--- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
@@ -5,231 +5,47 @@
 
 // Vectorisation
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include <cctk.h>
-
-
-
-// I: i,j: integer
-// R: a,b: real
-// V: x,y: vector (of real)
-// P: p,q: pointer (i.e. const reference) to something
-// L: l,m: L-value (i.e. non-const reference) to something
-
-#define DEFINE_FUNCTION_PR_V(name,expr)         \
-inline                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& p)         \
-{                                               \
-  return expr;                                  \
-}
-
-#define DEFINE_FUNCTION_PRV(name,expr)                  \
-inline                                                  \
-void name (CCTK_REAL& p, CCTK_REAL_VEC const& x)        \
-{                                                       \
-  expr;                                                 \
-}
-
-#define DEFINE_FUNCTION_PVR(name,expr)                  \
-inline                                                  \
-void name (CCTK_REAL_VEC& p, CCTK_REAL const& a)        \
-{                                                       \
-  expr;                                                 \
-}
-
-#define DEFINE_FUNCTION_V_V(name,expr)          \
-inline                                          \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x)     \
-  CCTK_ATTRIBUTE_PURE                           \
-{                                               \
-  return CCTK_REAL_VEC(expr);                   \
-}
-
-#define DEFINE_FUNCTION_V_R(name,expr)          \
-inline                                          \
-CCTK_REAL name (CCTK_REAL_VEC const& x)         \
-  CCTK_ATTRIBUTE_PURE                           \
-{                                               \
-  return expr;                                  \
-}
-
-#define DEFINE_FUNCTION_R_V(name,expr)          \
-inline                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& a)         \
-  CCTK_ATTRIBUTE_PURE                           \
-{                                               \
-  return expr;                                  \
-}
-
-#define DEFINE_FUNCTION_VV_V(name,expr)                                 \
-inline                                                                  \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y)     \
-  CCTK_ATTRIBUTE_PURE                                                   \
-{                                                                       \
-  return expr;                                                          \
-}
-
-#define DEFINE_FUNCTION_VR_V(name,expr)                         \
-inline                                                          \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \
-  CCTK_ATTRIBUTE_PURE                                           \
-{                                                               \
-  return expr;                                                  \
-}
-
-#define DEFINE_FUNCTION_RV_V(name,expr)                         \
-inline                                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \
-  CCTK_ATTRIBUTE_PURE                                           \
-{                                                               \
-  return expr;                                                  \
-}
-
-#define DEFINE_FUNCTION_RR_V(name,expr)                         \
-inline                                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b)     \
-  CCTK_ATTRIBUTE_PURE                                           \
-{                                                               \
-  return expr;                                                  \
-}
-
-
-
-// Intel, double
-#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8)
-
-#include <emmintrin.h>
-
-// Vector type corresponding to CCTK_REAL
-struct CCTK_REAL_VEC {
-  // The underlying scalar and vector types
-  typedef double S;
-  typedef __m128d V;
-  static int const n = sizeof(V)/sizeof(S);
-  V v;
-  
-  // Set a vector from scalars
-  CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(a,b)) { };
-  
-  // Get a scalar from the vector
-  S elt0() const { return _mm_cvtsd_f64(v); /* this is a no-op */ }
-  S elt1() const { return _mm_cvtsd_f64(_mm_shuffle_pd(v,v,_MM_SHUFFLE2(1,1))); }
-  
-  // Set a vector from a scalar, replicating the scalar
-  CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { };
-  
-  // Convert from and to the underlying vector type
-  CCTK_REAL_VEC(V const& v_): v(v_) { };
-  operator V const() const { return v; }
-  
-  CCTK_REAL_VEC() { };
-  
-  // Copy constructor
-  CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { };
-};
-
-// Load a vector from memory (aligned and unaligned); this loads from
-// a reference to a scalar
-DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p));
-DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p));
-
-// Store a vector to memory (aligned and non-temporal); this stores to
-// a reference to a scalar
-DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
-DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
-
-// Double-argument operators, both vectors
-DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
-
-// Double-argument operators, vector and scalar
-DEFINE_FUNCTION_VR_V(operator+,x+CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator-,x-CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator*,x*CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator/,x/CCTK_REAL_VEC(a))
-
-// Double-argument operators, scalar and vector
-DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a)+x)
-DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a)-x)
-DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a)*x)
-DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a)/x)
-
-// Single-argument operators
-DEFINE_FUNCTION_V_V(operator+,x)
-DEFINE_FUNCTION_V_V(operator-,0.0-x)
-
-// Cheap functions
-static union {
-  unsigned long long const bits[2];
-  CCTK_REAL_VEC::V v;
-} const fabs_mask = 
-  { { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
-DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,fabs_mask.v))
-DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
-DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
-DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
-
-// Expensive functions
-DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.elt0()),exp(x.elt1())))
-DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.elt0()),log(x.elt1())))
-DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.elt0(),a),pow(x.elt1(),a)))
-
-// Un-implemented functions
-DEFINE_FUNCTION_V_R(signbit,0)
-
-
-
-#if 0
-// Intel, float
-#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4)
-
-#include <xmmintrin.h>
-
-// A vector type corresponding to CCTK_REAL
-typedef __m128 CCTK_REAL_VEC;
-#endif
-
 
+#include <cctk.h>
 
-// Fallback: no vectorisation
-#else
-
-// There is no vector type corresponding to CCTK_REAL
-typedef CCTK_REAL CCTK_REAL_VEC;
 
 
+#include "Vectors-define.hh"
 
-DEFINE_FUNCTION_PR_V(vec_load,p)
-DEFINE_FUNCTION_PR_V(vec_loadu,p)
+#if defined(KRANC_VECTORS)
+// Vectorise
 
-DEFINE_FUNCTION_PRV(vec_store,p=x)
-DEFINE_FUNCTION_PRV(vec_store_nta,p=x)
+#  if ! defined(CCTK_REAL_PRECISION_8)
+#    error "Vectorisation is currently only supported for double precision"
+#  endif
 
-DEFINE_FUNCTION_V_R(signbit,x<0)
+#  if defined(__SSE2__)         // SSE2 (Intel)
+#    if defined(KRANC_DIRECT)
+#      include "Vectors-SSE2-direct.hh"
+#    else
+#      include "Vectors-SSE2.hh"
+#    endif
+#  elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power)
+#    if defined(KRANC_DIRECT)
+#      include "Vectors-VSX-direct.hh"
+#    else
+#      include "Vectors-VSX.hh"
+#    endif
+#  else
+#    include "Vectors-pseudo.hh"
+#  endif
 
+#else
+// Don't vectorise
 
+#  include "Vectors-default.hh"
 
 #endif
 
-
-
-#undef DEFINE_FUNCTION_PR_V
-#undef DEFINE_FUNCTION_PRV
-#undef DEFINE_FUNCTION_V_V
-#undef DEFINE_FUNCTION_R_V
-#undef DEFINE_FUNCTION_VV_V
-#undef DEFINE_FUNCTION_VR_V
-#undef DEFINE_FUNCTION_RV_V
-#undef DEFINE_FUNCTION_RR_V
-
-
-
-// Number of vector elements in a CCTK_REAL_VEC
-static
-size_t const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+#include "Vectors-undefine.hh"
author	Ian Hinder <ian.hinder@aei.mpg.de>	2010-11-22 11:20:48 +0100
committer	Ian Hinder <ian.hinder@aei.mpg.de>	2010-11-23 00:19:37 +0100
commit	d6c4d4c2131107ef3a4004692823e2041394acd6 (patch)
tree	bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19 /Auxiliary/Cactus/KrancNumericalTools
parent	c9431558d69ee1b8769c918044a45bdd50520b46 (diff)