Implement vectorisation

This is Erik's vectorisation working tree from 13-Oct-2010
author: Ian Hinder <ian.hinder@aei.mpg.de> 2010-11-22 11:20:48 +0100
committer: Ian Hinder <ian.hinder@aei.mpg.de> 2010-11-23 00:19:37 +0100
commit: d6c4d4c2131107ef3a4004692823e2041394acd6 (patch)
tree: bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19
parent: c9431558d69ee1b8769c918044a45bdd50520b46 (diff)
16 files changed, 1767 insertions, 406 deletions
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
index 29c89d1..ee6a3b7 100644
--- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/MathematicaCompat.h
@@ -1,17 +1,17 @@
  
-#define Power(x, y)   (pow((x), (y)))
+#define Power(x, y)   (pow(x,y))
 #define Sqrt(x)       (sqrt(x))
 
 
 #ifdef KRANC_C
-#define Abs(x)        (fabs(x))
-#define Min(x, y)     (fmin((x), (y)))
-#define Max(x, y)     (fmax((x), (y)))
-#define IfThen(x,y,z) ((x) ? (y) : (z))
+#  define Abs(x)        (fabs(x))
+#  define Min(x, y)     (fmin(x,y))
+#  define Max(x, y)     (fmax(x,y))
+#  define IfThen(x,y,z) ((x) ? (y) : (z))
 #else
-#define Abs(x)        (abs(x))
-#define Min(x, y)     (min((x), (y)))
-#define Max(x, y)     (max((x), (y)))
+#  define Abs(x)        (abs(x))
+#  define Min(x, y)     (min(x,y))
+#  define Max(x, y)     (max(x,y))
 /* IfThen cannot be expressed in Fortran */
 #endif
 
@@ -31,17 +31,46 @@
 #define Tanh(x)       (tanh(x))
 
 #ifdef KRANC_C
-#define Sign(x)       (signbit(x)?-1:+1)
+#  define Sign(x)     ((x)<0?-1:+1)
+#  define ToReal(x)   ((CCTK_REAL)(x))
 #else
-#define Sign(x)       (sgn(x))
+#  define Sign(x)     (sgn(x))
+#  define ToReal(x)   (real((x),kind(khalf)))
 #endif
 
+/* TODO: use fma(x,y,z) to implement fmadd and friends?  Note that fma
+   may be unsupported, or may be slow.  */
+
+/* #define fmadd(x,y,z)  ((x)*(y)+(z)) */
+/* #define fmsub(x,y,z)  ((x)*(y)-(z)) */
+/* #define fnmadd(x,y,z) (-(z)-(x)*(y)) */
+/* #define fnmsub(x,y,z) (+(z)-(x)*(y)) */
+
+#define fneg(x)   (-(x))
+#define fmul(x,y) ((x)*(y))
+#define fdiv(x,y) ((x)/(y))
+#define fadd(x,y) ((x)+(y))
+#define fsub(x,y) ((x)-(y))
+
+#define fmadd(x,y,z)  (fadd(fmul(x,y),z))
+#define fmsub(x,y,z)  (fsub(fmul(x,y),z))
+#define fnmadd(x,y,z) (fsub(fneg(z),fmul(x,y)))
+#define fnmsub(x,y,z) (fsub(z,fmul(x,y)))
+
+#define kexp(x)    (exp(x))
+#define kfabs(x)   (fabs(x))
+#define kfmax(x,y) (fmax(x,y))
+#define kfmin(x,y) (fmin(x,y))
+#define klog(x)    (log(x))
+#define kpow(x,y)  (pow(x,y))
+#define ksqrt(x)   (sqrt(x))
+
 #ifdef KRANC_C
-#define E               M_E
-#define Pi              M_PI
+#  define E           M_E
+#  define Pi          M_PI
 #else
-#define E               2.71828182845904523536029d0
-#define Pi              3.14159265358979323846264d0
+#  define E           2.71828182845904523536029d0
+#  define Pi          3.14159265358979323846264d0
 #endif
 
-#define UnitStep(x) ( (x) > 0 ? 1 : 0 )
+#define UnitStep(x)   ((x)>0)
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh
new file mode 100644
index 0000000..12cd6e8
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2-direct.hh
@@ -0,0 +1,135 @@
+// Vectorise using Intel's or AMD's SSE2
+
+// Use the type __m128d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+typedef __m128d CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+#define vec_set1(a) (_mm_set1_pd(a))
+#define vec_set(a,b) (_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+// #  define vec_elt0(x) (*(CCTK_REAL const*)&(x))
+#  define vec_elt0(x) ({ CCTK_REAL a_elt0; asm ("" : "=x" (a_elt0) : "0" (x)); a_elt0; })
+#else
+// this is a no-op
+#  define vec_elt0(x) (_mm_cvtsd_f64(x))
+#endif
+#define vec_elt1(x_) ({ CCTK_REAL_VEC const x_elt1=(x_); vec_elt0(_mm_unpackhi_pd(x_elt1,x_elt1)); })
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec_load(p)  (_mm_load_pd(&(p)))
+#define vec_loadu(p) (_mm_loadu_pd(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec_loadu_maybe(off,p) (vec_loadu(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_loadu(p))
+#if 0
+#define vec_loadu_maybe(off,p)                  \
+  (!((off)&(CCTK_REAL_VEC_SIZE-1)) ?            \
+   vec_load(p) : vec_loadu(p))
+#define vec_loadu_maybe3(off1,off2,off3,p)      \
+  (!((off1)&(CCTK_REAL_VEC_SIZE-1)) &&          \
+   !((off2)&(CCTK_REAL_VEC_SIZE-1)) &&          \
+   !((off3)&(CCTK_REAL_VEC_SIZE-1)) ?           \
+   vec_load(p) : vec_loadu(p))
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec_store(p,x) (_mm_store_pd(&(p),x))
+#define vec_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if defined(KRANC_CACHE)
+#  define vec_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#else
+#  define vec_store_nta(p,x) (_mm_store_pd(&(p),x))
+#endif
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+#define vec_store_nta_partial_hi(p,x,n) (_mm_storeh_pd((&(p))+1,x))
+
+
+
+// Functions and operators
+
+// Operators
+#undef fneg
+#undef fmul
+#undef fdiv
+#undef fadd
+#undef fsub
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+  unsigned long long s[CCTK_REAL_VEC_SIZE];
+  CCTK_REAL_VEC v;
+} vec_neg_mask_impl = {0x8000000000000000ULL, 0x8000000000000000ULL};
+#  define vec_neg_mask (vec_neg_mask_impl.v)
+#else
+#  define vec_neg_mask ((CCTK_REAL_VEC)(__m128i){0x8000000000000000ULL, 0x8000000000000000ULL})
+#endif
+#define fneg(x)   (_mm_xor_pd(x,vec_neg_mask))
+#define fmul(x,y) (_mm_mul_pd(x,y))
+#define fdiv(x,y) (_mm_div_pd(x,y))
+#define fadd(x,y) (_mm_add_pd(x,y))
+#define fsub(x,y) (_mm_sub_pd(x,y))
+
+// Cheap functions
+#undef kfabs
+#undef kfmax
+#undef kfmin
+#undef ksqrt
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+  unsigned long long s[CCTK_REAL_VEC_SIZE];
+  CCTK_REAL_VEC v;
+} vec_fabs_mask_impl = {0x7fffffffffffffffULL, 0x7fffffffffffffffULL};
+#  define vec_fabs_mask (vec_fabs_mask_impl.v)
+#else
+#  define vec_fabs_mask ((CCTK_REAL_VEC)(__m128i){0x7fffffffffffffffULL, 0x7fffffffffffffffULL})
+#endif
+#define kfabs(x)   (_mm_and_pd(x,vec_fabs_mask))
+#define kfmax(x,y) (_mm_max_pd(x,y))
+#define kfmin(x,y) (_mm_min_pd(x,y))
+#define ksqrt(x)   (_mm_sqrt_pd(x))
+
+// Expensive functions
+#undef kexp
+#undef klog
+#undef kpow
+#define kexp(x_)    ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); })
+#define klog(x_)    ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); })
+#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); })
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1(x))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh
new file mode 100644
index 0000000..4a4eea6
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-SSE2.hh
@@ -0,0 +1,194 @@
+// Vectorise using Intel's or AMD's SSE2
+
+
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef __m128d V;
+  V v;
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+// DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; _mm_store_sd(&a,x); a; }))
+// DEFINE_FUNCTION_V_R(vec_elt0,(*(CCTK_REAL const*)&x))
+// This generates the fastest code with PGI compilers
+DEFINE_FUNCTION_V_R(vec_elt0,({ CCTK_REAL a; asm ("" : "=x" (a) : "0" (x)); a; }))
+#else
+DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) // this is a no-op
+#endif
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_unpackhi_pd(x,x)))
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+// Call the implementation with the modulus
+#define vec_loadu_maybe(off,p)                                  \
+  (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p)))
+#define vec_loadu_maybe3(off1,off2,off3,p)                      \
+  (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off2)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off3)&(CCTK_REAL_VEC_SIZE-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_storeu,_mm_storeu_pd(&p,x))
+#if defined(KRANC_CACHE)
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+#else
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_store_pd(&p,x))
+#endif
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storel_pd(&p,x); break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storeh_pd((&p)+1,x); break;
+  default: assert(0);
+  }
+}
+
+
+
+// Functions and operators
+
+// Single-argument operators
+#if 0
+DEFINE_FUNCTION_V_V(operator+,x)
+static CCTK_REAL_VEC const vec_neg_mask =
+  (CCTK_REAL_VEC::V)(__m128i) { 0x8000000000000000ULL, 0x8000000000000000ULL };
+DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask))
+#endif
+DEFINE_FUNCTION_V_V(operator+,+x.v)
+DEFINE_FUNCTION_V_V(operator-,-x.v)
+
+// Double-argument operators, both vectors
+#if 0
+DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
+#endif
+DEFINE_FUNCTION_VV_V(operator+,x.v+y.v)
+DEFINE_FUNCTION_VV_V(operator-,x.v-y.v)
+DEFINE_FUNCTION_VV_V(operator*,x.v*y.v)
+DEFINE_FUNCTION_VV_V(operator/,x.v/y.v)
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Cheap functions
+#if defined(__PGI) && defined (__amd64__)
+// The PGI compiler does not understand __m128d literals
+static union {
+  CCTK_REAL_VEC::S s[CCTK_REAL_VEC_SIZE];
+  CCTK_REAL_VEC::V v;
+} vec_fabs_mask_impl = { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL };
+#  define vec_fabs_mask (vec_fabs_mask_impl.v)
+#else
+static CCTK_REAL_VEC const vec_fabs_mask =
+  (CCTK_REAL_VEC::V)(__m128i) { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL };
+#endif
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) vec_set1(x)
+
+#if defined(__PGI) && defined (__amd64__)
+// Special case for PGI 9.0.4 to avoid an internal compiler error
+#undef IfThen
+static inline
+CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const x, CCTK_REAL_VEC const y)
+{
+  union {
+    __m128i vi;
+    CCTK_REAL_VEC::V v;
+  } mask;
+  mask.vi = _mm_set1_epi64x(-(long long)cond);
+  return _mm_or_pd(_mm_and_pd(x.v, mask.v), _mm_andnot_pd(mask.v, y.v));
+}
+#endif
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh
new file mode 100644
index 0000000..7e06017
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX-direct.hh
@@ -0,0 +1,111 @@
+// Vectorise using IBM's Altivec
+
+// Use the type vector double directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+typedef vector double CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+
+#define vec_set1(a) (vec_splats(a))
+#if defined(__GNUC__)
+// GNU doesn't support array indices on vectors
+union vec_mask {
+  double elts[2];
+  vector double v;
+};
+#  define vec_set(a,b) ({ vec_mask x_set; x_set.elts[0]=(a); x_set.elts[1]=(b); x_set.v; })
+#else
+#  define vec_set(a,b) ({ CCTK_REAL_VEC x_set; x_set[0]=(a); x_set[1]=(b); x_set; })
+#endif
+
+// Get a scalar from the vector
+#if defined(__GNUC__)
+// GNU doesn't support array indices on vectors
+#  define vec_elt0(x) ({ vec_mask x_elt0; x_elt0.v=(x); x_elt0.elts[0]; })
+#  define vec_elt1(x) ({ vec_mask x_elt1; x_elt1.v=(x); x_elt1.elts[1]; })
+#else
+#  define vec_elt0(x) ((x)[0])
+#  define vec_elt1(x) ((x)[1])
+#endif
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec_load(p)  (*(CCTK_REAL_VEC const*)&(p))
+#define vec_loadu(p) (vec_load(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (vec_load(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec_store(p,x) (*(CCTK_REAL_VEC*)&(p)=(x))
+#define vec_storeu(p,x) (*(CCTK_REAL_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec_store_nta(p,x) vec_store(p,x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) ((p)=vec_elt0(x))
+#define vec_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec_elt1(x))
+
+
+
+// Functions and operators
+
+// Other Altivec functions are:
+//    nabs: -abs a
+//    madd msub nmadd nmsub: [+-]a*b[+-]c
+
+// Triple-argument operators, all vectors
+#undef fmadd
+#undef fmsub
+#undef fnmadd
+#undef fnmsub
+#define fmadd(x,y,z)  (vec_madd(x,y,z))
+#define fmsub(x,y,z)  (vec_msub(x,y,z))
+#define fnmadd(x,y,z) (vec_nmadd(x,y,z))
+#define fnmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#undef kfabs
+#undef kfmax
+#undef kfmin
+#define kfabs(x)   (vec_abs(x))
+#define kfmax(x,y) (vec_max(x,y))
+#define kfmin(x,y) (vec_min(x,y))
+
+// Expensive functions
+#undef kexp
+#undef klog
+#undef kpow
+#undef ksqrt
+#define kexp(x_)    ({ CCTK_REAL_VEC const x_exp=(x_); vec_set(exp(vec_elt0(x_exp)),exp(vec_elt1(x_exp))); })
+#define klog(x_)    ({ CCTK_REAL_VEC const x_log=(x_); vec_set(log(vec_elt0(x_log)),log(vec_elt1(x_log))); })
+#define kpow(x_,a_) ({ CCTK_REAL_VEC const x_pow=(x_); CCTK_REAL const a_pow=(a_); vec_set(pow(vec_elt0(x_pow),a_pow),pow(vec_elt1(x_pow),a_pow)); })
+#define ksqrt(x_)   ({ CCTK_REAL_VEC const x_sqrt=(x_); vec_set(sqrt(vec_elt0(x_sqrt)),sqrt(vec_elt1(x_sqrt))); })
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1((CCTK_REAL)(x)))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh
new file mode 100644
index 0000000..f591647
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-VSX.hh
@@ -0,0 +1,212 @@
+// Vectorise using IBM's Altivec
+
+
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef vector double V;
+  V v;
+  union vec_mask {
+    S elts[2];
+    V v;
+  };
+  
+  // Set a vector from scalars
+#if 0
+  // IBM
+  inline CCTK_REAL_VEC(S const a, S const b) { v[0]=a; v[1]=b; }
+#endif
+#if 0
+  inline CCTK_REAL_VEC(S const a, S const b):
+    v(vec_mergel(vec_splats(a), vec_splats(b))) { }
+#endif
+  inline CCTK_REAL_VEC(S const a, S const b)
+  {
+    vec_mask x;
+    x.elts[0] = a;
+    x.elts[1] = b;
+    v = x.v;
+  }
+  
+  // Set a vector from a scalar, replicating the scalar
+  // Note: Could also use vec_xlds instead
+  inline CCTK_REAL_VEC(S const a): v(vec_splats(a)) { }
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+// Create vectors, extract vector elements
+DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a))
+DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b))
+
+// Get a scalar from the vector
+#if 0
+// IBM
+DEFINE_FUNCTION_V_R(vec_elt0,x.v[0])
+DEFINE_FUNCTION_V_R(vec_elt1,x.v[1])
+#endif
+static inline CCTK_REAL vec_elt0(CCTK_REAL_VEC const x)
+{
+  CCTK_REAL_VEC::vec_mask x1;
+  x1.v = x;
+  return x1.elts[0];
+}
+static inline CCTK_REAL vec_elt1(CCTK_REAL_VEC const x)
+{
+  CCTK_REAL_VEC::vec_mask x1;
+  x1.v = x;
+  return x1.elts[1];
+}
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,p)
+#if 0
+// IBM
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p)))
+#endif
+DEFINE_FUNCTION_PR_V(vec_loadu,p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+// Call the implementation with the modulus
+#define vec_loadu_maybe(off,p)                                  \
+  (vec_loadu_maybe_impl<(off)&(CCTK_REAL_VEC_SIZE-1>(p)))
+#define vec_loadu_maybe3(off1,off2,off3,p)                      \
+  (vec_loadu_maybe_impl3<(off1)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off2)&(CCTK_REAL_VEC_SIZE-1),         \
+                         (off3)&(CCTK_REAL_VEC_SIZE-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x)
+DEFINE_FUNCTION_PRV(vec_storeu,*(CCTK_REAL_VEC::V*)&p=x)
+// TODO: Use stvxl instruction?
+DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_store_nta_partial_lo (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: p=vec_elt0(x); break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_store_nta_partial_hi (CCTK_REAL& p, CCTK_REAL_VEC const x, int const n)
+{
+  switch (n) {
+  case 1: (&p)[1]=vec_elt1(x); break;
+  default: assert(0);
+  }
+}
+
+
+
+// Functions and operators
+
+// Other Altivec functions are:
+//    nabs: -abs a
+//    madd msub nmadd nmsub: [+-]a*b[+-]c
+
+// Single-argument operators
+#if 0
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,vec_neg(x))
+#endif
+DEFINE_FUNCTION_V_V(operator+,+x.v)
+DEFINE_FUNCTION_V_V(operator-,-x.v)
+
+// Double-argument operators, both vectors
+#if 0
+DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y))
+DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y))
+DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y))
+DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y))
+#endif
+DEFINE_FUNCTION_VV_V(operator+,x.v+y.v)
+DEFINE_FUNCTION_VV_V(operator-,x.v-y.v)
+DEFINE_FUNCTION_VV_V(operator*,x.v*y.v)
+DEFINE_FUNCTION_VV_V(operator/,x.v/y.v)
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Triple-argument operators, all vectors
+#undef fmadd
+#undef fmsub
+#undef fnmadd
+#undef fnmsub
+DEFINE_FUNCTION_VVV_V(fmadd,vec_madd(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fmsub,vec_msub(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fnmadd,vec_nmadd(x.v,y.v,z.v))
+DEFINE_FUNCTION_VVV_V(fnmsub,vec_nmsub(x.v,y.v,z.v))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,vec_abs(x.v))
+DEFINE_FUNCTION_VV_V(fmax,vec_max(x.v,y.v))
+DEFINE_FUNCTION_VV_V(fmin,vec_min(x.v,y.v))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x))))
+
+
+
+#undef Sign
+#define Sign(x) (42)
+
+#undef ToReal
+#define ToReal(x) (vec_set1(x))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh
new file mode 100644
index 0000000..f928ed8
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-default.hh
@@ -0,0 +1,31 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+
+
+// Use CCTK_REAL
+typedef CCTK_REAL CCTK_REAL_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+static int const CCTK_REAL_VEC_SIZE = 1;
+
+
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions (e.g. PGI).  This should also make
+// debug builds (which may not inline) more efficient.
+
+#define vec_load(p) (p)
+#define vec_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (p)
+#define vec_loadu_maybe3(off1,off2,off3,p) (p)
+
+#define vec_store(p,x) ((p)=(x))
+#define vec_store_nta(p,x) ((p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec_store_nta_partial_lo(p,x,n) (assert(0))
+#define vec_store_nta_partial_hi(p,x,n) (assert(0))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh
new file mode 100644
index 0000000..f5c0b22
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-define.hh
@@ -0,0 +1,104 @@
+// Define some macros that simplify defining short function that are
+// supposed to be inlined
+
+
+
+// Letters defining the prototype (argument and return value types):
+//    I: i,j: integer
+//    R: a,b: real
+//    V: x,y: vector (of real)
+//    P: p,q: pointer (i.e. const reference) to something
+//    L: l,m: L-value (i.e. non-const reference) to something
+
+
+
+// Load and store
+
+#define DEFINE_FUNCTION_PR_V(name,expr)         \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& p)         \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_PRV(name,expr)          \
+static inline                                   \
+void name (CCTK_REAL& p, CCTK_REAL_VEC const x) \
+{                                               \
+  expr;                                         \
+}
+
+#define DEFINE_FUNCTION_PVR(name,expr)          \
+static inline                                   \
+void name (CCTK_REAL_VEC& p, CCTK_REAL const a) \
+{                                               \
+  expr;                                         \
+}
+
+
+
+// Functions and operators
+
+#define DEFINE_FUNCTION_V_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x)      \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_V_R(name,expr)          \
+static inline                                   \
+CCTK_REAL name (CCTK_REAL_VEC const x)          \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_R_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const a)          \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_VV_V(name,expr)                                 \
+static inline                                                           \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y)       \
+  CCTK_ATTRIBUTE_PURE                                                   \
+{                                                                       \
+  return expr;                                                          \
+}
+
+#define DEFINE_FUNCTION_VR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL const a)   \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RV_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL_VEC const x)   \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const a, CCTK_REAL const b)       \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_VVV_V(name,expr)                                \
+static inline                                                           \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const x, CCTK_REAL_VEC const y, CCTK_REAL_VEC const z) \
+  CCTK_ATTRIBUTE_PURE                                                   \
+{                                                                       \
+  return expr;                                                          \
+}
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh
new file mode 100644
index 0000000..df83b3a
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-outdated.hh
@@ -0,0 +1,591 @@
+#ifndef VECTORS_HH
+#define VECTORS_HH
+
+
+
+// Vectorisation
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include <cctk.h>
+
+
+
+// I: i,j: integer
+// R: a,b: real
+// V: x,y: vector (of real)
+// P: p,q: pointer (i.e. const reference) to something
+// L: l,m: L-value (i.e. non-const reference) to something
+
+#define DEFINE_FUNCTION_PR_V(name,expr)         \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& p)         \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_PRV(name,expr)                  \
+static inline                                           \
+void name (CCTK_REAL& p, CCTK_REAL_VEC const& x)        \
+{                                                       \
+  expr;                                                 \
+}
+
+#define DEFINE_FUNCTION_PVR(name,expr)                  \
+static inline                                           \
+void name (CCTK_REAL_VEC& p, CCTK_REAL const& a)        \
+{                                                       \
+  expr;                                                 \
+}
+
+#define DEFINE_FUNCTION_V_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x)     \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_V_R(name,expr)          \
+static inline                                   \
+CCTK_REAL name (CCTK_REAL_VEC const& x)         \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_R_V(name,expr)          \
+static inline                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& a)         \
+  CCTK_ATTRIBUTE_PURE                           \
+{                                               \
+  return expr;                                  \
+}
+
+#define DEFINE_FUNCTION_VV_V(name,expr)                                 \
+static inline                                                           \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y)     \
+  CCTK_ATTRIBUTE_PURE                                                   \
+{                                                                       \
+  return expr;                                                          \
+}
+
+#define DEFINE_FUNCTION_VR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RV_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+#define DEFINE_FUNCTION_RR_V(name,expr)                         \
+static inline                                                   \
+CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b)     \
+  CCTK_ATTRIBUTE_PURE                                           \
+{                                                               \
+  return expr;                                                  \
+}
+
+
+
+// Intel, double
+#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8)
+
+#include <emmintrin.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef __m128d V;
+  V v;
+  
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(b,a)) { }
+
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { }
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const& v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+union vec_mask {
+  unsigned long long bits[2];
+  CCTK_REAL_VEC::V v;
+};
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtsd_f64 does not exist on PGI compilers
+static inline
+CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x)
+{
+  CCTK_REAL a; _mm_store_sd(&a,x); return a;
+}
+#else
+DEFINE_FUNCTION_V_R(vec_elt0,_mm_cvtsd_f64(x)) //this is a no-op
+#endif
+
+#if 0
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1))))
+#endif
+static inline
+CCTK_REAL vec_elt1 (CCTK_REAL_VEC const& x)
+{
+  CCTK_REAL a; _mm_storeh_pd(&a,x); return a;
+}
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+#if 0
+// Load a partial vector (duplicating the last loaded element to fill
+// the remaining elements)
+// TODO: Should this be aligned or unaligned?
+static inline
+CCTK_REAL_VEC vec_load_partial (CCTK_REAL const& p, int const n)
+{
+  switch (n) {
+  case 1: return _mm_load1_pd(p);
+  default: assert(0);
+  }
+}
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+template<int mod1,int mod2,int mod3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl3,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl<0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+template<>
+inline
+CCTK_REAL_VEC vec_loadu_maybe_impl3<0,0,0> (CCTK_REAL const& p)
+{
+  return vec_load(p);
+}
+// Call the implementation with the modulus
+template<int off>
+static inline
+CCTK_REAL_VEC vec_loadu_maybe (CCTK_REAL const& p)
+{
+  return vec_loadu_maybe_impl<off&(2-1)>(p);
+}
+template<int off1,int off2, int off3>
+static inline
+CCTK_REAL_VEC vec_loadu_maybe3 (CCTK_REAL const& p)
+{
+  return vec_loadu_maybe_impl3<off1&(2-1),off2&(2-1),off3&(2-1)>(p);
+}
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storel_pd(&p,x); break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: _mm_storeh_pd((&p)+1,x); break;
+  default: assert(0);
+  }
+}
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
+DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+#if 0
+DEFINE_FUNCTION_V_V(operator-,vec_set(0.0,0.0)-x)
+#endif
+static vec_mask const vec_neg_mask = 
+{ { 0x8000000000000000ULL, 0x8000000000000000ULL } };
+DEFINE_FUNCTION_V_V(operator-,_mm_xor_pd(x,vec_neg_mask.v))
+
+// Cheap functions
+static vec_mask const vec_fabs_mask =
+{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+// Special case for PGI to avoid internal compiler error
+#if defined(__PGI) && defined (__amd64__)
+#undef IfThen
+CCTK_REAL_VEC IfThen (bool const cond, CCTK_REAL_VEC const& x, CCTK_REAL_VEC co\
+nst& y)
+{
+  return cond*x + (not cond)*y;
+}
+#endif
+
+
+
+#if 0
+// Try to use the __m128d type directly.
+
+// This does not really work, because it is not possible to define
+// automatic conversion operators from double to __m128d, so that
+// explicit conversions are required.  This makes the code look more
+// clumsy.
+
+// Vector type corresponding to CCTK_REAL
+typedef __m128d CCTK_REAL_VEC;
+
+DEFINE_FUNCTION_R_V(vec_set1,_mm_set1_pd(a))
+DEFINE_FUNCTION_RR_V(vec_set,_mm_set_pd(b,a))
+
+// Get a scalar from the vector
+static inline
+CCTK_REAL vec_elt0 (CCTK_REAL_VEC const& x)
+{
+#if 0
+  // _mm_cvtsd_f64 does not exist on PGI compilers
+  return _mm_cvtsd_f64(x);      // this is a no-op
+#endif
+  CCTK_REAL a; _mm_store_sd(&a,x); return a;
+}
+
+DEFINE_FUNCTION_V_R(vec_elt1,vec_elt0(_mm_shuffle_pd(x,x,_MM_SHUFFLE2(1,1))))
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p))
+DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
+DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
+
+// Cheap functions
+static vec_mask const vec_fabs_mask =
+{ { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
+DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,vec_fabs_mask.v))
+DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
+DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
+DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+
+#endif
+
+
+
+// Intel, float
+#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4)
+
+#include <xmmintrin.h>
+
+// A vector type corresponding to CCTK_REAL
+typedef __m128 CCTK_REAL_VEC;
+
+
+
+// Power, double
+#elif defined(KRANC_VECTORS) && defined(__ALTIVEC__) && defined(_ARCH_PWR7) && defined(CCTK_REAL_PRECISION_8)
+
+#include <altivec.h>
+
+// Vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  typedef double S;
+  typedef vector double V;
+  V v;
+  
+  // vec_insert, vec_extract, vec_splat
+
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(S const& a, S const& b) { v[0]=a; v[1]=b; }
+  
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(S const& a): v(vec_splats(a)) { }
+  
+  // Convert from and to the underlying vector type
+  inline CCTK_REAL_VEC(V const& v_): v(v_) { }
+  inline operator V const() const { return v; }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { }
+};
+
+DEFINE_FUNCTION_R_V(vec_set1,CCTK_REAL_VEC(a))
+DEFINE_FUNCTION_RR_V(vec_set,CCTK_REAL_VEC(a,b))
+
+// Get a scalar from the vector
+DEFINE_FUNCTION_V_R(vec_elt0,x.v[0])
+DEFINE_FUNCTION_V_R(vec_elt1,x.v[1])
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+DEFINE_FUNCTION_PR_V(vec_load,p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_xld2(0,const_cast<CCTK_REAL*>(&p)))
+// vec_xlds
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int mod>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl,vec_loadu(p))
+// Implementation: load aligned if the modulus is zero
+#define static
+template<>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe_impl<0>,vec_load(p))
+#undef static
+// Call the implementation with the modulus
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe,vec_loadu_maybe_impl<off&(2-1)>(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC::V*)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,*(CCTK_REAL_VEC::V*)&p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: p=x.v[0]; break;
+  default: assert(0);
+  }
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  switch (n) {
+  case 1: (&p)[1]=x.v[1]; break;
+  default: assert(0);
+  }
+}
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,vec_add(x,y))
+DEFINE_FUNCTION_VV_V(operator-,vec_sub(x,y))
+DEFINE_FUNCTION_VV_V(operator*,vec_mul(x,y))
+DEFINE_FUNCTION_VV_V(operator/,vec_div(x,y))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,x+vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator-,x-vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator*,x*vec_set1(a))
+DEFINE_FUNCTION_VR_V(operator/,x/vec_set1(a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,vec_set1(a)+x)
+DEFINE_FUNCTION_RV_V(operator-,vec_set1(a)-x)
+DEFINE_FUNCTION_RV_V(operator*,vec_set1(a)*x)
+DEFINE_FUNCTION_RV_V(operator/,vec_set1(a)/x)
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,vec_neg(x))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,vec_abs(x))
+DEFINE_FUNCTION_VV_V(fmax,vec_max(x,y))
+DEFINE_FUNCTION_VV_V(fmin,vec_min(x,y))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,vec_set(exp(vec_elt0(x)),exp(vec_elt1(x))))
+DEFINE_FUNCTION_V_V(log,vec_set(log(vec_elt0(x)),log(vec_elt1(x))))
+DEFINE_FUNCTION_VR_V(pow,vec_set(pow(vec_elt0(x),a),pow(vec_elt1(x),a)))
+DEFINE_FUNCTION_V_V(sqrt,vec_set(sqrt(vec_elt0(x)),sqrt(vec_elt1(x))))
+
+
+
+// Fallback: pseudo-vectorisation
+#elif 0
+
+// There is no vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  CCTK_REAL v, w;
+  
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { }
+
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x.v), w(x.w) { }
+};
+
+
+
+DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p))
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadm,vec_load(p))
+
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x))
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v+y.v,x.w+y.w))
+DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v-y.v,x.w-y.w))
+DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v*y.v,x.w*y.w))
+DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v/y.v,x.w/y.w))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v+a,x.w+a))
+DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v-a,x.w-a))
+DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v*a,x.w*a))
+DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v/a,x.w/a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v,a+x.w))
+DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v,a-x.w))
+DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v,a*x.w))
+DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v,a/x.w))
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v,-x.w))
+
+// Cheap functions
+DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v),fabs(x.w)))
+DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v,y.v),fmax(x.w,y.w)))
+DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v,y.v),fmin(x.w,y.w)))
+DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v),sqrt(x.w)))
+
+// Expensive functions
+DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v),exp(x.w)))
+DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v),log(x.w)))
+DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v,a),pow(x.w,a)))
+
+
+
+// Fallback: no vectorisation
+#else
+
+// There is no vector type corresponding to CCTK_REAL
+typedef CCTK_REAL CCTK_REAL_VEC;
+
+
+
+DEFINE_FUNCTION_PR_V(vec_load,p)
+DEFINE_FUNCTION_PR_V(vec_loadu,p)
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: default to unaligned load
+template<int off>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe,p)
+template<int off1,int off2, int off3>
+DEFINE_FUNCTION_PR_V(vec_loadu_maybe3,p)
+
+DEFINE_FUNCTION_PRV(vec_store,p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,p=x)
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+static inline
+void vec_storel_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  assert(0);
+}
+static inline
+void vec_storeh_partial (CCTK_REAL& p, CCTK_REAL_VEC const& x, int const n)
+{
+  assert(0);
+}
+
+
+
+#endif
+
+
+
+#undef DEFINE_FUNCTION_PR_V
+#undef DEFINE_FUNCTION_PRV
+#undef DEFINE_FUNCTION_V_V
+#undef DEFINE_FUNCTION_R_V
+#undef DEFINE_FUNCTION_VV_V
+#undef DEFINE_FUNCTION_VR_V
+#undef DEFINE_FUNCTION_RV_V
+#undef DEFINE_FUNCTION_RR_V
+
+
+
+// Number of vector elements in a CCTK_REAL_VEC
+static
+int const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+
+
+
+#endif  // #ifndef VECTORS_HH
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh
new file mode 100644
index 0000000..f439c9b
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-pseudo.hh
@@ -0,0 +1,72 @@
+// Pseudo vectorisation using scalar operations
+
+
+
+// Number of vector elements in a CCTK_REAL_VEC
+static int const CCTK_REAL_VEC_SIZE = 2;
+
+// There is no vector type corresponding to CCTK_REAL
+struct CCTK_REAL_VEC {
+  // The underlying scalar and vector types
+  CCTK_REAL v[CCTK_REAL_VEC_SIZE];
+  
+  // Set a vector from scalars
+  inline CCTK_REAL_VEC(CCTK_REAL const& a, CCTK_REAL const& b): v(a), w(b) { }
+
+  // Set a vector from a scalar, replicating the scalar
+  inline CCTK_REAL_VEC(CCTK_REAL const& a): v(a), w(a) { }
+  
+  inline CCTK_REAL_VEC() { }
+  
+  // Copy constructor
+  inline CCTK_REAL_VEC(CCTK_REAL_VEC const& x) { v[0]=x.v[0]; v[1]=x.v[1]; }
+};
+
+
+
+// Load and store vectors
+
+DEFINE_FUNCTION_PR_V(vec_load,*(CCTK_REAL_VEC const* restrict)&p)
+DEFINE_FUNCTION_PR_V(vec_loadu,vec_load(p))
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#define vec_loadu_maybe(off,p) (vec_load(p))
+#define vec_loadu_maybe3(off1,off2,off3,p) (vec_load(p))
+
+DEFINE_FUNCTION_PRV(vec_store,*(CCTK_REAL_VEC* restrict)&p=x)
+DEFINE_FUNCTION_PRV(vec_store_nta,vec_store(p,x))
+
+
+
+// Functions and operators
+
+// Double-argument operators, both vectors
+DEFINE_FUNCTION_VV_V(operator+,CCTK_REAL_VEC(x.v[0]+y.v[0],x.v[1]+y.v[1]))
+DEFINE_FUNCTION_VV_V(operator-,CCTK_REAL_VEC(x.v[0]-y.v[0],x.v[1]-y.v[1]))
+DEFINE_FUNCTION_VV_V(operator*,CCTK_REAL_VEC(x.v[0]*y.v[0],x.v[1]*y.v[1]))
+DEFINE_FUNCTION_VV_V(operator/,CCTK_REAL_VEC(x.v[0]/y.v[0],x.v[1]/y.v[1]))
+
+// Double-argument operators, vector and scalar
+DEFINE_FUNCTION_VR_V(operator+,CCTK_REAL_VEC(x.v[0]+a,x.v[1]+a))
+DEFINE_FUNCTION_VR_V(operator-,CCTK_REAL_VEC(x.v[0]-a,x.v[1]-a))
+DEFINE_FUNCTION_VR_V(operator*,CCTK_REAL_VEC(x.v[0]*a,x.v[1]*a))
+DEFINE_FUNCTION_VR_V(operator/,CCTK_REAL_VEC(x.v[0]/a,x.v[1]/a))
+
+// Double-argument operators, scalar and vector
+DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a+x.v[0],a+x.v[1]))
+DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a-x.v[0],a-x.v[1]))
+DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a*x.v[0],a*x.v[1]))
+DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a/x.v[0],a/x.v[1]))
+
+// Single-argument operators
+DEFINE_FUNCTION_V_V(operator+,x)
+DEFINE_FUNCTION_V_V(operator-,CCTK_REAL_VEC(-x.v[0],-x.v[1]))
+
+// Functions
+DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.v[0]),exp(x.v[1])))
+DEFINE_FUNCTION_V_V(fabs,CCTK_REAL_VEC(fabs(x.v[0]),fabs(x.v[1])))
+DEFINE_FUNCTION_VV_V(fmax,CCTK_REAL_VEC(fmax(x.v[0],y.v[0]),fmax(x.v[1],y.v[1])))
+DEFINE_FUNCTION_VV_V(fmin,CCTK_REAL_VEC(fmin(x.v[0],y.v[0]),fmin(x.v[1],y.v[1])))
+DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.v[0]),log(x.v[1])))
+DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.v[0],a),pow(x.v[1],a)))
+DEFINE_FUNCTION_V_V(sqrt,CCTK_REAL_VEC(sqrt(x.v[0]),sqrt(x.v[1])))
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh
new file mode 100644
index 0000000..0d950c7
--- /dev/null
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors-undefine.hh
@@ -0,0 +1,14 @@
+// Undefine all macros defined in "Vectors-define.hh", so that we
+// leave a clean namespace
+
+
+
+#undef DEFINE_FUNCTION_PR_V
+#undef DEFINE_FUNCTION_PRV
+#undef DEFINE_FUNCTION_V_V
+#undef DEFINE_FUNCTION_R_V
+#undef DEFINE_FUNCTION_VV_V
+#undef DEFINE_FUNCTION_VR_V
+#undef DEFINE_FUNCTION_RV_V
+#undef DEFINE_FUNCTION_RR_V
+#undef DEFINE_FUNCTION_VVV_V
diff --git a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
index 3fb77e1..d32afb2 100644
--- a/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
+++ b/Auxiliary/Cactus/KrancNumericalTools/GenericFD/src/Vectors.hh
@@ -5,231 +5,47 @@
 
 // Vectorisation
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include <cctk.h>
-
-
-
-// I: i,j: integer
-// R: a,b: real
-// V: x,y: vector (of real)
-// P: p,q: pointer (i.e. const reference) to something
-// L: l,m: L-value (i.e. non-const reference) to something
-
-#define DEFINE_FUNCTION_PR_V(name,expr)         \
-inline                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& p)         \
-{                                               \
-  return expr;                                  \
-}
-
-#define DEFINE_FUNCTION_PRV(name,expr)                  \
-inline                                                  \
-void name (CCTK_REAL& p, CCTK_REAL_VEC const& x)        \
-{                                                       \
-  expr;                                                 \
-}
-
-#define DEFINE_FUNCTION_PVR(name,expr)                  \
-inline                                                  \
-void name (CCTK_REAL_VEC& p, CCTK_REAL const& a)        \
-{                                                       \
-  expr;                                                 \
-}
-
-#define DEFINE_FUNCTION_V_V(name,expr)          \
-inline                                          \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x)     \
-  CCTK_ATTRIBUTE_PURE                           \
-{                                               \
-  return CCTK_REAL_VEC(expr);                   \
-}
-
-#define DEFINE_FUNCTION_V_R(name,expr)          \
-inline                                          \
-CCTK_REAL name (CCTK_REAL_VEC const& x)         \
-  CCTK_ATTRIBUTE_PURE                           \
-{                                               \
-  return expr;                                  \
-}
-
-#define DEFINE_FUNCTION_R_V(name,expr)          \
-inline                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& a)         \
-  CCTK_ATTRIBUTE_PURE                           \
-{                                               \
-  return expr;                                  \
-}
-
-#define DEFINE_FUNCTION_VV_V(name,expr)                                 \
-inline                                                                  \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL_VEC const& y)     \
-  CCTK_ATTRIBUTE_PURE                                                   \
-{                                                                       \
-  return expr;                                                          \
-}
-
-#define DEFINE_FUNCTION_VR_V(name,expr)                         \
-inline                                                          \
-CCTK_REAL_VEC name (CCTK_REAL_VEC const& x, CCTK_REAL const& a) \
-  CCTK_ATTRIBUTE_PURE                                           \
-{                                                               \
-  return expr;                                                  \
-}
-
-#define DEFINE_FUNCTION_RV_V(name,expr)                         \
-inline                                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL_VEC const& x) \
-  CCTK_ATTRIBUTE_PURE                                           \
-{                                                               \
-  return expr;                                                  \
-}
-
-#define DEFINE_FUNCTION_RR_V(name,expr)                         \
-inline                                                          \
-CCTK_REAL_VEC name (CCTK_REAL const& a, CCTK_REAL const& b)     \
-  CCTK_ATTRIBUTE_PURE                                           \
-{                                                               \
-  return expr;                                                  \
-}
-
-
-
-// Intel, double
-#if defined(KRANC_VECTORS) && defined(__SSE2__) && defined(CCTK_REAL_PRECISION_8)
-
-#include <emmintrin.h>
-
-// Vector type corresponding to CCTK_REAL
-struct CCTK_REAL_VEC {
-  // The underlying scalar and vector types
-  typedef double S;
-  typedef __m128d V;
-  static int const n = sizeof(V)/sizeof(S);
-  V v;
-  
-  // Set a vector from scalars
-  CCTK_REAL_VEC(S const& a, S const& b): v(_mm_set_pd(a,b)) { };
-  
-  // Get a scalar from the vector
-  S elt0() const { return _mm_cvtsd_f64(v); /* this is a no-op */ }
-  S elt1() const { return _mm_cvtsd_f64(_mm_shuffle_pd(v,v,_MM_SHUFFLE2(1,1))); }
-  
-  // Set a vector from a scalar, replicating the scalar
-  CCTK_REAL_VEC(S const& a): v(_mm_set1_pd(a)) { };
-  
-  // Convert from and to the underlying vector type
-  CCTK_REAL_VEC(V const& v_): v(v_) { };
-  operator V const() const { return v; }
-  
-  CCTK_REAL_VEC() { };
-  
-  // Copy constructor
-  CCTK_REAL_VEC(CCTK_REAL_VEC const& x): v(x) { };
-};
-
-// Load a vector from memory (aligned and unaligned); this loads from
-// a reference to a scalar
-DEFINE_FUNCTION_PR_V(vec_load,_mm_load_pd(&p));
-DEFINE_FUNCTION_PR_V(vec_loadu,_mm_loadu_pd(&p));
-
-// Store a vector to memory (aligned and non-temporal); this stores to
-// a reference to a scalar
-DEFINE_FUNCTION_PRV(vec_store,_mm_store_pd(&p,x))
-DEFINE_FUNCTION_PRV(vec_store_nta,_mm_stream_pd(&p,x))
-
-// Double-argument operators, both vectors
-DEFINE_FUNCTION_VV_V(operator+,_mm_add_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator-,_mm_sub_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator*,_mm_mul_pd(x,y))
-DEFINE_FUNCTION_VV_V(operator/,_mm_div_pd(x,y))
-
-// Double-argument operators, vector and scalar
-DEFINE_FUNCTION_VR_V(operator+,x+CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator-,x-CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator*,x*CCTK_REAL_VEC(a))
-DEFINE_FUNCTION_VR_V(operator/,x/CCTK_REAL_VEC(a))
-
-// Double-argument operators, scalar and vector
-DEFINE_FUNCTION_RV_V(operator+,CCTK_REAL_VEC(a)+x)
-DEFINE_FUNCTION_RV_V(operator-,CCTK_REAL_VEC(a)-x)
-DEFINE_FUNCTION_RV_V(operator*,CCTK_REAL_VEC(a)*x)
-DEFINE_FUNCTION_RV_V(operator/,CCTK_REAL_VEC(a)/x)
-
-// Single-argument operators
-DEFINE_FUNCTION_V_V(operator+,x)
-DEFINE_FUNCTION_V_V(operator-,0.0-x)
-
-// Cheap functions
-static union {
-  unsigned long long const bits[2];
-  CCTK_REAL_VEC::V v;
-} const fabs_mask = 
-  { { 0x7fffffffffffffffULL, 0x7fffffffffffffffULL } };
-DEFINE_FUNCTION_V_V(fabs,_mm_and_pd(x,fabs_mask.v))
-DEFINE_FUNCTION_VV_V(fmax,_mm_max_pd(x,y))
-DEFINE_FUNCTION_VV_V(fmin,_mm_min_pd(x,y))
-DEFINE_FUNCTION_V_V(sqrt,_mm_sqrt_pd(x))
-
-// Expensive functions
-DEFINE_FUNCTION_V_V(exp,CCTK_REAL_VEC(exp(x.elt0()),exp(x.elt1())))
-DEFINE_FUNCTION_V_V(log,CCTK_REAL_VEC(log(x.elt0()),log(x.elt1())))
-DEFINE_FUNCTION_VR_V(pow,CCTK_REAL_VEC(pow(x.elt0(),a),pow(x.elt1(),a)))
-
-// Un-implemented functions
-DEFINE_FUNCTION_V_R(signbit,0)
-
-
-
-#if 0
-// Intel, float
-#elif defined(KRANC_VECTORS) && defined(__SSE__) && defined(CCTK_REAL_PRECISION_4)
-
-#include <xmmintrin.h>
-
-// A vector type corresponding to CCTK_REAL
-typedef __m128 CCTK_REAL_VEC;
-#endif
-
 
+#include <cctk.h>
 
-// Fallback: no vectorisation
-#else
-
-// There is no vector type corresponding to CCTK_REAL
-typedef CCTK_REAL CCTK_REAL_VEC;
 
 
+#include "Vectors-define.hh"
 
-DEFINE_FUNCTION_PR_V(vec_load,p)
-DEFINE_FUNCTION_PR_V(vec_loadu,p)
+#if defined(KRANC_VECTORS)
+// Vectorise
 
-DEFINE_FUNCTION_PRV(vec_store,p=x)
-DEFINE_FUNCTION_PRV(vec_store_nta,p=x)
+#  if ! defined(CCTK_REAL_PRECISION_8)
+#    error "Vectorisation is currently only supported for double precision"
+#  endif
 
-DEFINE_FUNCTION_V_R(signbit,x<0)
+#  if defined(__SSE2__)         // SSE2 (Intel)
+#    if defined(KRANC_DIRECT)
+#      include "Vectors-SSE2-direct.hh"
+#    else
+#      include "Vectors-SSE2.hh"
+#    endif
+#  elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power)
+#    if defined(KRANC_DIRECT)
+#      include "Vectors-VSX-direct.hh"
+#    else
+#      include "Vectors-VSX.hh"
+#    endif
+#  else
+#    include "Vectors-pseudo.hh"
+#  endif
 
+#else
+// Don't vectorise
 
+#  include "Vectors-default.hh"
 
 #endif
 
-
-
-#undef DEFINE_FUNCTION_PR_V
-#undef DEFINE_FUNCTION_PRV
-#undef DEFINE_FUNCTION_V_V
-#undef DEFINE_FUNCTION_R_V
-#undef DEFINE_FUNCTION_VV_V
-#undef DEFINE_FUNCTION_VR_V
-#undef DEFINE_FUNCTION_RV_V
-#undef DEFINE_FUNCTION_RR_V
-
-
-
-// Number of vector elements in a CCTK_REAL_VEC
-static
-size_t const CCTK_REAL_VEC_SIZE = sizeof(CCTK_REAL_VEC) / sizeof(CCTK_REAL);
+#include "Vectors-undefine.hh"
 
 
 
diff --git a/Tools/CodeGen/CalculationFunction.m b/Tools/CodeGen/CalculationFunction.m
index 3d86503..df79553 100644
--- a/Tools/CodeGen/CalculationFunction.m
+++ b/Tools/CodeGen/CalculationFunction.m
@@ -178,7 +178,7 @@ localName[x_] :=
 
 definePreDefinitions[pDefs_] :=
   CommentedBlock["Initialize predefined quantities",
-    Map[DeclareAssignVariable["CCTK_REAL", #[[1]], #[[2]]] &, pDefs]];
+    Map[DeclareAssignVariable["CCTK_REAL_VEC", #[[1]], #[[2]]] &, pDefs]];
 
 (* --------------------------------------------------------------------------
    Equations
@@ -326,7 +326,7 @@ Options[CreateCalculationFunction] = ThornOptions;
 
 CreateCalculationFunction[calc_, debug_, useCSE_, opts:OptionsPattern[]] :=
   Module[{gfs, allSymbols, knownSymbols,
-          shorts, eqs, parameters,
+          shorts, eqs, parameters, parameterRules,
           functionName, dsUsed, groups, pddefs, cleancalc, eqLoop, where,
           addToStencilWidth, pDefs, haveCondTextuals, condTextuals},
 
@@ -366,10 +366,14 @@ CreateCalculationFunction[calc_, debug_, useCSE_, opts:OptionsPattern[]] :=
 
     If[!lookupDefault[cleancalc, NoSimplify, False],
        InfoMessage[InfoFull, "Simplifying equations", eqs];
-       eqs = Simplify[eqs, {r>0}]]];
+       eqs = Simplify[eqs, {r>=0}]]];
 
   InfoMessage[InfoFull, "Equations:"];
 
+  (* Wrap parameters with ToReal *)
+  parameterRules = Map[(#->ToReal[#])&, parameters];
+  eqs = eqs /. parameterRules;
+
   Map[printEq, eqs];
 
   (* Check all the function names *)
@@ -537,7 +541,7 @@ equationLoop[eqs_, cleancalc_, gfs_, shorts_, incs_, groups_, pddefs_,
 
     GenericGridLoop[functionName,
     {
-      DeclareDerivatives[defsWithoutShorts, eqsOrdered],
+      (* DeclareDerivatives[defsWithoutShorts, eqsOrdered], *)
 
       CommentedBlock["Assign local copies of grid functions",
         Map[DeclareMaybeAssignVariableInLoop[
@@ -558,6 +562,22 @@ equationLoop[eqs_, cleancalc_, gfs_, shorts_, incs_, groups_, pddefs_,
         Map[InfoVariable[#[[1]]] &, (eqs2 /. localMap)],
         ""],
 
+      CommentedBlock["If necessary, store only partial vectors after the first iteration",
+        ConditionalOnParameterTextual["CCTK_REAL_VEC_SIZE > 1 && i<lc_imin",
+          {
+            DeclareAssignVariable["int", "elt_count", "lc_imin-i"],
+            Map[StoreHighPartialVariableInLoop[GridName[#], localName[#], "elt_count"] &,
+                gfsInLHS],
+            "continue;\n"
+          }]],
+      CommentedBlock["If necessary, store only partial vectors after the last iteration",
+        ConditionalOnParameterTextual["CCTK_REAL_VEC_SIZE > 1 && i+CCTK_REAL_VEC_SIZE > lc_imax",
+          {
+            DeclareAssignVariable["int", "elt_count", "lc_imax-i"],
+            Map[StoreLowPartialVariableInLoop[GridName[#], localName[#], "elt_count"] &,
+                gfsInLHS],
+            "break;\n"
+          }]],
       CommentedBlock["Copy local copies back to grid functions",
         Map[StoreVariableInLoop[GridName[#], localName[#]] &,
             gfsInLHS]],
diff --git a/Tools/CodeGen/CodeGen.m b/Tools/CodeGen/CodeGen.m
index de8dbd2..09e7fe1 100644
--- a/Tools/CodeGen/CodeGen.m
+++ b/Tools/CodeGen/CodeGen.m
@@ -62,14 +62,16 @@ AssignVariableInLoop::usage = "AssignVariableInLoop[dest_, src_] returns a block
   "that assigns 'src' to 'dest'.";
 StoreVariableInLoop::usage = "StoreVariableInLoop[dest_, src_] returns a block of code " <>
   "that assigns 'src' to 'dest'.";
+StoreLowPartialVariableInLoop::usage = "StoreLowPartialVariableInLoop[dest_, src_, count_] returns a block of code " <>
+  "that assigns 'src' to 'dest'.";
+StoreHighPartialVariableInLoop::usage = "StoreHighPartialVariableInLoop[dest_, src_, count_] returns a block of code " <>
+  "that assigns 'src' to 'dest'.";
 DeclareAssignVariableInLoop::usage = "DeclareAssignVariableInLoop[type_, dest_, src_] returns a block of code " <>
   "that assigns 'src' to 'dest'.";
 MaybeAssignVariableInLoop::usage = "MaybeAssignVariableInLoop[dest_, src_, cond_] returns a block of code " <>
   "that assigns 'src' to 'dest'.";
 DeclareMaybeAssignVariableInLoop::usage = "DeclareMaybeAssignVariableInLoop[type_, dest_, src_, cond_] returns a block of code " <>
   "that assigns 'src' to 'dest'.";
-UNUSEDDeclareVariablesInLoopVectorised::usage = "";
-UNUSEDAssignVariablesInLoopVectorised::usage = "";
 TestForNaN::usage = "TestForNaN[expr_] returns a block of code " <>
   "that tests 'expr' for nan.";
 CommentedBlock::usage = "CommentedBlock[comment, block] returns a block consisting " <>
@@ -285,62 +287,24 @@ AssignVariableInLoop[dest_, src_] :=
 StoreVariableInLoop[dest_, src_] :=
   {"vec_store_nta(", dest, ",", src, ")", EOL[]};
 
+StoreLowPartialVariableInLoop[dest_, src_, count_] :=
+  {"vec_store_nta_partial_lo(", dest, ",", src, ",", count, ")", EOL[]};
+
+StoreHighPartialVariableInLoop[dest_, src_, count_] :=
+  {"vec_store_nta_partial_hi(", dest, ",", src, ",", count, ")", EOL[]};
+
 DeclareAssignVariableInLoop[type_, dest_, src_] :=
   {type, " const ", dest, " = vec_load(", src, ")", EOL[]};
 
 MaybeAssignVariableInLoop[dest_, src_, cond_] :=
   If [cond,
-      {dest, " = useMatter ? vec_load(", src, ") : 0.0", EOL[]},
+      {dest, " = useMatter ? vec_load(", src, ") : ToReal(0.0)", EOL[]},
       {dest, " = vec_load(", src, ")", EOL[]}];
 
 DeclareMaybeAssignVariableInLoop[type_, dest_, src_, mmaCond_, codeCond_] :=
   If [mmaCond,
-      {type, "  ", dest, " = (", codeCond, ") ? vec_load(", src, ") : 0.0", EOL[]},
-      {type, "  ", dest, " = vec_load(", src, ")", EOL[]}];
-
-(* TODO: move these into OpenMP loop *)
-UNUSEDDeclareVariablesInLoopVectorised[dests_, temps_, srcs_] :=
-  {
-   {"#undef LC_PRELOOP_STATEMENTS", "\n"},
-   {"#define LC_PRELOOP_STATEMENTS", " \\\n"},
-   {"int const GFD_imin = lc_imin + ((lc_imin + cctk_lsh[0] * (j + cctk_lsh[1] * k)) & (CCTK_REAL_VEC_SIZE-1))", "; \\\n"},
-   {"int const GFD_imax = lc_imax + ((lc_imax + cctk_lsh[0] * (j + cctk_lsh[1] * k)) & (CCTK_REAL_VEC_SIZE-1)) - CCTK_REAL_VEC_SIZE", "; \\\n"},
-   Map[Function[x, Module[{dest, temp, src},
-                          {dest, temp, src} = x;
-                          {"CCTK_REAL_VEC ", temp, "; \\\n"}]],
-       Transpose[{dests, temps, srcs}]],
-   {"\n"}
-  };
-
-UNUSEDAssignVariablesInLoopVectorised[dests_, temps_, srcs_] :=
-  {
-   {"{\n"},
-   {"  if (i < GFD_imin || i >= GFD_imax) {\n"},
-   Map[Function[x, Module[{dest, temp, src},
-                          {dest, temp, src} = x;
-                          {"    ", dest, "[index] = ", src, EOL[]}]],
-       Transpose[{dests, temps, srcs}]],
-   {"  } else {\n"},
-   {"    size_t const index0 = index & (CCTK_REAL_VEC_SIZE-1)", EOL[]},
-   Map[Function[x, Module[{dest, temp, src},
-                          {dest, temp, src} = x;
-                          {"    ((CCTK_REAL*)&", temp, ")[index0] = ",
-                           src, EOL[]}]],
-       Transpose[{dests, temps, srcs}]],
-   {"    if (index0 == CCTK_REAL_VEC_SIZE-1) {\n"},
-   {"      size_t const index1 = index - (CCTK_REAL_VEC_SIZE-1)", EOL[]},
-   Map[Function[x, Module[{dest, temp, src},
-                          {dest, temp, src} = x;
-                          {"      _mm_stream_pd (&", dest, "[index1], ",
-                           temp, ")", EOL[]}]],
-       Transpose[{dests, temps, srcs}]],
-   {"    }\n"},
-   {"  }\n"},
-   {"}\n"}
-  };
-
-UNUSEDAssignVariableInLoopsVectorised[dest_, temp_, src_] :=
-  {"GFD_save_and_store(", dest, ",", "index", ",", "&", temp, ",", src, ")", EOL[]};
+      {type, " ", dest, " = (", codeCond, ") ? vec_load(", src, ") : ToReal(0.0)", EOL[]},
+      {type, " ", dest, " = vec_load(", src, ")", EOL[]}];
 
 TestForNaN[expr_] :=
   {"if (isnan(", expr, ")) {\n",
@@ -463,13 +427,13 @@ DeclareFDVariables[] :=
 
 InitialiseFDSpacingVariablesC[] := 
   {
-    DeclareAssignVariable["CCTK_REAL", "dx", "CCTK_DELTA_SPACE(0)"],
-    DeclareAssignVariable["CCTK_REAL", "dy", "CCTK_DELTA_SPACE(1)"],
-    DeclareAssignVariable["CCTK_REAL", "dz", "CCTK_DELTA_SPACE(2)"],
     (* DeclareAssignVariable["int", "di", "CCTK_GFINDEX3D(cctkGH,1,0,0) - CCTK_GFINDEX3D(cctkGH,0,0,0)"], *)
     DeclareAssignVariable["int", "di", "1"],
     DeclareAssignVariable["int", "dj", "CCTK_GFINDEX3D(cctkGH,0,1,0) - CCTK_GFINDEX3D(cctkGH,0,0,0)"],
-    DeclareAssignVariable["int", "dk", "CCTK_GFINDEX3D(cctkGH,0,0,1) - CCTK_GFINDEX3D(cctkGH,0,0,0)"]
+    DeclareAssignVariable["int", "dk", "CCTK_GFINDEX3D(cctkGH,0,0,1) - CCTK_GFINDEX3D(cctkGH,0,0,0)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "dx", "ToReal(CCTK_DELTA_SPACE(0))"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "dy", "ToReal(CCTK_DELTA_SPACE(1))"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "dz", "ToReal(CCTK_DELTA_SPACE(2))"]
   };
 
 InitialiseFDSpacingVariablesFortran[] := 
@@ -486,17 +450,17 @@ InitialiseFDVariables[] :=
        InitialiseFDSpacingVariablesFortran[],
        InitialiseFDSpacingVariablesC[]],
     
-    DeclareAssignVariable["CCTK_REAL", "dxi", "1.0 / dx"],
-    DeclareAssignVariable["CCTK_REAL", "dyi", "1.0 / dy"],
-    DeclareAssignVariable["CCTK_REAL", "dzi", "1.0 / dz"],
-    DeclareAssignVariable["CCTK_REAL", "khalf", "0.5"],
-    DeclareAssignVariable["CCTK_REAL", "kthird", "1/3.0"],
-    DeclareAssignVariable["CCTK_REAL", "ktwothird", "2.0/3.0"],
-    DeclareAssignVariable["CCTK_REAL", "kfourthird", "4.0/3.0"],
-    DeclareAssignVariable["CCTK_REAL", "keightthird", "8.0/3.0"],
-    DeclareAssignVariable["CCTK_REAL", "hdxi", "0.5 * dxi"],
-    DeclareAssignVariable["CCTK_REAL", "hdyi", "0.5 * dyi"],
-    DeclareAssignVariable["CCTK_REAL", "hdzi", "0.5 * dzi"]}];
+    DeclareAssignVariable["CCTK_REAL_VEC", "dxi", "INV(dx)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "dyi", "INV(dy)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "dzi", "INV(dz)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "khalf", "ToReal(0.5)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "kthird", "ToReal(1.0/3.0)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "ktwothird", "ToReal(2.0/3.0)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "kfourthird", "ToReal(4.0/3.0)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "keightthird", "ToReal(8.0/3.0)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "hdxi", "fmul(ToReal(0.5), dxi)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "hdyi", "fmul(ToReal(0.5), dyi)"],
+    DeclareAssignVariable["CCTK_REAL_VEC", "hdzi", "fmul(ToReal(0.5), dzi)"]}];
 
 GridName[x_] := If[SOURCELANGUAGE == "C",
                    ToExpression[ToString[x] <> "[index]"],
@@ -657,20 +621,21 @@ GenericGridLoopUsingLoopControl[functionName_, block_] :=
     CommentedBlock["Loop over the grid points",
       {
         "#pragma omp parallel\n",
-        "LC_LOOP3 (", functionName, ",\n",
-        "          i,j,k, min[0],min[1],min[2], max[0],max[1],max[2],\n",
-        "          cctk_lsh[0],cctk_lsh[1],cctk_lsh[2])\n",
+        "LC_LOOP3VEC (", functionName, ",\n",
+        "             i,j,k, min[0],min[1],min[2], max[0],max[1],max[2],\n",
+        "             cctk_lsh[0],cctk_lsh[1],cctk_lsh[2],\n",
+        "             CCTK_REAL_VEC_SIZE)\n",
         "{\n",
         indentBlock[
           {
-             DeclareVariable["index", "// int"],
-             DeclareAssignVariable["int", "index", "CCTK_GFINDEX3D(cctkGH,i,j,k)"],
+             (* DeclareVariable["index", "// int"], *)
+             (* DeclareAssignVariable["int", "index", "CCTK_GFINDEX3D(cctkGH,i,j,k)"], *)
+             DeclareAssignVariable["int", "index", "di*i + dj*j + dk*k"],
              block
           }
         ],
-        "i += CCTK_REAL_VEC_SIZE-1;\n",
         "}\n",
-        "LC_ENDLOOP3 (", functionName, ");\n"
+        "LC_ENDLOOP3VEC (", functionName, ");\n"
       }
     ],
     ""
@@ -797,45 +762,117 @@ insertFile[name_] :=
 
 (* Take an expression x and replace occurrences of Powers with the C
 macros SQR, CUB, QAD *)
-ReplacePowers[x_] :=
-  Module[{rhs},
-    rhs = x /. Power[xx_, -1] -> INV[xx];
+ReplacePowers[expr_] :=
+  Module[{rhs, fmaRules, arithRules},
+    rhs = expr /. Power[xx_, -1] -> INV[xx];
 
     If[SOURCELANGUAGE == "C",
            Module[{},
-	     rhs = rhs //. Power[xx_, 2] -> SQR[xx];
-	     rhs = rhs //. Power[xx_, 3] -> CUB[xx];
-	     rhs = rhs //. Power[xx_, 4] -> QAD[xx];
+	     rhs = rhs /. Power[xx_,  2  ] -> SQR[xx];
+	     rhs = rhs /. Power[xx_,  3  ] -> CUB[xx];
+	     rhs = rhs /. Power[xx_,  4  ] -> QAD[xx];
+	     rhs = rhs /. Power[xx_, -2  ] -> INV[SQR[xx]];
+             rhs = rhs /. Power[xx_,  1/2] -> sqrt[xx];
+             rhs = rhs /. Power[xx_, -1/2] -> INV[sqrt[xx]];
+             rhs = rhs /. Power[xx_,  0.5] -> sqrt[xx];
+             rhs = rhs /. Power[xx_, -0.5] -> INV[sqrt[xx]];
 
-	     rhs = rhs //. xx_/2 -> khalf xx;
-	     rhs = rhs //. (-1/2) -> -khalf;
+             (*
+	     rhs = rhs /.  1/2 ->  khalf
+	     rhs = rhs /. -1/2 -> -khalf;
 
-	     rhs = rhs //. xx_/3 -> kthird xx;
-	     rhs = rhs //. (-1/3) -> -kthird;
+	     rhs = rhs /.  1/3 ->  kthird;
+	     rhs = rhs /. -1/3 -> -kthird;
 
-	     rhs = rhs //. 2/3 -> ktwothird;
-	     rhs = rhs //. (-2/3) -> -ktwothird;
+	     rhs = rhs /.  2/3 ->  ktwothird;
+	     rhs = rhs /. -2/3 -> -ktwothird;
 
-	     rhs = rhs //. 4/3 -> kfourthird;
-	     rhs = rhs //. (-4/3) -> -kfourthird;
+	     rhs = rhs /.  4/3 ->  kfourthird;
+	     rhs = rhs /. -4/3 -> -kfourthird;
 
-	     rhs = rhs //. 8/3 -> keightthird;
-	     rhs = rhs //. (-8/3) -> -keightthird;
+	     rhs = rhs /.  8/3 ->  keightthird;
+	     rhs = rhs /. -8/3 -> -keightthird;
+             *)
 
-	     rhs = rhs //. xx_ y_ + xx_ z_ -> xx(y+z); 
+             (* Avoid rational numbers *)
+	     rhs = rhs /. Rational[xx_,yy_] :> N[xx/yy, 30];
 
-             rhs = rhs //. Power[E, power_] -> exp[power];
-             rhs = rhs //. Power[xx_, 0.5] -> sqrt[xx];
+             rhs = rhs //.      IfThen[cond1_,xx1_,yy1_] +      IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[    xx1 +     xx2], Simplify[    yy1 +     yy2]];
+             rhs = rhs //. ff1_ IfThen[cond1_,xx1_,yy1_] +      IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[ff1 xx1 +     xx2], Simplify[ff1 yy1 +     yy2]];
+             rhs = rhs //.      IfThen[cond1_,xx1_,yy1_] + ff2_ IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[    xx1 + ff2 xx2], Simplify[    yy1 + ff2 yy2]];
+             rhs = rhs //. ff1_ IfThen[cond1_,xx1_,yy1_] + ff2_ IfThen[cond2_,xx2_,yy2_] /; cond1==cond2 :> IfThen[cond1, Simplify[ff1 xx1 + ff2 xx2], Simplify[ff1 yy1 + ff2 yy2]];
+
+             (* Is this still a good idea when FMA instructions are used? *)
+	     rhs = rhs //. xx_ yy_ + xx_ zz_ -> xx (yy+zz);
+	     rhs = rhs //. xx_ yy_ - xx_ zz_ -> xx (yy-zz);
+
+             rhs = rhs /. Power[E, power_] -> exp[power];
 
              (* there have been some problems doing the Max/Min
                 replacement via the preprocessor for C, so we do it
                 here *)
-             rhs = rhs //. Max[xx_, yy_] -> fmax[xx, yy];
-             rhs = rhs //. Min[xx_, yy_] -> fmin[xx, yy];
-
-             rhs = rhs //. Power[xx_, power_] -> pow[xx, power]],
-
-           rhs = rhs //. Power[xx_, power_] -> xx^power
+             rhs = rhs /. Max[xx_, yy_] -> fmax[xx, yy];
+             rhs = rhs /. Min[xx_, yy_] -> fmin[xx, yy];
+
+             rhs = rhs /. Power[xx_, power_] -> pow[xx, power];
+
+             (* FMA (fused multiply-add) instructions *)
+             (* Note that -x is represented as Times[-1, x] *)
+             isNotMinusOneQ[n_] := ! (IntegerQ[n] && n == -1);
+             isNotTimesMinusOneQ[n_] := ! MatchQ[n,- _];
+             fmaRules = {
+               + (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) + (zz_? isNotTimesMinusOneQ) :> fmadd [xx,yy,zz],
+               + (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) - (zz_? isNotTimesMinusOneQ) :> fmsub [xx,yy,zz],
+               - (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) + (zz_? isNotTimesMinusOneQ) :> fnmadd[xx,yy,zz],
+               - (xx_? isNotMinusOneQ) (yy_? isNotMinusOneQ) - (zz_? isNotTimesMinusOneQ) :> fnmsub[xx,yy,zz],
+               + (xx_? isNotMinusOneQ) (yy_ + 1) -> fmadd [xx, yy, xx],
+               + (xx_? isNotMinusOneQ) (yy_ - 1) -> fmsub [xx, yy, xx],
+               - (xx_? isNotMinusOneQ) (yy_ + 1) -> fnmadd[xx, yy, xx],
+               - (xx_? isNotMinusOneQ) (yy_ - 1) -> fnmsub[xx, yy, xx],
+               fmadd[xx_, - yy_, zz_] -> fnmsub[xx,yy,zz],
+               fmsub[xx_, - yy_, zz_] -> fnmadd[xx,yy,zz]
+             };
+             rhs = rhs //. fmaRules;
+
+             (* Constants *)
+             rhs = rhs /. xx_Integer/; xx!=-1 :> ToReal[xx];
+             rhs = rhs /. xx_Real -> ToReal[xx];
+             rhs = rhs /. - ToReal[xx_] -> ToReal[- xx];
+             rhs = rhs /. ToReal[xx_] + ToReal[yy_] -> ToReal[xx + yy];
+             rhs = rhs /. ToReal[xx_] * ToReal[yy_] -> ToReal[xx * yy];
+             rhs = rhs /. pow[xx_, ToReal[power_]] -> pow[xx, power];
+             rhs = rhs /. IfThen[ToReal[xx_], yy_, zz_] -> IfThen[xx, yy, zz];
+
+             (* Replace all operators and functions *)
+             (* fadd, fsub, fmul, fdiv, fneg *)
+             isNotFneg[n_] := ! MatchQ[n,fneg[_]];
+             arithRules = {
+               - xx_ -> fneg[xx],
+               xx_ * yy_ -> fmul[xx,yy],
+               xx_ / yy_ -> fdiv[xx,yy],
+               xx_ + yy_ -> fadd[xx,yy],
+               xx_ - yy_ -> fsub[xx,yy],
+               fmul[-1,xx_] -> fneg[xx],
+               fadd[xx_,fneg[yy_]] -> fsub[xx,yy],
+               fadd[fneg[xx_],(yy_? isNotFneg)] :> fsub[yy,xx],
+               Abs[xx_]      -> kfabs[xx],
+               Log[xx_]      -> klog[xx],
+               fabs[xx_]     -> kfabs[xx],
+               fmax[xx_,yy_] -> kfmax[xx,yy],
+               fmin[xx_,yy_] -> kfmin[xx,yy],
+               sqrt[xx_]     -> ksqrt[xx],
+               exp[xx_]      -> kexp[xx],
+               log[xx_]      -> klog[xx],
+               pow[xx_,yy_]  -> kpow[xx,yy]
+             };
+             rhs = rhs //. arithRules;
+             rhs = rhs /. IfThen[fmul[xx_, yy_], aa_, bb_] -> IfThen[xx*yy, aa, bb];
+             rhs = rhs /. ToReal[fneg[xx_]] -> ToReal[-xx];
+             rhs = rhs /. ToReal[fmul[xx_, yy_]] -> ToReal[xx*yy];
+             rhs = rhs /. kpow[xx_, fneg[power_]] -> kpow[xx, -power];
+           ],
+
+           rhs = rhs /. Power[xx_, power_] -> xx^power
        ];
 (*       Print[rhs//FullForm];*)
     rhs
diff --git a/Tools/CodeGen/Differencing.m b/Tools/CodeGen/Differencing.m
index 609aa9d..1ac5ba8 100644
--- a/Tools/CodeGen/Differencing.m
+++ b/Tools/CodeGen/Differencing.m
@@ -168,8 +168,6 @@ CreateDifferencingHeader[derivOps_, zeroDims_] :=
     pDefs = Union[Flatten[Map[First, mDefPairs]]];
     expressions = Flatten[Map[#[[2]]&, mDefPairs]];
 
-(*    expressions = Flatten[Map[ComponentDerivativeOperatorInlineDefinition, dupsRemoved]];*)
-
     {pDefs,Map[{#, "\n"} &, expressions]}];
 
 ordergfds[_[v1_,___], _[v2_,___]] := 
@@ -213,7 +211,10 @@ PrecomputeDerivative[d:pd_[gf_, inds___]] :=
 evaluateDerivative[d:pd_[gf_, inds___]] :=
   Module[{macroname},
     macroName = ComponentDerivativeOperatorMacroName[pd[inds] -> expr];
-    Return[ToString[macroName] <> "(" <> ToString[gf] <> ", i, j, k)"]];
+    (* Return[ToString[macroName] <> "(" <> ToString[gf] <> ", i, j, k)"] *)
+    (* Return[ToString[macroName] <> "(" <> ToString[gf] <> ")"] *)
+    Return[ToString[macroName] <> "(&" <> ToString[gf] <> "[index])"]
+  ];
 
 DeclareDerivative[d:pd_[gf_, inds___]] :=
   DeclareVariable[GridFunctionDerivativeName[d], "// CCTK_REAL_VEC"];
@@ -248,7 +249,7 @@ sbpMacroDefinition[macroName_, d_] :=
     <> "(i,j,k,sbp_" <> l <> "min,sbp_" <> l <> "max,d" <> ds <> ",u,q" <> ds <> ",cctkGH))"}]    ];
 
 ComponentDerivativeOperatorMacroDefinition[componentDerivOp:(name_[inds___] -> expr_)] :=
-  Module[{macroName, rhs, rhs2, i = "i", j = "j", k = "k", spacings, spacings2, pat, ss, num, den, newnum, signModifier, quotient, liName, rhs3, rhs4},
+  Module[{macroName, rhs, i = "i", j = "j", k = "k", spacings, spacings2, pat, ss, num, den, newnum, signModifier, quotient, liName},
   
     macroName = ComponentDerivativeOperatorMacroName[componentDerivOp];
 
@@ -262,22 +263,23 @@ ComponentDerivativeOperatorMacroDefinition[componentDerivOp:(name_[inds___] -> e
       Return[sbpMacroDefinition[macroName, 3]]];
 
     rhs = DifferenceGF[expr, i, j, k];
+(*    Print["rhs1 == ", FullForm[rhs]];*)
     spacings = {spacing[1] -> 1/"dxi", spacing[2] -> 1/"dyi", spacing[3] -> 1/"dzi"};
     spacings2 = {spacing[1] -> "dx", spacing[2] -> "dy", spacing[3] -> "dz"};
 
-    rhs2 = FullSimplify[rhs];
+    rhs = FullSimplify[rhs];
 
-(*    Print["rhs2 == ", FullForm[rhs2]];*)
+(*    Print["rhs2 == ", FullForm[rhs]];*)
 
     pat = Times[spInExpr:(Power[spacing[_],_]..), (Rational[x_,y_])..., rest__];
 (*    Print["pat == ", pat//FullForm];*)
 
-    If[MatchQ[rhs2, pat],
+    If[MatchQ[rhs, pat],
 (*       Print["matches!"];*)
-       ss = Times[rhs2 /. pat -> spInExpr];
+       ss = Times[rhs /. pat -> spInExpr];
 (*       Print["ss == ", ss];*)
-       num = rhs2 /. pat -> x;
-       den = rhs2 /. pat -> y;
+       num = rhs /. pat -> x;
+       den = rhs /. pat -> y;
 (*       Print["num == ", num];
        Print["den == ", den];*)
        If[{num, 1, 2} === {1, 2},(* Print["SEQ!"]; *) newnum = 1; den=1; signModifier = "",
@@ -303,39 +305,35 @@ ComponentDerivativeOperatorMacroDefinition[componentDerivOp:(name_[inds___] -> e
        liName = "p" <> signModifier <> quotient <> ToString[Apply[SequenceForm,Simplify[1/(ss /. spacings2)],{0,Infinity}]];
 (*       Print["liName == ", liName];*)
 
-       rhs3 = rhs2 /. pat -> Times[liName, rest],
+       (* rhs = rhs /. pat -> Times[liName, rest], *)
+       rhs = (rhs /. pat -> Times[liName, rest]) / liName,
 (*       Print["!!!!!!!!DOES NOT MATCH!!!!!!!!!"];*)
-       rhs3 = rhs2];
+       rhs = rhs];
 
-(*    Print["rhs3 == ", rhs3];*)
+(*    Print["rhs3 == ", FullForm[rhs]];*)
 
     pDefs = {{liName -> CFormHideStrings[ReplacePowers[num / den ss /. spacings2]]}};
 
-(*    rhs4 = Factor[rhs3];*)
-
-    rhs4 = rhs3 //. (x_ a_ + x_ b_) -> x(a+b);
-    rhs5 = rhs4 //. (x_ a_ - x_ b_) -> x(a-b);
+(*    rhs = Factor[rhs];*)
+    rhs = rhs //. (x_ a_ + x_ b_) -> x (a+b);
+    rhs = rhs //. (x_ a_ - x_ b_) -> x (a-b);
     
 (*    Print[componentDerivOp, ": "];
-    Print[FullForm[rhs5]];
+    Print[FullForm[rhs]];
     Print[""];*)
 
-    rhs6 = CFormHideStrings[ReplacePowers[rhs5 /. spacings]];
-    {pDefs, FlattenBlock[{"#define ", macroName, "(u,i,j,k) ", "(", rhs6, ")"}]}];
-
-ComponentDerivativeOperatorInlineDefinition[componentDerivOp:(name_[inds___] -> expr_)] :=
-  Module[{inlineName, rhs, rhs2, i = "i", j = "j", k = "k", spacings},
-  
-    inlineName = ComponentDerivativeOperatorMacroName[componentDerivOp];
-
-    rhs = DifferenceGF[expr, i, j, k];
-(*    rhs = DifferenceGFInline[expr, i, j, k];*)
-    spacings = {spacing[1] -> 1/"dxi", spacing[2] -> 1/"dyi", spacing[3] -> 1/"dzi"};
-    rhs2 = CFormHideStrings[FullSimplify[ReplacePowers[rhs /. spacings]]];
-
-    DefineFunction[inlineName, "static inline CCTK_REAL", 
-      "CCTK_REAL *u, int i, int j, int k",
-      {"return ", rhs2, ";\n"}]];
+    rhs = CFormHideStrings[ReplacePowers[rhs /. spacings]];
+    (* {pDefs, FlattenBlock[{"#define ", macroName, "(u,i,j,k) ", "(", rhs, ")"}]} *)
+    {pDefs, FlattenBlock[{
+      "#ifndef KRANC_DIFF_FUNCTIONS\n",
+       (* default, differencing operators are macros *)
+      "#  define ", macroName, "(u) ", "(fmul(", liName, ",", rhs, "))\n",
+      "#else\n",
+       (* new, differencing operators are static functions *)
+      "#  define ", macroName, "(u) ", "(", liName, "*", macroName, "_impl((u),dj,dk))\n",
+      "static CCTK_REAL_VEC ", macroName, "_impl(CCTK_REAL const* restrict const u, int const dj, int const dk) ", "{ return ", rhs, "; }\n",
+      "#endif\n"
+    }]}];
 
 ComponentDerivativeOperatorMacroName[componentDerivOp:(name_[inds___] -> expr_)] :=
   Module[{stringName},
@@ -368,14 +366,6 @@ DifferenceGF[op_, i_, j_, k_] :=
       Apply[Plus, Map[DifferenceGFTerm[#, i, j, k] &, expanded]],
       DifferenceGFTerm[expanded, i, j, k]]];
 
-DifferenceGFInline[op_, i_, j_, k_] :=
-  Module[{expanded},
-    expanded = Expand[op];
-    
-    If[Head[expanded] === Plus,
-      Apply[Plus, Map[DifferenceGFTermInline[#, i, j, k] &, expanded]],
-      DifferenceGFTerm[expanded, i, j, k]]];
-
 
 (* Return the fragment of a macro definition for defining a derivative
    operator *)
@@ -404,10 +394,31 @@ DifferenceGFTerm[op_, i_, j_, k_] :=
       "(int)(" <> ToString[CFormHideStrings[j+ny]] <> ")," <>
       "(int)(" <> ToString[CFormHideStrings[k+nz]] <> "))]",
 *)
-    remaining "vec_loadu((u)[index" <>
-      "+di*(" <> ToString[CFormHideStrings[nx]] <> ")" <>
+(*
+    remaining "vec_loadu_maybe(" <> ToString[CFormHideStrings[nx]] <> "," <>
+      "(u)[index" <>
+     "+di*(" <> ToString[CFormHideStrings[nx]] <> ")" <>
+     "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <>
+     "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])",
+*)
+(*
+    remaining "vec_loadu_maybe(" <> ToString[CFormHideStrings[nx]] <> "," <>
+      "(u)[(" <> ToString[CFormHideStrings[nx]] <> ")" <>
       "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <>
       "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])",
+*)
+    remaining "vec_loadu_maybe3" <>
+      "(" <> ToString[CFormHideStrings[nx /. {dir1->1, dir2->1, dir3->1}]] <> "," <>
+             ToString[CFormHideStrings[ny /. {dir1->1, dir2->1, dir3->1}]] <> "," <>
+             ToString[CFormHideStrings[nz /. {dir1->1, dir2->1, dir3->1}]] <> "," <>
+      "(u)[(" <> ToString[CFormHideStrings[nx]] <> ")" <>
+      "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <>
+      "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])",
+(*
+    remaining "vec_loadu(u[(" <> ToString[CFormHideStrings[nx]] <> ")" <>
+                      "+dj*(" <> ToString[CFormHideStrings[ny]] <> ")" <>
+                      "+dk*(" <> ToString[CFormHideStrings[nz]] <> ")])",
+*)
 (*
     remaining "(u)[CCTK_GFINDEX3D(cctkGH,floor((" <>
       ToString[CFormHideStrings[i+nx]] <> ")+0.5),floor((" <>
@@ -417,27 +428,6 @@ DifferenceGFTerm[op_, i_, j_, k_] :=
     remaining "u(" <> ToString[FortranForm[i+nx]] <> "," <> 
       ToString[FortranForm[j+ny]] <> "," <> ToString[FortranForm[k+nz]] <> ")"] ];
 
-(* Return the fragment of a function definition for defining a derivative
-   operator *)
-DifferenceGFTermInline[op_, i_, j_, k_] :=
-  Module[{nx, ny, nz, remaining},
-
-    If[op === 0,
-      Return[0]];
-
-    nx = Exponent[op, shift[1]];
-    ny = Exponent[op, shift[2]];
-    nz = Exponent[op, shift[3]];
-
-    remaining = op / (shift[1]^nx) / (shift[2]^ny) / (shift[3]^nz);
-
-    If[Cases[{remaining}, shift[_], Infinity] != {},
-      ThrowError["Could not parse difference operator:", op]];
-    
-    remaining "(u)[CCTK_GFINDEX3D(cctkGH," <> ToString[CFormHideStrings[i+nx]] <> "," <>
-      ToString[CFormHideStrings[j+ny]] <> "," <> ToString[CFormHideStrings[k+nz]] <> ")]"
-     ];
-
 
 DerivativeOperatorGFDs[gf_];
 
diff --git a/Tools/CodeGen/Kranc.m b/Tools/CodeGen/Kranc.m
index fd07c53..21acd21 100644
--- a/Tools/CodeGen/Kranc.m
+++ b/Tools/CodeGen/Kranc.m
@@ -22,7 +22,11 @@ BeginPackage["Kranc`"];
 
 (* CodeGen.m *)
 
-{INV, SQR, CUB, QAD, exp, pow, fmax, fmin, dx, dy, dz, khalf, kthird, ktwothird, kfourthird, keightthird};
+{INV, SQR, CUB, QAD, IfThen, ToReal, sqrt, exp, pow, fmax, fmin,
+ fmadd, fmsub, fnmadd, fnmsub, fneg, fadd, fsub, fmul, fdiv,
+ kfabs, kfmax, kfmin, ksqrt, kexp, klog, kpow,
+ dir1, dir2, dir3, dx, dy, dz,
+ khalf, kthird, ktwothird, kfourthird, keightthird};
 
 (* Helpers.m *)
 
diff --git a/Tools/CodeGen/Thorn.m b/Tools/CodeGen/Thorn.m
index 353be9c..4ada28a 100644
--- a/Tools/CodeGen/Thorn.m
+++ b/Tools/CodeGen/Thorn.m
@@ -476,11 +476,12 @@ CreateSchedule[globalStorageGroups_, scheduledGroups_, scheduledFunctions_] :=
 calculationMacros[] :=
   CommentedBlock["Define macros used in calculations",
     Map[{"#define ", #, "\n"} &,
-       {"INITVALUE  (42)",
-        "INV(x) ((1.0) / (x))"   ,        
-        "SQR(x) ((x) * (x))"   ,        
-        "CUB(x) ((x) * (x) * (x))"   , 
-        "QAD(x) ((x) * (x) * (x) * (x))"}]];
+       {"INITVALUE (42)",
+        "INV(x) (fdiv(ToReal(1.0),x))",
+        "SQR(x) (fmul(x,x))",
+        "CUB(x) (x*SQR(x))",
+        "QAD(x) (SQR(SQR(x)))"
+       }]];
 
 (* Given a list of Calculation structures as defined above, create a
    CodeGen representation of a source file that defines a function for
@@ -508,7 +509,7 @@ CreateSetterSource[calcs_, debug_, useCSE_, include_, imp_,
       ],
 
    Map[IncludeFile, Join[{"cctk.h", "cctk_Arguments.h", "cctk_Parameters.h",
-                     (*"precomputations.h",*) "GenericFD.h", "Differencing.h", "Vectors.hh"}, include,
+                     (*"precomputations.h",*) "GenericFD.h", "Vectors.hh", "Differencing.h"}, include,
                          If[OptionValue[UseLoopControl], {"loopcontrol.h"}, {}]]],
    calculationMacros[],
 
@@ -738,10 +739,10 @@ CreateMoLBoundariesSource[spec_] :=
      "if (CCTK_EQUALS(" <> boundpar <> ", \"none\"  ) ||\n",
      "    CCTK_EQUALS(" <> boundpar <> ", \"static\") ||\n",
      "    CCTK_EQUALS(" <> boundpar <> ", \"flat\"  ) ||\n",
-     "    CCTK_EQUALS(" <> boundpar <> ", \"zero\"  ) ) \n",
+     "    CCTK_EQUALS(" <> boundpar <> ", \"zero\"  ) )\n",
      "{\n",
 
-     "  ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, 1, -1, \n",
+     "  ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, boundary_width, -1,\n",
      "                    \"" <> fullgroupname <> "\", " <> boundpar <> ");\n",
 
      "  if (ierr < 0)\n",
@@ -760,10 +761,10 @@ CreateMoLBoundariesSource[spec_] :=
      "if (CCTK_EQUALS(" <> boundpar <> ", \"none\"  ) ||\n",
      "    CCTK_EQUALS(" <> boundpar <> ", \"static\") ||\n",
      "    CCTK_EQUALS(" <> boundpar <> ", \"flat\"  ) ||\n",
-     "    CCTK_EQUALS(" <> boundpar <> ", \"zero\"  ) ) \n",
+     "    CCTK_EQUALS(" <> boundpar <> ", \"zero\"  ) )\n",
      "{\n",
 
-     "  ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, -1, \n",
+     "  ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, boundary_width, -1,\n",
      "                    \"" <> fullgfname <> "\", " <> boundpar <> ");\n",
 
      "  if (ierr < 0)\n",
@@ -796,7 +797,7 @@ CreateMoLBoundariesSource[spec_] :=
       "     CCTK_WARN(0, \"could not set SPEED value in table!\");\n",
 
       "\n",
-      "  ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n",
+      "  ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n",
       "                    \"" <> fullgroupname <> "\", \"Radiation\");\n\n",
 
       "  if (ierr < 0)\n",
@@ -830,7 +831,7 @@ CreateMoLBoundariesSource[spec_] :=
       "      CCTK_WARN(0, \"could not set SPEED value in table!\");\n",
 
       "\n",
-      "  ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n",
+      "  ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n",
       "                    \"" <> fullgfname <> "\", \"Radiation\");\n\n",
 
       "  if (ierr < 0)\n",
@@ -859,7 +860,7 @@ CreateMoLBoundariesSource[spec_] :=
       "      CCTK_WARN(0, \"could not set SCALAR value in table!\");\n",
 
       "\n",
-      "  ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n",
+      "  ierr = Boundary_SelectGroupForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n",
       "                    \"" <> fullgroupname <> "\", \"scalar\");\n\n",
 
       "  if (ierr < 0)\n",
@@ -889,7 +890,7 @@ CreateMoLBoundariesSource[spec_] :=
       "    CCTK_WARN(0, \"could not set SCALAR value in table!\");\n",
 
       "\n",
-      "  ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, "<>myhandle<>", \n",
+      "  ierr = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, boundary_width, "<>myhandle<>", \n",
       "                    \"" <> fullgfname <> "\", \"scalar\");\n\n",
 
       "  if (ierr < 0)\n",
author	Ian Hinder <ian.hinder@aei.mpg.de>	2010-11-22 11:20:48 +0100
committer	Ian Hinder <ian.hinder@aei.mpg.de>	2010-11-23 00:19:37 +0100
commit	d6c4d4c2131107ef3a4004692823e2041394acd6 (patch)
tree	bbdd18d5c45e9d4b4f722c8792c4c7d1f02ced19
parent	c9431558d69ee1b8769c918044a45bdd50520b46 (diff)