author    eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>    2010-12-03 14:51:56 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>    2010-12-03 14:51:56 +0000
commit    421103bf1df43f250a452df460553f72c824f8db (patch)
tree      5742f5eeee2f14402b3ebc734bdaeb8b71c8a436
parent    96b656fc2d33f2d5238db7e8de17b11c5893084b (diff)
Add implementation
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@4 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--  src/make.code.defn         7
-rw-r--r--  src/vectors-default-4.h   79
-rw-r--r--  src/vectors-default-8.h   79
-rw-r--r--  src/vectors-intel-4.h    155
-rw-r--r--  src/vectors-intel-8.h    116
-rw-r--r--  src/vectors-power-4.h    128
-rw-r--r--  src/vectors-power-8.h    106
-rw-r--r--  src/vectors.h            284
8 files changed, 954 insertions, 0 deletions
diff --git a/src/make.code.defn b/src/make.code.defn
new file mode 100644
index 0000000..fa10ccb
--- /dev/null
+++ b/src/make.code.defn
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn Vectors
+
+# Source files in this directory
+SRCS =
+
+# Subdirectories containing source files
+SUBDIRS =
diff --git a/src/vectors-default-4.h b/src/vectors-default-4.h
new file mode 100644
index 0000000..e20109d
--- /dev/null
+++ b/src/vectors-default-4.h
@@ -0,0 +1,79 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+#include <assert.h>
+#include <math.h>
+
+// We use macros here so that we do not depend on the compiler's
+// willingness to inline functions. This should also make debug
+// builds (which may not inline) more efficient.
+
+
+
+// Use CCTK_REAL4
+#define CCTK_REAL4_VEC CCTK_REAL4
+
+// Number of vector elements in a vector
+#define CCTK_REAL4_VEC_SIZE 1
+
+
+
+// Create a vector replicating a scalar
+#define vec4_set1(a) (a)
+// Create a vector from N scalars
+#define vec4_set(a) (a)
+
+// Access vector elements
+#define vec4_elt0(x) (x)
+#define vec4_elt(x,d) (x)
+
+
+
+// Load an aligned vector from memory
+#define vec4_load(p) (p)
+// Load an unaligned vector from memory
+#define vec4_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size. These functions are
+// useful e.g. for loading neighbouring grid points while evaluating
+// finite differencing stencils.
+#define vec4_loadu_maybe(off,p) (p)
+#define vec4_loadu_maybe3(off1,off2,off3,p) (p)
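+
+// A usage sketch (hypothetical names: u is a grid function array,
+// idx a point index): in a finite differencing stencil the centre
+// point may be aligned while its x-neighbours are not, e.g.
+//   CCTK_REAL4_VEC const um = vec4_loadu_maybe3(-1,0,0, u[idx-1]);
+//   CCTK_REAL4_VEC const u0 = vec4_load        (        u[idx  ]);
+//   CCTK_REAL4_VEC const up = vec4_loadu_maybe3(+1,0,0, u[idx+1]);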
+
+// Aligned store
+#define vec4_store(p,x) ((p)=(x))
+// Unaligned store
+#define vec4_store_nta(p,x) ((p)=(x))
+
+// Store the n lower elements of a vector to memory
+#define vec4_store_nta_partial_lo(p,x,n) (assert(0))
+// Store the n higher elements of a vector into memory. This stores
+// the vector elements into memory locations as if element 0 were
+// stored at p.
+#define vec4_store_nta_partial_hi(p,x,n) (assert(0))
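+
+// A usage sketch (hypothetical names: a is an output array, x a
+// vector, i the loop index): when a loop over npoints points advances
+// in steps of CCTK_REAL4_VEC_SIZE, the final iteration may have only
+// n < CCTK_REAL4_VEC_SIZE points left, which would be stored with
+//   vec4_store_nta_partial_lo(a[i], x, n);
+// In this scalar fallback the vector size is 1, so partial stores can
+// never occur, hence the asserts above.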
+
+
+
+// Operators
+#define k4pos(x) (+(x))
+#define k4neg(x) (-(x))
+
+#define k4add(x,y) ((x)+(y))
+#define k4sub(x,y) ((x)-(y))
+#define k4mul(x,y) ((x)*(y))
+#define k4div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (+(x)*(y)+(z))
+#define k4msub(x,y,z) (+(x)*(y)-(z))
+#define k4nmadd(x,y,z) (-(x)*(y)-(z))
+#define k4nmsub(x,y,z) (-(x)*(y)+(z))
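+
+// Sign conventions spelled out, e.g. for x=2, y=3, z=1:
+//   k4madd (x,y,z) = +x*y+z =  7
+//   k4msub (x,y,z) = +x*y-z =  5
+//   k4nmadd(x,y,z) = -x*y-z = -7
+//   k4nmsub(x,y,z) = -x*y+z = -5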
+
+// Functions
+#define k4exp(x) (expf(x))
+#define k4fabs(x) (fabsf(x))
+#define k4fmax(x,y) (fmaxf(x,y))
+#define k4fmin(x,y) (fminf(x,y))
+#define k4fnabs(x) (-fabsf(x))
+#define k4log(x) (logf(x))
+#define k4pow(x,a) (powf(x,a))
+#define k4sqrt(x) (sqrtf(x))
diff --git a/src/vectors-default-8.h b/src/vectors-default-8.h
new file mode 100644
index 0000000..8ea3ac8
--- /dev/null
+++ b/src/vectors-default-8.h
@@ -0,0 +1,79 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+#include <assert.h>
+#include <math.h>
+
+// We use macros here so that we do not depend on the compiler's
+// willingness to inline functions. This should also make debug
+// builds (which may not inline) more efficient.
+
+
+
+// Use CCTK_REAL8
+#define CCTK_REAL8_VEC CCTK_REAL8
+
+// Number of vector elements in a vector
+#define CCTK_REAL8_VEC_SIZE 1
+
+
+
+// Create a vector replicating a scalar
+#define vec8_set1(a) (a)
+// Create a vector from N scalars
+#define vec8_set(a) (a)
+
+// Access vector elements
+#define vec8_elt0(x) (x)
+#define vec8_elt(x,d) (x)
+
+
+
+// Load an aligned vector from memory
+#define vec8_load(p) (p)
+// Load an unaligned vector from memory
+#define vec8_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size. These functions are
+// useful e.g. for loading neighbouring grid points while evaluating
+// finite differencing stencils.
+#define vec8_loadu_maybe(off,p) (p)
+#define vec8_loadu_maybe3(off1,off2,off3,p) (p)
+
+// Aligned store
+#define vec8_store(p,x) ((p)=(x))
+// Unaligned store
+#define vec8_store_nta(p,x) ((p)=(x))
+
+// Store the n lower elements of a vector to memory
+#define vec8_store_nta_partial_lo(p,x,n) (assert(0))
+// Store the n higher elements of a vector into memory. This stores
+// the vector elements into memory locations as if element 0 were
+// stored at p.
+#define vec8_store_nta_partial_hi(p,x,n) (assert(0))
+
+
+
+// Operators
+#define k8pos(x) (+(x))
+#define k8neg(x) (-(x))
+
+#define k8add(x,y) ((x)+(y))
+#define k8sub(x,y) ((x)-(y))
+#define k8mul(x,y) ((x)*(y))
+#define k8div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (+(x)*(y)+(z))
+#define k8msub(x,y,z) (+(x)*(y)-(z))
+#define k8nmadd(x,y,z) (-(x)*(y)-(z))
+#define k8nmsub(x,y,z) (-(x)*(y)+(z))
+
+// Functions
+#define k8exp(x) (exp(x))
+#define k8fabs(x) (fabs(x))
+#define k8fmax(x,y) (fmax(x,y))
+#define k8fmin(x,y) (fmin(x,y))
+#define k8fnabs(x) (-fabs(x))
+#define k8log(x) (log(x))
+#define k8pow(x,a) (pow(x,a))
+#define k8sqrt(x) (sqrt(x))
diff --git a/src/vectors-intel-4.h b/src/vectors-intel-4.h
new file mode 100644
index 0000000..4549a70
--- /dev/null
+++ b/src/vectors-intel-4.h
@@ -0,0 +1,155 @@
+// Vectorise using Intel's or AMD's SSE
+
+// Use the type __m128 directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <math.h>
+#include <xmmintrin.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL4_VEC __m128
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 4
+
+
+
+// Create vectors, extract vector elements
+
+#define vec4_set1(a) (_mm_set1_ps(a))
+#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
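+// (_mm_set_ps takes the highest element first, hence the reversal:
+// vec4_set(1,2,3,4) has element 0 equal to 1, matching memory order.)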
+
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtss_f32 does not exist on PGI compilers
+# define vec4_elt0(x) \
+({ \
+ CCTK_REAL4 aelt0; \
+ asm ("" : "=x" (aelt0) : "0" (x)); \
+ aelt0; \
+})
+#else
+# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
+#endif
+// _mm_shuffle_ps copies the lane selected by _MM_SHUFFLE's last
+// argument into lane 0, where vec4_elt0 can read it
+#define vec4_elt1(x)                                            \
+({                                                              \
+  CCTK_REAL4_VEC const xelt1=(x);                               \
+  vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,1,1,1)));  \
+})
+#define vec4_elt2(x)                                            \
+({                                                              \
+  CCTK_REAL4_VEC const xelt2=(x);                               \
+  vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2));                      \
+})
+#define vec4_elt3(x)                                            \
+({                                                              \
+  CCTK_REAL4_VEC const xelt3=(x);                               \
+  vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,3,3,3)));  \
+})
+#define vec4_elt(x,d) \
+({ \
+ CCTK_REAL4_VEC const xelt=(x); \
+ CCTK_REAL4 aelt; \
+ switch (d) { \
+ case 0: aelt=vec4_elt0(xelt); break; \
+ case 1: aelt=vec4_elt1(xelt); break; \
+ case 2: aelt=vec4_elt2(xelt); break; \
+ case 3: aelt=vec4_elt3(xelt); break; \
+ } \
+ aelt; \
+})
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec4_load(p) (_mm_load_ps(&(p)))
+#define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
+#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec4_store(p,x) (_mm_store_ps(&(p),x))
+#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
+#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec4_store_nta_partial_lo(p,x,n)              \
+({                                                    \
+  switch (n) {                                        \
+  case 3: (&(p))[2]=vec4_elt2(x); /* fall through */  \
+  case 2: _mm_storel_pi((__m64*)&(p),x); break;       \
+  case 1: (&(p))[0]=vec4_elt0(x);                     \
+  }                                                   \
+})
+#define vec4_store_nta_partial_hi(p,x,n)              \
+({                                                    \
+  switch (n) {                                        \
+  case 3: (&(p))[1]=vec4_elt1(x); /* fall through */  \
+  case 2: _mm_storeh_pi((__m64*)(&(p)+2),x); break;   \
+  case 1: (&(p))[3]=vec4_elt3(x);                     \
+  }                                                   \
+})
+
+
+
+// Functions and operators
+
+static const union {
+ unsigned i[4];
+ __m128 v;
+} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
+#define k4sign_mask (k4sign_mask_union.v)
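+
+// The sign-bit trick: an IEEE float is negated by flipping its sign
+// bit, so k4neg XORs with the mask, k4fabs clears it, and k4fnabs
+// sets it; no floating-point arithmetic is involved.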
+
+// Operators
+#define k4pos(x) (x)
+#define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+
+#define k4add(x,y) (_mm_add_ps(x,y))
+#define k4sub(x,y) (_mm_sub_ps(x,y))
+#define k4mul(x,y) (_mm_mul_ps(x,y))
+#define k4div(x,y) (_mm_div_ps(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (k4add(k4mul(x,y),z))
+#define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
+#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
+#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+
+// Cheap functions
+#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x)) // (~mask)&x clears the sign bit
+#define k4fmax(x,y) (_mm_max_ps(x,y))
+#define k4fmin(x,y) (_mm_min_ps(x,y))
+#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+#define k4sqrt(x) (_mm_sqrt_ps(x))
+
+// Expensive functions: fall back to one scalar libm call per element
+#define k4exp(x)                                                 \
+({                                                               \
+  CCTK_REAL4_VEC const xexp=(x);                                 \
+  vec4_set(expf(vec4_elt0(xexp)), expf(vec4_elt1(xexp)),         \
+           expf(vec4_elt2(xexp)), expf(vec4_elt3(xexp)));        \
+})
+#define k4log(x)                                                 \
+({                                                               \
+  CCTK_REAL4_VEC const xlog=(x);                                 \
+  vec4_set(logf(vec4_elt0(xlog)), logf(vec4_elt1(xlog)),         \
+           logf(vec4_elt2(xlog)), logf(vec4_elt3(xlog)));        \
+})
+#define k4pow(x,a)                                                   \
+({                                                                   \
+  CCTK_REAL4_VEC const xpow=(x);                                     \
+  CCTK_REAL4 const apow=(a);                                         \
+  vec4_set(powf(vec4_elt0(xpow),apow), powf(vec4_elt1(xpow),apow),   \
+           powf(vec4_elt2(xpow),apow), powf(vec4_elt3(xpow),apow));  \
+})
diff --git a/src/vectors-intel-8.h b/src/vectors-intel-8.h
new file mode 100644
index 0000000..a9e4764
--- /dev/null
+++ b/src/vectors-intel-8.h
@@ -0,0 +1,116 @@
+// Vectorise using Intel's or AMD's SSE2
+
+// Use the type __m128d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <math.h>
+#include <emmintrin.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC __m128d
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 2
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (_mm_set1_pd(a))
+#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
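+// (_mm_set_pd takes the high element first, hence the reversal:
+// vec8_set(1,2) has element 0 equal to 1, matching memory order.)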
+
+#define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
+#define vec8_elt1(x) \
+({ \
+ CCTK_REAL8_VEC const xelt1=(x); \
+ vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \
+})
+#define vec8_elt(x,d) \
+({ \
+ CCTK_REAL8_VEC const xelt=(x); \
+ CCTK_REAL8 aelt; \
+ switch (d) { \
+ case 0: aelt=vec8_elt0(xelt); break; \
+ case 1: aelt=vec8_elt1(xelt); break; \
+ } \
+ aelt; \
+})
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (_mm_load_pd(&(p)))
+#define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
+#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+#define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
+
+
+
+// Functions and operators
+
+static const union {
+ unsigned long long i[2];
+ __m128d v;
+} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }};
+#define k8sign_mask (k8sign_mask_union.v)
+
+// Operators
+#define k8pos(x) (x)
+#define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
+
+#define k8add(x,y) (_mm_add_pd(x,y))
+#define k8sub(x,y) (_mm_sub_pd(x,y))
+#define k8mul(x,y) (_mm_mul_pd(x,y))
+#define k8div(x,y) (_mm_div_pd(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+#define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+
+// Cheap functions
+#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x)) // (~mask)&x clears the sign bit
+#define k8fmax(x,y) (_mm_max_pd(x,y))
+#define k8fmin(x,y) (_mm_min_pd(x,y))
+#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask))
+#define k8sqrt(x) (_mm_sqrt_pd(x))
+
+// Expensive functions
+#define k8exp(x) \
+({ \
+ CCTK_REAL8_VEC const xexp=(x); \
+ vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
+})
+#define k8log(x) \
+({ \
+ CCTK_REAL8_VEC const xlog=(x); \
+ vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
+})
+#define k8pow(x,a) \
+({ \
+ CCTK_REAL8_VEC const xpow=(x); \
+ CCTK_REAL8 const apow=(a); \
+ vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
+})
diff --git a/src/vectors-power-4.h b/src/vectors-power-4.h
new file mode 100644
index 0000000..009b0f4
--- /dev/null
+++ b/src/vectors-power-4.h
@@ -0,0 +1,128 @@
+// Vectorise using IBM's Altivec (Power)
+
+// Use the type vector float directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <math.h>
+#include <altivec.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL4_VEC vector float
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 4
+
+
+
+// Create vectors, extract vector elements
+
+#define vec4_set1(a) (vec_splats(a))
+#define vec4_set(a,b,c,d) \
+({ \
+ CCTK_REAL4_VEC x; \
+ x[0]=(a); \
+ x[1]=(b); \
+ x[2]=(c); \
+ x[3]=(d); \
+ x; \
+})
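+// (The statement expression and the x[i] element access above are GNU
+// extensions; this header assumes a compiler that supports subscripting
+// vector types.)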
+
+#define vec4_elt0(x) ((x)[0])
+#define vec4_elt1(x) ((x)[1])
+#define vec4_elt2(x) ((x)[2])
+#define vec4_elt3(x) ((x)[3])
+#define vec4_elt(x,d) ((x)[d])
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec4_load(p) (*(CCTK_REAL4_VEC const*)&(p))
+#define vec4_loadu(p) (*(CCTK_REAL4_VEC const*)&(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size
+#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
+#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec4_store(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+#define vec4_storeu(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec4_store_nta(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec4_store_nta_partial_lo(p,x,n) \
+({ \
+ switch (n) { \
+ case 3: ((&(p))[2]=(x)[2]); \
+ case 2: ((&(p))[1]=(x)[1]); \
+ case 1: ((&(p))[0]=(x)[0]); \
+ } \
+})
+#define vec4_store_nta_partial_hi(p,x,n) \
+({ \
+ switch (n) { \
+ case 3: ((&(p))[1]=(x)[1]); \
+ case 2: ((&(p))[2]=(x)[2]); \
+ case 1: ((&(p))[3]=(x)[3]); \
+ } \
+})
+
+
+
+// Functions and operators
+
+// Operators
+#define k4pos(x) (+(x))
+#define k4neg(x) (-(x))
+
+#define k4add(x,y) ((x)+(y))
+#define k4sub(x,y) ((x)-(y))
+#define k4mul(x,y) ((x)*(y))
+#define k4div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (vec_madd(x,y,z))
+#define k4msub(x,y,z) (vec_msub(x,y,z))
+#define k4nmadd(x,y,z) (vec_nmadd(x,y,z))
+#define k4nmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#define k4fabs(x) (vec_abs(x))
+#define k4fmax(x,y) (vec_max(x,y))
+#define k4fmin(x,y) (vec_min(x,y))
+#define k4fnabs(x) (vec_nabs(x))
+
+// Expensive functions: fall back to one scalar libm call per element
+#define k4exp(x)                                                 \
+({                                                               \
+  CCTK_REAL4_VEC const xexp=(x);                                 \
+  vec4_set(expf(vec4_elt0(xexp)), expf(vec4_elt1(xexp)),         \
+           expf(vec4_elt2(xexp)), expf(vec4_elt3(xexp)));        \
+})
+#define k4log(x)                                                 \
+({                                                               \
+  CCTK_REAL4_VEC const xlog=(x);                                 \
+  vec4_set(logf(vec4_elt0(xlog)), logf(vec4_elt1(xlog)),         \
+           logf(vec4_elt2(xlog)), logf(vec4_elt3(xlog)));        \
+})
+#define k4pow(x,a)                                                   \
+({                                                                   \
+  CCTK_REAL4_VEC const xpow=(x);                                     \
+  CCTK_REAL4 const apow=(a);                                         \
+  vec4_set(powf(vec4_elt0(xpow),apow), powf(vec4_elt1(xpow),apow),   \
+           powf(vec4_elt2(xpow),apow), powf(vec4_elt3(xpow),apow));  \
+})
+#define k4sqrt(x)                                                \
+({                                                               \
+  CCTK_REAL4_VEC const xsqrt=(x);                                \
+  vec4_set(sqrtf(vec4_elt0(xsqrt)), sqrtf(vec4_elt1(xsqrt)),     \
+           sqrtf(vec4_elt2(xsqrt)), sqrtf(vec4_elt3(xsqrt)));    \
+})
diff --git a/src/vectors-power-8.h b/src/vectors-power-8.h
new file mode 100644
index 0000000..8313168
--- /dev/null
+++ b/src/vectors-power-8.h
@@ -0,0 +1,106 @@
+// Vectorise using IBM's Altivec VSX (Power)
+
+// Use the type vector double directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <math.h>
+#include <altivec.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC vector double
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 2
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (vec_splats(a))
+#define vec8_set(a,b) \
+({ \
+ CCTK_REAL8_VEC x; \
+ x[0]=(a); \
+ x[1]=(b); \
+ x; \
+})
+
+#define vec8_elt0(x) ((x)[0])
+#define vec8_elt1(x) ((x)[1])
+#define vec8_elt(x,d) ((x)[d])
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (*(CCTK_REAL8_VEC const*)&(p))
+#define vec8_loadu(p) (*(CCTK_REAL8_VEC const*)&(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size
+#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
+#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
+#define vec8_storeu(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec8_store_nta(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=(x)[0])
+#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=(x)[1])
+
+
+
+// Functions and operators
+
+// Operators
+#define k8pos(x) (+(x))
+#define k8neg(x) (-(x))
+
+#define k8add(x,y) ((x)+(y))
+#define k8sub(x,y) ((x)-(y))
+#define k8mul(x,y) ((x)*(y))
+#define k8div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (vec_madd(x,y,z))
+#define k8msub(x,y,z) (vec_msub(x,y,z))
+#define k8nmadd(x,y,z) (vec_nmadd(x,y,z))
+#define k8nmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#define k8fabs(x) (vec_abs(x))
+#define k8fmax(x,y) (vec_max(x,y))
+#define k8fmin(x,y) (vec_min(x,y))
+#define k8fnabs(x) (vec_nabs(x))
+
+#define k8exp(x) \
+({ \
+ CCTK_REAL8_VEC const xexp=(x); \
+ vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
+})
+#define k8log(x) \
+({ \
+ CCTK_REAL8_VEC const xlog=(x); \
+ vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
+})
+#define k8pow(x,a) \
+({ \
+ CCTK_REAL8_VEC const xpow=(x); \
+ CCTK_REAL8 const apow=(a); \
+ vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
+})
+#define k8sqrt(x) \
+({ \
+ CCTK_REAL8_VEC const xsqrt=(x); \
+ vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \
+})
diff --git a/src/vectors.h b/src/vectors.h
new file mode 100644
index 0000000..6fe909f
--- /dev/null
+++ b/src/vectors.h
@@ -0,0 +1,284 @@
+#ifndef VECTORS_H
+#define VECTORS_H
+
+#include <cctk.h>
+
+
+
+#if defined(KRANC_VECTORS)
+
+# if defined(__SSE__) // Intel SSE vector instructions
+# include "vectors-intel-4.h"
+# elif defined(__ALTIVEC__) // Altivec (Power)
+# include "vectors-power-4.h"
+# endif
+
+# if defined(__SSE2__) // Intel SSE2 vector instructions
+# include "vectors-intel-8.h"
+# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power)
+# include "vectors-power-8.h"
+# endif
+
+#endif
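+
+// (Vectorisation is opt-in: the architecture-specific headers above
+// are only used when the code is compiled with KRANC_VECTORS defined,
+// e.g. via -DKRANC_VECTORS; otherwise the scalar fallback below is
+// selected.)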
+
+// Default implementation, do not vectorise
+#if ! defined(CCTK_REAL4_VEC_SIZE)
+# include "vectors-default-4.h"
+#endif
+#if ! defined(CCTK_REAL8_VEC_SIZE)
+# include "vectors-default-8.h"
+#endif
+
+
+
+// Define macros for CCTK_REAL
+
+#if defined(CCTK_REAL_PRECISION_4)
+
+# define CCTK_REAL_VEC CCTK_REAL4_VEC
+# define CCTK_REAL_VEC_SIZE CCTK_REAL4_VEC_SIZE
+
+# define vec_set1 vec4_set1
+# define vec_set vec4_set
+
+# define vec_elt0 vec4_elt0
+# define vec_elt vec4_elt
+
+# define vec_load vec4_load
+# define vec_loadu vec4_loadu
+# define vec_loadu_maybe vec4_loadu_maybe
+# define vec_loadu_maybe3 vec4_loadu_maybe3
+# define vec_store vec4_store
+# define vec_store_nta vec4_store_nta
+# define vec_store_nta_partial_lo vec4_store_nta_partial_lo
+# define vec_store_nta_partial_hi vec4_store_nta_partial_hi
+
+# define kpos k4pos
+# define kneg k4neg
+
+# define kadd k4add
+# define ksub k4sub
+# define kmul k4mul
+# define kdiv k4div
+
+# define kmadd k4madd
+# define kmsub k4msub
+# define knmadd k4nmadd
+# define knmsub k4nmsub
+
+# define kexp k4exp
+# define kfabs k4fabs
+# define kfmax k4fmax
+# define kfmin k4fmin
+# define kfnabs k4fnabs
+# define klog k4log
+# define kpow k4pow
+# define ksqrt k4sqrt
+
+#elif defined(CCTK_REAL_PRECISION_8)
+
+# define CCTK_REAL_VEC CCTK_REAL8_VEC
+# define CCTK_REAL_VEC_SIZE CCTK_REAL8_VEC_SIZE
+
+# define vec_set1 vec8_set1
+# define vec_set vec8_set
+
+# define vec_elt0 vec8_elt0
+# define vec_elt vec8_elt
+
+# define vec_load vec8_load
+# define vec_loadu vec8_loadu
+# define vec_loadu_maybe vec8_loadu_maybe
+# define vec_loadu_maybe3 vec8_loadu_maybe3
+# define vec_store vec8_store
+# define vec_store_nta vec8_store_nta
+# define vec_store_nta_partial_lo vec8_store_nta_partial_lo
+# define vec_store_nta_partial_hi vec8_store_nta_partial_hi
+
+# define kpos k8pos
+# define kneg k8neg
+
+# define kadd k8add
+# define ksub k8sub
+# define kmul k8mul
+# define kdiv k8div
+
+# define kmadd k8madd
+# define kmsub k8msub
+# define knmadd k8nmadd
+# define knmsub k8nmsub
+
+# define kexp k8exp
+# define kfabs k8fabs
+# define kfmax k8fmax
+# define kfmin k8fmin
+# define kfnabs k8fnabs
+# define klog k8log
+# define kpow k8pow
+# define ksqrt k8sqrt
+
+#else
+
+# error "Unknown CCTK_REAL_PRECISION"
+
+#endif
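+
+// A usage sketch (hypothetical names: a is the input array, b the
+// output, n the number of points, a multiple of the vector size): a
+// precision-independent kernel processes CCTK_REAL_VEC_SIZE points
+// per iteration, e.g.
+//   for (int i=0; i<n; i+=CCTK_REAL_VEC_SIZE) {
+//     CCTK_REAL_VEC const x = vec_load(a[i]);
+//     vec_store_nta(b[i], kmadd(x, x, vec_set1(1.0)));   // b = a*a+1
+//   }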
+
+
+
+// Define a class template for easier access from C++
+
+#ifdef __cplusplus
+
+template<typename T>
+struct vecprops {
+ typedef T scalar_t;
+ typedef T vector_t;
+ static inline int size()
+ {
+ return 1;
+ }
+ static inline vector_t load (scalar_t const& a)
+ {
+ return a;
+ }
+ static inline vector_t loadu (scalar_t const& a)
+ {
+ return a;
+ }
+ static inline scalar_t elt (vector_t const& x, int const d)
+ {
+ return x;
+ }
+ static inline vector_t pos (vector_t const& x)
+ {
+ return +x;
+ }
+ static inline vector_t neg (vector_t const& x)
+ {
+ return -x;
+ }
+ static inline vector_t add (vector_t const& x, vector_t const& y)
+ {
+ return x+y;
+ }
+ static inline vector_t sub (vector_t const& x, vector_t const& y)
+ {
+ return x-y;
+ }
+ static inline vector_t mul (vector_t const& x, vector_t const& y)
+ {
+ return x*y;
+ }
+ static inline vector_t div (vector_t const& x, vector_t const& y)
+ {
+ return x/y;
+ }
+};
+
+template<>
+struct vecprops<CCTK_REAL4> {
+ typedef CCTK_REAL4 scalar_t;
+ typedef CCTK_REAL4_VEC vector_t;
+ static inline int size()
+ {
+ return CCTK_REAL4_VEC_SIZE;
+ }
+ static inline vector_t load (scalar_t const& a)
+ {
+ return vec4_load(a);
+ }
+ static inline vector_t loadu (scalar_t const& a)
+ {
+ return vec4_loadu(a);
+ }
+ static inline scalar_t elt (vector_t const& x, int const d)
+ {
+ return vec4_elt(x,d);
+ }
+ static inline vector_t pos (vector_t const& x)
+ {
+ return k4pos(x);
+ }
+ static inline vector_t neg (vector_t const& x)
+ {
+ return k4neg(x);
+ }
+ static inline vector_t add (vector_t const& x, vector_t const& y)
+ {
+ return k4add(x,y);
+ }
+ static inline vector_t sub (vector_t const& x, vector_t const& y)
+ {
+ return k4sub(x,y);
+ }
+ static inline vector_t mul (vector_t const& x, vector_t const& y)
+ {
+ return k4mul(x,y);
+ }
+ static inline vector_t div (vector_t const& x, vector_t const& y)
+ {
+ return k4div(x,y);
+ }
+};
+
+template<>
+struct vecprops<CCTK_REAL8> {
+ typedef CCTK_REAL8 scalar_t;
+ typedef CCTK_REAL8_VEC vector_t;
+ static inline int size()
+ {
+ return CCTK_REAL8_VEC_SIZE;
+ }
+ static inline vector_t load (scalar_t const& a)
+ {
+ return vec8_load(a);
+ }
+ static inline vector_t loadu (scalar_t const& a)
+ {
+ return vec8_loadu(a);
+ }
+ static inline scalar_t elt (vector_t const& x, int const d)
+ {
+ return vec8_elt(x,d);
+ }
+ static inline vector_t pos (vector_t const& x)
+ {
+ return k8pos(x);
+ }
+ static inline vector_t neg (vector_t const& x)
+ {
+ return k8neg(x);
+ }
+ static inline vector_t add (vector_t const& x, vector_t const& y)
+ {
+ return k8add(x,y);
+ }
+ static inline vector_t sub (vector_t const& x, vector_t const& y)
+ {
+ return k8sub(x,y);
+ }
+ static inline vector_t mul (vector_t const& x, vector_t const& y)
+ {
+ return k8mul(x,y);
+ }
+ static inline vector_t div (vector_t const& x, vector_t const& y)
+ {
+ return k8div(x,y);
+ }
+};
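+
+// A usage sketch (hypothetical function, T = CCTK_REAL4 or CCTK_REAL8):
+// code templated over the precision can use vecprops instead of the
+// preprocessor macros above, e.g.
+//   template<typename T>
+//   typename vecprops<T>::vector_t
+//   vmul (T const& x, T const& y)
+//   {
+//     typedef vecprops<T> vp;
+//     // load vp::size() elements starting at &x and &y, then multiply
+//     return vp::mul (vp::loadu(x), vp::loadu(y));
+//   }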
+
+#endif
+
+
+
+// For Kranc
+
+#undef ToReal
+#define ToReal(x) (vec_set1((CCTK_REAL)(x)))
+
+#undef Sign
+#define Sign(x) -999999999 // poison value: Sign cannot be vectorised, so any use should fail visibly
+
+
+
+#endif // #ifndef VECTORS_H