From 421103bf1df43f250a452df460553f72c824f8db Mon Sep 17 00:00:00 2001
From: eschnett
Date: Fri, 3 Dec 2010 14:51:56 +0000
Subject: Add implementation

git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@4 105869f7-3296-0410-a4ea-f4349344b45a
---
 src/make.code.defn      |   7 ++
 src/vectors-default-4.h |  79 ++++++++++++++
 src/vectors-default-8.h |  79 ++++++++++++++
 src/vectors-intel-4.h   | 155 ++++++++++++++++++++++++++
 src/vectors-intel-8.h   | 116 ++++++++++++++++++++
 src/vectors-power-4.h   | 128 ++++++++++++++++++++++
 src/vectors-power-8.h   | 106 ++++++++++++++++++
 src/vectors.h           | 284 ++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 954 insertions(+)
 create mode 100644 src/make.code.defn
 create mode 100644 src/vectors-default-4.h
 create mode 100644 src/vectors-default-8.h
 create mode 100644 src/vectors-intel-4.h
 create mode 100644 src/vectors-intel-8.h
 create mode 100644 src/vectors-power-4.h
 create mode 100644 src/vectors-power-8.h
 create mode 100644 src/vectors.h

diff --git a/src/make.code.defn b/src/make.code.defn
new file mode 100644
index 0000000..fa10ccb
--- /dev/null
+++ b/src/make.code.defn
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn Vectors
+
+# Source files in this directory
+SRCS =
+
+# Subdirectories containing source files
+SUBDIRS =
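Both lists are deliberately empty: the thorn ships only header files. For comparison, a hypothetical thorn with compiled sources would fill them in like this (illustrative names, not part of this commit):

    # Source files in this directory
    SRCS = vectors.c

    # Subdirectories containing source files
    SUBDIRS = test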
diff --git a/src/vectors-default-4.h b/src/vectors-default-4.h
new file mode 100644
index 0000000..e20109d
--- /dev/null
+++ b/src/vectors-default-4.h
@@ -0,0 +1,79 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions. This should also make debug builds
+// (which may not inline) more efficient.
+
+
+
+// Use CCTK_REAL4
+#define CCTK_REAL4_VEC CCTK_REAL4
+
+// Number of vector elements in a vector
+#define CCTK_REAL4_VEC_SIZE 1
+
+
+
+// Create a vector replicating a scalar
+#define vec4_set1(a) (a)
+// Create a vector from N scalars
+#define vec4_set(a) (a)
+
+// Access vector elements
+#define vec4_elt0(x) (x)
+#define vec4_elt(x,d) (x)
+
+
+
+// Load an aligned vector from memory
+#define vec4_load(p) (p)
+// Load an unaligned vector from memory
+#define vec4_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size. These functions are
+// useful e.g. for loading neighbouring grid points while evaluating
+// finite differencing stencils.
+#define vec4_loadu_maybe(off,p) (p)
+#define vec4_loadu_maybe3(off1,off2,off3,p) (p)
+
+// Aligned store
+#define vec4_store(p,x) ((p)=(x))
+// Unaligned store
+#define vec4_store_nta(p,x) ((p)=(x))
+
+// Store the n lower elements of a vector to memory
+#define vec4_store_nta_partial_lo(p,x,n) (assert(0))
+// Store the n higher elements of a vector into memory. This stores
+// the vector elements into memory locations as if element 0 were
+// stored at p.
+#define vec4_store_nta_partial_hi(p,x,n) (assert(0))
+
+
+
+// Operators
+#define k4pos(x) (+(x))
+#define k4neg(x) (-(x))
+
+#define k4add(x,y) ((x)+(y))
+#define k4sub(x,y) ((x)-(y))
+#define k4mul(x,y) ((x)*(y))
+#define k4div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (+(x)*(y)+(z))
+#define k4msub(x,y,z) (+(x)*(y)-(z))
+#define k4nmadd(x,y,z) (-(x)*(y)-(z))
+#define k4nmsub(x,y,z) (-(x)*(y)+(z))
+
+// Functions
+#define k4exp(x) (expf(x))
+#define k4fabs(x) (fabsf(x))
+#define k4fmax(x,y) (fmaxf(x,y))
+#define k4fmin(x,y) (fminf(x,y))
+#define k4fnabs(x) (-fabsf(x))
+#define k4log(x) (logf(x))
+#define k4pow(x,a) (powf(x,a))
+#define k4sqrt(x) (sqrtf(x))
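Since CCTK_REAL4_VEC_SIZE is 1, kernels written against these macros reduce to plain scalar code. A minimal sketch of how the macros are meant to compose (hypothetical kernel, not from this patch; assumes CCTK_REAL4 is float):

    typedef float CCTK_REAL4;
    #include "vectors-default-4.h"

    // y[i] = a*x[i] + y[i], phrased as if vectorised
    static void axpy4(CCTK_REAL4 a, CCTK_REAL4 const* x, CCTK_REAL4* y, int n)
    {
      for (int i=0; i<n; i+=CCTK_REAL4_VEC_SIZE) {
        CCTK_REAL4_VEC const xv = vec4_load(x[i]);
        CCTK_REAL4_VEC const yv = vec4_load(y[i]);
        vec4_store(y[i], k4madd(vec4_set1(a), xv, yv));
      }
    }

With a real vector implementation the same source advances CCTK_REAL4_VEC_SIZE points per iteration.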
diff --git a/src/vectors-default-8.h b/src/vectors-default-8.h
new file mode 100644
index 0000000..8ea3ac8
--- /dev/null
+++ b/src/vectors-default-8.h
@@ -0,0 +1,79 @@
+// Fallback vectorisation implementation: Do not vectorise
+
+
+
+// We use macros here, so that we are not surprised by compilers which
+// don't like to inline functions. This should also make debug builds
+// (which may not inline) more efficient.
+
+
+
+// Use CCTK_REAL8
+#define CCTK_REAL8_VEC CCTK_REAL8
+
+// Number of vector elements in a vector
+#define CCTK_REAL8_VEC_SIZE 1
+
+
+
+// Create a vector replicating a scalar
+#define vec8_set1(a) (a)
+// Create a vector from N scalars
+#define vec8_set(a) (a)
+
+// Access vector elements
+#define vec8_elt0(x) (x)
+#define vec8_elt(x,d) (x)
+
+
+
+// Load an aligned vector from memory
+#define vec8_load(p) (p)
+// Load an unaligned vector from memory
+#define vec8_loadu(p) (p)
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size. These functions are
+// useful e.g. for loading neighbouring grid points while evaluating
+// finite differencing stencils.
+#define vec8_loadu_maybe(off,p) (p)
+#define vec8_loadu_maybe3(off1,off2,off3,p) (p)
+
+// Aligned store
+#define vec8_store(p,x) ((p)=(x))
+// Unaligned store
+#define vec8_store_nta(p,x) ((p)=(x))
+
+// Store the n lower elements of a vector to memory
+#define vec8_store_nta_partial_lo(p,x,n) (assert(0))
+// Store the n higher elements of a vector into memory. This stores
+// the vector elements into memory locations as if element 0 were
+// stored at p.
+#define vec8_store_nta_partial_hi(p,x,n) (assert(0))
+
+
+
+// Operators
+#define k8pos(x) (+(x))
+#define k8neg(x) (-(x))
+
+#define k8add(x,y) ((x)+(y))
+#define k8sub(x,y) ((x)-(y))
+#define k8mul(x,y) ((x)*(y))
+#define k8div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (+(x)*(y)+(z))
+#define k8msub(x,y,z) (+(x)*(y)-(z))
+#define k8nmadd(x,y,z) (-(x)*(y)-(z))
+#define k8nmsub(x,y,z) (-(x)*(y)+(z))
+
+// Functions
+#define k8exp(x) (exp(x))
+#define k8fabs(x) (fabs(x))
+#define k8fmax(x,y) (fmax(x,y))
+#define k8fmin(x,y) (fmin(x,y))
+#define k8fnabs(x) (-fabs(x))
+#define k8log(x) (log(x))
+#define k8pow(x,a) (pow(x,a))
+#define k8sqrt(x) (sqrt(x))
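The offsets passed to vec8_loadu_maybe3 are the constant displacements of the load from the current loop index in each grid direction; a real implementation can use them to prove that an access is still aligned. A sketch of the intended use in a centred first derivative (hypothetical helper; assumes CCTK_REAL8 is double):

    typedef double CCTK_REAL8;
    #include "vectors-default-8.h"

    // du/dx at point i; idx2 holds the precomputed 1/(2*dx)
    static CCTK_REAL8_VEC diffx(CCTK_REAL8 const* u, int i, CCTK_REAL8 idx2)
    {
      CCTK_REAL8_VEC const up = vec8_loadu_maybe3(+1,0,0, u[i+1]);
      CCTK_REAL8_VEC const um = vec8_loadu_maybe3(-1,0,0, u[i-1]);
      return k8mul(vec8_set1(idx2), k8sub(up, um));
    }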
diff --git a/src/vectors-intel-4.h b/src/vectors-intel-4.h
new file mode 100644
index 0000000..4549a70
--- /dev/null
+++ b/src/vectors-intel-4.h
@@ -0,0 +1,155 @@
+// Vectorise using Intel's or AMD's SSE
+
+// Use the type __m128 directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <xmmintrin.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL4_VEC __m128
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 4
+
+
+
+// Create vectors, extract vector elements
+
+#define vec4_set1(a) (_mm_set1_ps(a))
+#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
+
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtss_f32 does not exist on PGI compilers
+# define vec4_elt0(x)                           \
+({                                              \
+  CCTK_REAL4 aelt0;                             \
+  asm ("" : "=x" (aelt0) : "0" (x));            \
+  aelt0;                                        \
+})
+#else
+# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
+#endif
+#define vec4_elt1(x)                                            \
+({                                                              \
+  CCTK_REAL4_VEC const xelt1=(x);                               \
+  vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(2,3,0,1)));  \
+})
+#define vec4_elt2(x)                                    \
+({                                                      \
+  CCTK_REAL4_VEC const xelt2=(x);                       \
+  vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2));              \
+})
+#define vec4_elt3(x)                                            \
+({                                                              \
+  CCTK_REAL4_VEC const xelt3=(x);                               \
+  vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(0,1,2,3)));  \
+})
+#define vec4_elt(x,d)                           \
+({                                              \
+  CCTK_REAL4_VEC const xelt=(x);                \
+  CCTK_REAL4 aelt;                              \
+  switch (d) {                                  \
+  case 0: aelt=vec4_elt0(xelt); break;          \
+  case 1: aelt=vec4_elt1(xelt); break;          \
+  case 2: aelt=vec4_elt2(xelt); break;          \
+  case 3: aelt=vec4_elt3(xelt); break;          \
+  }                                             \
+  aelt;                                         \
+})
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec4_load(p) (_mm_load_ps(&(p)))
+#define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
+#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec4_store(p,x) (_mm_store_ps(&(p),x))
+#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
+#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec4_store_nta_partial_lo(p,x,n)                \
+({                                                      \
+  switch (n) {                                          \
+  case 3: (&(p))[2]=vec4_elt2(x);                       \
+  case 2: _mm_storel_pi((__m64*)&(p),x); break;         \
+  case 1: (&(p))[0]=vec4_elt0(x);                       \
+  }                                                     \
+})
+#define vec4_store_nta_partial_hi(p,x,n)                \
+({                                                      \
+  switch (n) {                                          \
+  case 3: (&(p))[1]=vec4_elt1(x);                       \
+  case 2: _mm_storeh_pi((__m64*)(&(p)+2),x); break;     \
+  case 1: (&(p))[3]=vec4_elt3(x);                       \
+  }                                                     \
+})
+
+
+
+// Functions and operators
+
+static const union {
+  unsigned i[4];
+  __m128 v;
+} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
+#define k4sign_mask (k4sign_mask_union.v)
+
+// Operators
+#define k4pos(x) (x)
+#define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+
+#define k4add(x,y) (_mm_add_ps(x,y))
+#define k4sub(x,y) (_mm_sub_ps(x,y))
+#define k4mul(x,y) (_mm_mul_ps(x,y))
+#define k4div(x,y) (_mm_div_ps(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (k4add(k4mul(x,y),z))
+#define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
+#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
+#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+
+// Cheap functions
+#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x))
+#define k4fmax(x,y) (_mm_max_ps(x,y))
+#define k4fmin(x,y) (_mm_min_ps(x,y))
+#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+#define k4sqrt(x) (_mm_sqrt_ps(x))
+
+// Expensive functions
+#define k4exp(x)                                                \
+({                                                              \
+  CCTK_REAL4_VEC const xexp=(x);                                \
+  vec4_set(expf(vec4_elt0(xexp)), expf(vec4_elt1(xexp)),        \
+           expf(vec4_elt2(xexp)), expf(vec4_elt3(xexp)));       \
+})
+#define k4log(x)                                                \
+({                                                              \
+  CCTK_REAL4_VEC const xlog=(x);                                \
+  vec4_set(logf(vec4_elt0(xlog)), logf(vec4_elt1(xlog)),        \
+           logf(vec4_elt2(xlog)), logf(vec4_elt3(xlog)));       \
+})
+#define k4pow(x,a)                                                      \
+({                                                                      \
+  CCTK_REAL4_VEC const xpow=(x);                                        \
+  CCTK_REAL4 const apow=(a);                                            \
+  vec4_set(powf(vec4_elt0(xpow),apow), powf(vec4_elt1(xpow),apow),      \
+           powf(vec4_elt2(xpow),apow), powf(vec4_elt3(xpow),apow));     \
+})
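A quick standalone check of the partial-store semantics (a sketch, assuming GCC, whose statement expressions the macros above already require): storing the upper three elements must leave p[0] untouched and place elements 1 to 3 at p[1] to p[3].

    #include <stdio.h>
    typedef float CCTK_REAL4;
    #include "vectors-intel-4.h"

    int main(void)
    {
      float out[4] __attribute__((aligned(16))) = { -1.0f, -1.0f, -1.0f, -1.0f };
      __m128 const v = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); // elements 0..3
      vec4_store_nta_partial_hi(out[0], v, 3);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // -1 1 2 3
      return 0;
    }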
diff --git a/src/vectors-intel-8.h b/src/vectors-intel-8.h
new file mode 100644
index 0000000..a9e4764
--- /dev/null
+++ b/src/vectors-intel-8.h
@@ -0,0 +1,116 @@
+// Vectorise using Intel's or AMD's SSE2
+
+// Use the type __m128d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <emmintrin.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC __m128d
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 2
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (_mm_set1_pd(a))
+#define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
+
+#define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
+#define vec8_elt1(x)                            \
+({                                              \
+  CCTK_REAL8_VEC const xelt1=(x);               \
+  vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1));      \
+})
+#define vec8_elt(x,d)                           \
+({                                              \
+  CCTK_REAL8_VEC const xelt=(x);                \
+  CCTK_REAL8 aelt;                              \
+  switch (d) {                                  \
+  case 0: aelt=vec8_elt0(xelt); break;          \
+  case 1: aelt=vec8_elt1(xelt); break;          \
+  }                                             \
+  aelt;                                         \
+})
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (_mm_load_pd(&(p)))
+#define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
+#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
+#define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
+
+
+
+// Functions and operators
+
+static const union {
+  unsigned long long i[2];
+  __m128d v;
+} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }};
+#define k8sign_mask (k8sign_mask_union.v)
+
+// Operators
+#define k8pos(x) (x)
+#define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
+
+#define k8add(x,y) (_mm_add_pd(x,y))
+#define k8sub(x,y) (_mm_sub_pd(x,y))
+#define k8mul(x,y) (_mm_mul_pd(x,y))
+#define k8div(x,y) (_mm_div_pd(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+#define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+#define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+
+// Cheap functions
+#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x))
+#define k8fmax(x,y) (_mm_max_pd(x,y))
+#define k8fmin(x,y) (_mm_min_pd(x,y))
+#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask))
+#define k8sqrt(x) (_mm_sqrt_pd(x))
+
+// Expensive functions
+#define k8exp(x)                                                \
+({                                                              \
+  CCTK_REAL8_VEC const xexp=(x);                                \
+  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp)));         \
+})
+#define k8log(x)                                                \
+({                                                              \
+  CCTK_REAL8_VEC const xlog=(x);                                \
+  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog)));         \
+})
+#define k8pow(x,a)                                                      \
+({                                                                      \
+  CCTK_REAL8_VEC const xpow=(x);                                        \
+  CCTK_REAL8 const apow=(a);                                            \
+  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow));       \
+})
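The sign-mask idiom depends on argument order: _mm_andnot_pd(a,b) computes (~a) & b, so the mask must come first in k8fabs, while k8fnabs simply sets the sign bits with an OR. A two-line check (standalone sketch; assumes CCTK_REAL8 is double and GCC-style statement expressions, as above):

    #include <stdio.h>
    typedef double CCTK_REAL8;
    #include "vectors-intel-8.h"

    int main(void)
    {
      CCTK_REAL8_VEC const x = vec8_set(-2.0, 3.0);
      printf("%g %g\n", vec8_elt0(k8fabs(x)),  vec8_elt1(k8fabs(x)));  // 2 3
      printf("%g %g\n", vec8_elt0(k8fnabs(x)), vec8_elt1(k8fnabs(x))); // -2 -3
      return 0;
    }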
diff --git a/src/vectors-power-4.h b/src/vectors-power-4.h
new file mode 100644
index 0000000..009b0f4
--- /dev/null
+++ b/src/vectors-power-4.h
@@ -0,0 +1,128 @@
+// Vectorise using IBM's Altivec (Power)
+
+// Use the type vector float directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <altivec.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL4_VEC vector float
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 4
+
+
+
+// Create vectors, extract vector elements
+
+#define vec4_set1(a) (vec_splats(a))
+#define vec4_set(a,b,c,d)                       \
+({                                              \
+  CCTK_REAL4_VEC x;                             \
+  x[0]=(a);                                     \
+  x[1]=(b);                                     \
+  x[2]=(c);                                     \
+  x[3]=(d);                                     \
+  x;                                            \
+})
+
+#define vec4_elt0(x) ((x)[0])
+#define vec4_elt1(x) ((x)[1])
+#define vec4_elt2(x) ((x)[2])
+#define vec4_elt3(x) ((x)[3])
+#define vec4_elt(x,d) ((x)[d])
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec4_load(p) (*(CCTK_REAL4_VEC const*)&(p))
+#define vec4_loadu(p) (*(CCTK_REAL4_VEC const*)&(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size
+#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
+#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec4_store(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+#define vec4_storeu(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec4_store_nta(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec4_store_nta_partial_lo(p,x,n)        \
+({                                              \
+  switch (n) {                                  \
+  case 3: ((&(p))[2]=(x)[2]);                   \
+  case 2: ((&(p))[1]=(x)[1]);                   \
+  case 1: ((&(p))[0]=(x)[0]);                   \
+  }                                             \
+})
+#define vec4_store_nta_partial_hi(p,x,n)        \
+({                                              \
+  switch (n) {                                  \
+  case 3: ((&(p))[1]=(x)[1]);                   \
+  case 2: ((&(p))[2]=(x)[2]);                   \
+  case 1: ((&(p))[3]=(x)[3]);                   \
+  }                                             \
+})
+
+
+
+// Functions and operators
+
+// Operators
+#define k4pos(x) (+(x))
+#define k4neg(x) (-(x))
+
+#define k4add(x,y) ((x)+(y))
+#define k4sub(x,y) ((x)-(y))
+#define k4mul(x,y) ((x)*(y))
+#define k4div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (vec_madd(x,y,z))
+#define k4msub(x,y,z) (vec_msub(x,y,z))
+#define k4nmadd(x,y,z) (vec_nmadd(x,y,z))
+#define k4nmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#define k4fabs(x) (vec_abs(x))
+#define k4fmax(x,y) (vec_max(x,y))
+#define k4fmin(x,y) (vec_min(x,y))
+#define k4fnabs(x) (vec_nabs(x))
+
+#define k4exp(x)                                                \
+({                                                              \
+  CCTK_REAL4_VEC const xexp=(x);                                \
+  vec4_set(expf(vec4_elt0(xexp)), expf(vec4_elt1(xexp)),        \
+           expf(vec4_elt2(xexp)), expf(vec4_elt3(xexp)));       \
+})
+#define k4log(x)                                                \
+({                                                              \
+  CCTK_REAL4_VEC const xlog=(x);                                \
+  vec4_set(logf(vec4_elt0(xlog)), logf(vec4_elt1(xlog)),        \
+           logf(vec4_elt2(xlog)), logf(vec4_elt3(xlog)));       \
+})
+#define k4pow(x,a)                                                      \
+({                                                                      \
+  CCTK_REAL4_VEC const xpow=(x);                                        \
+  CCTK_REAL4 const apow=(a);                                            \
+  vec4_set(powf(vec4_elt0(xpow),apow), powf(vec4_elt1(xpow),apow),      \
+           powf(vec4_elt2(xpow),apow), powf(vec4_elt3(xpow),apow));     \
+})
+#define k4sqrt(x)                                               \
+({                                                              \
+  CCTK_REAL4_VEC const xsqrt=(x);                               \
+  vec4_set(sqrtf(vec4_elt0(xsqrt)), sqrtf(vec4_elt1(xsqrt)),    \
+           sqrtf(vec4_elt2(xsqrt)), sqrtf(vec4_elt3(xsqrt)));   \
+})
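On this target the four fused variants map directly onto hardware operations with matching sign conventions, e.g. vec_nmsub(x,y,z) = -(x*y - z) = -x*y + z, which is exactly the knmsub contract. A one-operation sketch (assumes an Altivec-enabled compiler):

    #include <altivec.h>

    // r = x*y + z, one fused multiply-add per element
    vector float fma4(vector float x, vector float y, vector float z)
    {
      return vec_madd(x, y, z);
    }

Note that vec_msub, vec_nmadd, and vec_nabs are typically only available with VSX support; classic Altivec provides vec_madd and vec_nmsub for float.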
diff --git a/src/vectors-power-8.h b/src/vectors-power-8.h
new file mode 100644
index 0000000..8313168
--- /dev/null
+++ b/src/vectors-power-8.h
@@ -0,0 +1,106 @@
+// Vectorise using IBM's Altivec VSX (Power)
+
+// Use the type vector double directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <altivec.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC vector double
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 2
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (vec_splats(a))
+#define vec8_set(a,b)                           \
+({                                              \
+  CCTK_REAL8_VEC x;                             \
+  x[0]=(a);                                     \
+  x[1]=(b);                                     \
+  x;                                            \
+})
+
+#define vec8_elt0(x) ((x)[0])
+#define vec8_elt1(x) ((x)[1])
+#define vec8_elt(x,d) ((x)[d])
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (*(CCTK_REAL8_VEC const*)&(p))
+#define vec8_loadu(p) (*(CCTK_REAL8_VEC const*)&(p))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset and the vector size
+#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
+#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
+#define vec8_storeu(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
+// TODO: Use stvxl instruction?
+#define vec8_store_nta(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=(x)[0])
+#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=(x)[1])
+
+
+
+// Functions and operators
+
+// Operators
+#define k8pos(x) (+(x))
+#define k8neg(x) (-(x))
+
+#define k8add(x,y) ((x)+(y))
+#define k8sub(x,y) ((x)-(y))
+#define k8mul(x,y) ((x)*(y))
+#define k8div(x,y) ((x)/(y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k8madd(x,y,z) (vec_madd(x,y,z))
+#define k8msub(x,y,z) (vec_msub(x,y,z))
+#define k8nmadd(x,y,z) (vec_nmadd(x,y,z))
+#define k8nmsub(x,y,z) (vec_nmsub(x,y,z))
+
+// Cheap functions
+#define k8fabs(x) (vec_abs(x))
+#define k8fmax(x,y) (vec_max(x,y))
+#define k8fmin(x,y) (vec_min(x,y))
+#define k8fnabs(x) (vec_nabs(x))
+
+#define k8exp(x)                                                \
+({                                                              \
+  CCTK_REAL8_VEC const xexp=(x);                                \
+  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp)));         \
+})
+#define k8log(x)                                                \
+({                                                              \
+  CCTK_REAL8_VEC const xlog=(x);                                \
+  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog)));         \
+})
+#define k8pow(x,a)                                                      \
+({                                                                      \
+  CCTK_REAL8_VEC const xpow=(x);                                        \
+  CCTK_REAL8 const apow=(a);                                            \
+  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow));       \
+})
+#define k8sqrt(x)                                               \
+({                                                              \
+  CCTK_REAL8_VEC const xsqrt=(x);                               \
+  vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt)));     \
+})
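With two elements per vector, a partial store can only ever transfer a single element, so the macros above need no switch on n; the argument exists for interface compatibility. The explicit equivalent (hypothetical helper; GCC vector-element indexing as above):

    #include <altivec.h>

    static void store_hi1(double* p, vector double x)
    {
      p[1] = x[1]; // same effect as vec8_store_nta_partial_hi(p[0], x, 1)
    }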
diff --git a/src/vectors.h b/src/vectors.h
new file mode 100644
index 0000000..6fe909f
--- /dev/null
+++ b/src/vectors.h
@@ -0,0 +1,284 @@
+#ifndef VECTORS_H
+#define VECTORS_H
+
+#include <assert.h>
+
+
+
+#if defined(KRANC_VECTORS)
+
+# if defined(__SSE__)           // Intel SSE vector instructions
+#  include "vectors-intel-4.h"
+# elif defined(__ALTIVEC__)     // Altivec (Power)
+#  include "vectors-power-4.h"
+# endif
+
+# if defined(__SSE2__)          // Intel SSE2 vector instructions
+#  include "vectors-intel-8.h"
+# elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec (Power)
+#  include "vectors-power-8.h"
+# endif
+
+#endif
+
+// Default implementation, do not vectorise
+#if ! defined(CCTK_REAL4_VEC_SIZE)
+#  include "vectors-default-4.h"
+#endif
+#if ! defined(CCTK_REAL8_VEC_SIZE)
+#  include "vectors-default-8.h"
+#endif
+
+
+
+// Define macros for CCTK_REAL
+
+#if defined(CCTK_REAL_PRECISION_4)
+
+# define CCTK_REAL_VEC      CCTK_REAL4_VEC
+# define CCTK_REAL_VEC_SIZE CCTK_REAL4_VEC_SIZE
+
+# define vec_set1 vec4_set1
+# define vec_set  vec4_set
+
+# define vec_elt0 vec4_elt0
+# define vec_elt  vec4_elt
+
+# define vec_load                 vec4_load
+# define vec_loadu                vec4_loadu
+# define vec_loadu_maybe          vec4_loadu_maybe
+# define vec_loadu_maybe3         vec4_loadu_maybe3
+# define vec_store                vec4_store
+# define vec_store_nta            vec4_store_nta
+# define vec_store_nta_partial_lo vec4_store_nta_partial_lo
+# define vec_store_nta_partial_hi vec4_store_nta_partial_hi
+
+# define kpos k4pos
+# define kneg k4neg
+
+# define kadd k4add
+# define ksub k4sub
+# define kmul k4mul
+# define kdiv k4div
+
+# define kmadd  k4madd
+# define kmsub  k4msub
+# define knmadd k4nmadd
+# define knmsub k4nmsub
+
+# define kexp   k4exp
+# define kfabs  k4fabs
+# define kfmax  k4fmax
+# define kfmin  k4fmin
+# define kfnabs k4fnabs
+# define klog   k4log
+# define kpow   k4pow
+# define ksqrt  k4sqrt
+
+#elif defined(CCTK_REAL_PRECISION_8)
+
+# define CCTK_REAL_VEC      CCTK_REAL8_VEC
+# define CCTK_REAL_VEC_SIZE CCTK_REAL8_VEC_SIZE
+
+# define vec_set1 vec8_set1
+# define vec_set  vec8_set
+
+# define vec_elt0 vec8_elt0
+# define vec_elt  vec8_elt
+
+# define vec_load                 vec8_load
+# define vec_loadu                vec8_loadu
+# define vec_loadu_maybe          vec8_loadu_maybe
+# define vec_loadu_maybe3         vec8_loadu_maybe3
+# define vec_store                vec8_store
+# define vec_store_nta            vec8_store_nta
+# define vec_store_nta_partial_lo vec8_store_nta_partial_lo
+# define vec_store_nta_partial_hi vec8_store_nta_partial_hi
+
+# define kpos k8pos
+# define kneg k8neg
+
+# define kadd k8add
+# define ksub k8sub
+# define kmul k8mul
+# define kdiv k8div
+
+# define kmadd  k8madd
+# define kmsub  k8msub
+# define knmadd k8nmadd
+# define knmsub k8nmsub
+
+# define kexp   k8exp
+# define kfabs  k8fabs
+# define kfmax  k8fmax
+# define kfmin  k8fmin
+# define kfnabs k8fnabs
+# define klog   k8log
+# define kpow   k8pow
+# define ksqrt  k8sqrt
+
+#else
+
+# error "Unknown CCTK_REAL_PRECISION"
+
+#endif
+
+
+
+// Define a class template for easier access from C++
+
+#ifdef __cplusplus
+
+template<typename T>
+struct vecprops {
+  typedef T scalar_t;
+  typedef T vector_t;
+  static inline int size()
+  {
+    return 1;
+  }
+  static inline vector_t load (scalar_t const& a)
+  {
+    return a;
+  }
+  static inline vector_t loadu (scalar_t const& a)
+  {
+    return a;
+  }
+  static inline scalar_t elt (vector_t const& x, int const d)
+  {
+    return x;
+  }
+  static inline vector_t pos (vector_t const& x)
+  {
+    return +x;
+  }
+  static inline vector_t neg (vector_t const& x)
+  {
+    return -x;
+  }
+  static inline vector_t add (vector_t const& x, vector_t const& y)
+  {
+    return x+y;
+  }
+  static inline vector_t sub (vector_t const& x, vector_t const& y)
+  {
+    return x-y;
+  }
+  static inline vector_t mul (vector_t const& x, vector_t const& y)
+  {
+    return x*y;
+  }
+  static inline vector_t div (vector_t const& x, vector_t const& y)
+  {
+    return x/y;
+  }
+};
+
+template<>
+struct vecprops<CCTK_REAL4> {
+  typedef CCTK_REAL4     scalar_t;
+  typedef CCTK_REAL4_VEC vector_t;
+  static inline int size()
+  {
+    return CCTK_REAL4_VEC_SIZE;
+  }
+  static inline vector_t load (scalar_t const& a)
+  {
+    return vec4_load(a);
+  }
+  static inline vector_t loadu (scalar_t const& a)
+  {
+    return vec4_loadu(a);
+  }
+  static inline scalar_t elt (vector_t const& x, int const d)
+  {
+    return vec4_elt(x,d);
+  }
+  static inline vector_t pos (vector_t const& x)
+  {
+    return k4pos(x);
+  }
+  static inline vector_t neg (vector_t const& x)
+  {
+    return k4neg(x);
+  }
+  static inline vector_t add (vector_t const& x, vector_t const& y)
+  {
+    return k4add(x,y);
+  }
+  static inline vector_t sub (vector_t const& x, vector_t const& y)
+  {
+    return k4sub(x,y);
+  }
+  static inline vector_t mul (vector_t const& x, vector_t const& y)
+  {
+    return k4mul(x,y);
+  }
+  static inline vector_t div (vector_t const& x, vector_t const& y)
+  {
+    return k4div(x,y);
+  }
+};
+
+template<>
+struct vecprops<CCTK_REAL8> {
+  typedef CCTK_REAL8     scalar_t;
+  typedef CCTK_REAL8_VEC vector_t;
+  static inline int size()
+  {
+    return CCTK_REAL8_VEC_SIZE;
+  }
+  static inline vector_t load (scalar_t const& a)
+  {
+    return vec8_load(a);
+  }
+  static inline vector_t loadu (scalar_t const& a)
+  {
+    return vec8_loadu(a);
+  }
+  static inline scalar_t elt (vector_t const& x, int const d)
+  {
+    return vec8_elt(x,d);
+  }
+  static inline vector_t pos (vector_t const& x)
+  {
+    return k8pos(x);
+  }
+  static inline vector_t neg (vector_t const& x)
+  {
+    return k8neg(x);
+  }
+  static inline vector_t add (vector_t const& x, vector_t const& y)
+  {
+    return k8add(x,y);
+  }
+  static inline vector_t sub (vector_t const& x, vector_t const& y)
+  {
+    return k8sub(x,y);
+  }
+  static inline vector_t mul (vector_t const& x, vector_t const& y)
+  {
+    return k8mul(x,y);
+  }
+  static inline vector_t div (vector_t const& x, vector_t const& y)
+  {
+    return k8div(x,y);
+  }
+};
+
+#endif
+
+
+
+// For Kranc
+
+#undef ToReal
+#define ToReal(x) (vec_set1((CCTK_REAL)(x)))
+
+#undef Sign
+#define Sign(x) -999999999 // poison
+
+
+
+#endif // #ifndef VECTORS_H
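Taken together, the precision dispatch lets generated kernels be written once against the neutral names. A sketch of the intended loop shape, including the tail handling the partial stores exist for (hypothetical kernel; assumes cctk.h has already provided CCTK_REAL and CCTK_REAL_PRECISION_*, and that the arrays are padded so a full-width load at the tail is safe):

    #include "vectors.h"

    static void scale(CCTK_REAL const* x, CCTK_REAL* y, int n, CCTK_REAL a)
    {
      CCTK_REAL_VEC const av = vec_set1(a);
      int i;
      for (i=0; i+CCTK_REAL_VEC_SIZE<=n; i+=CCTK_REAL_VEC_SIZE) {
        vec_store_nta(y[i], kmul(av, vec_load(x[i])));
      }
      if (i < n) { // tail: store only the n-i lower elements
        vec_store_nta_partial_lo(y[i], kmul(av, vec_loadu(x[i])), n-i);
      }
    }

The C++ vecprops template serves the same purpose for templated code, selecting the vector type and width from the scalar type alone.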
-- 
cgit v1.2.3