author    | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-01-20 20:22:34 +0000
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-01-20 20:22:34 +0000
commit    | 5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (patch)
tree      | edd7f47bf30742d3a9583819496ae8bf9ea80fcf /src/vectors-4-SSE.h
parent    | 49084a03a0685df85894e22821a7ef63b2d8cf1c (diff)
Change naming scheme of architecture files
Add support for AVX (next-generation SSE)
Add support for Double Hummer (Blue Gene/P)
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@7 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-4-SSE.h')
-rw-r--r-- | src/vectors-4-SSE.h | 173
1 file changed, 173 insertions, 0 deletions
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
new file mode 100644
index 0000000..bc50e68
--- /dev/null
+++ b/src/vectors-4-SSE.h
@@ -0,0 +1,173 @@
+// Vectorise using Intel's or AMD's SSE
+
+// Use the type __m128 directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#include <xmmintrin.h>
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL4_VEC __m128
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 4
+
+
+
+// Create vectors, extract vector elements
+
+#define vec4_set1(a) (_mm_set1_ps(a))
+#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
+
+#if defined(__PGI) && defined (__amd64__)
+// _mm_cvtss_f32 does not exist on PGI compilers
+# define vec4_elt0(x) \
+({ \
+  CCTK_REAL4 aelt0; \
+  asm ("" : "=x" (aelt0) : "0" (x)); \
+  aelt0; \
+})
+#else
+# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
+#endif
+#define vec4_elt1(x) \
+({ \
+  CCTK_REAL4_VEC const xelt1=(x); \
+  vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,1,1,1))); \
+})
+#define vec4_elt2(x) \
+({ \
+  CCTK_REAL4_VEC const xelt2=(x); \
+  vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2)); \
+})
+#define vec4_elt3(x) \
+({ \
+  CCTK_REAL4_VEC const xelt3=(x); \
+  vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,3,3,3))); \
+})
+#if defined(__PGI) && defined (__amd64__)
+# define vec4_elt(x,d) \
+({ \
+  CCTK_REAL4_VEC const xelt=(x); \
+  CCTK_REAL4 aelt; \
+  if (d==0) aelt=vec4_elt0(xelt); \
+  else if (d==1) aelt=vec4_elt1(xelt); \
+  else if (d==2) aelt=vec4_elt2(xelt); \
+  else if (d==3) aelt=vec4_elt3(xelt); \
+  aelt; \
+})
+#else
+# define vec4_elt(x,d) \
+({ \
+  CCTK_REAL4_VEC const xelt=(x); \
+  CCTK_REAL4 aelt; \
+  switch (d) { \
+  case 0: aelt=vec4_elt0(xelt); break; \
+  case 1: aelt=vec4_elt1(xelt); break; \
+  case 2: aelt=vec4_elt2(xelt); break; \
+  case 3: aelt=vec4_elt3(xelt); break; \
+  } \
+  aelt; \
+})
+#endif
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec4_load(p) (_mm_load_ps(&(p)))
+#define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+// Implementation: Always use unaligned load
+#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
+#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
+
+// Store a vector to memory (aligned, unaligned, and non-temporal);
+// this stores to a reference to a scalar
+#define vec4_store(p,x) (_mm_store_ps(&(p),x))
+#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
+#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+#define vec4_store_nta_partial_lo(p,x,n) \
+({ \
+  switch (n) { \
+  case 3: (&(p))[2]=vec4_elt2(x); /* fall through */ \
+  case 2: _mm_storel_pi((__m64*)&(p),x); break; \
+  case 1: (&(p))[0]=vec4_elt0(x); \
+  } \
+})
+#define vec4_store_nta_partial_hi(p,x,n) \
+({ \
+  switch (n) { \
+  case 3: (&(p))[1]=vec4_elt1(x); /* fall through */ \
+  case 2: _mm_storeh_pi((__m64*)(&(p)+2),x); break; \
+  case 1: (&(p))[3]=vec4_elt3(x); \
+  } \
+})
+
+
+
+// Functions and operators
+
+static const union {
+  unsigned i[4];
+  __m128 v;
+} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
+#define k4sign_mask (k4sign_mask_union.v)
+static const union {
+  unsigned i[4];
+  __m128 v;
+} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }};
+#define k4abs_mask (k4abs_mask_union.v)
+
+// Operators
+#define k4pos(x) (x)
+#define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+
+#define k4add(x,y) (_mm_add_ps(x,y))
+#define k4sub(x,y) (_mm_sub_ps(x,y))
+#define k4mul(x,y) (_mm_mul_ps(x,y))
+#define k4div(x,y) (_mm_div_ps(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#define k4madd(x,y,z) (k4add(k4mul(x,y),z))
+#define k4msub(x,y,z) (k4sub(k4mul(x,y),z))
+#define k4nmadd(x,y,z) (k4sub(k4neg(z),k4mul(x,y)))
+#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
+
+// Cheap functions
+#define k4fabs(x) (_mm_and_ps(x,k4abs_mask))
+#define k4fmax(x,y) (_mm_max_ps(x,y))
+#define k4fmin(x,y) (_mm_min_ps(x,y))
+#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+#define k4sqrt(x) (_mm_sqrt_ps(x))
+
+// Expensive functions
+#define k4exp(x) \
+({ \
+  CCTK_REAL4_VEC const xexp=(x); \
+  vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \
+           exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \
+})
+#define k4log(x) \
+({ \
+  CCTK_REAL4_VEC const xlog=(x); \
+  vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \
+           log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \
+})
+#define k4pow(x,a) \
+({ \
+  CCTK_REAL4_VEC const xpow=(x); \
+  CCTK_REAL4 const apow=(a); \
+  vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \
+           pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \
+})
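
For orientation, here is a minimal usage sketch. It is not part of the commit: the include path is assumed, and CCTK_REAL4 (normally supplied by Cactus's cctk.h) is typedef'd to float only so the example builds standalone with gcc or clang on x86. It computes a = b*c + d over aligned arrays with the macros above, then demonstrates vec4_set's memory-order arguments and element extraction.

// Usage sketch -- not from the commit; standalone assumptions as noted above.
#include <stdio.h>
typedef float CCTK_REAL4; // normally from cctk.h
#include "vectors-4-SSE.h" // include path assumed

int main(void) {
  // vec4_load/vec4_store require 16-byte alignment
  __attribute__((aligned(16))) CCTK_REAL4 b[8], c[8], d[8], a[8];
  for (int i=0; i<8; ++i) { b[i]=(CCTK_REAL4)i; c[i]=2.0f; d[i]=1.0f; }

  for (int i=0; i<8; i+=CCTK_REAL4_VEC_SIZE) {
    CCTK_REAL4_VEC const bv = vec4_load(b[i]); // loads from &b[i]
    CCTK_REAL4_VEC const cv = vec4_load(c[i]);
    CCTK_REAL4_VEC const dv = vec4_load(d[i]);
    vec4_store(a[i], k4madd(bv, cv, dv)); // a = b*c + d (emulated FMA)
  }

  // vec4_set takes elements in memory order; internally the arguments are
  // reversed because _mm_set_ps lists the highest element first
  CCTK_REAL4_VEC const v = vec4_set(1.0f, 2.0f, 3.0f, 4.0f);
  printf("elt0=%g elt3=%g a[5]=%g\n",
         (double)vec4_elt0(v), (double)vec4_elt3(v), (double)a[5]);
  return 0;
}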
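The partial-store macros are intended for loop tails where fewer than CCTK_REAL4_VEC_SIZE elements remain. A second sketch, under the same standalone assumptions as above, shows that boundary handling; the array length 7 and the use of k4sqrt are purely illustrative.

// Loop-tail sketch -- also not from the commit; same standalone assumptions.
#include <stdio.h>
typedef float CCTK_REAL4;
#include "vectors-4-SSE.h" // include path assumed

int main(void) {
  enum { N = 7 }; // deliberately not a multiple of CCTK_REAL4_VEC_SIZE
  __attribute__((aligned(16))) CCTK_REAL4 src[N], dst[N];
  for (int i=0; i<N; ++i) src[i] = (CCTK_REAL4)(i*i);

  int i;
  for (i=0; i+CCTK_REAL4_VEC_SIZE<=N; i+=CCTK_REAL4_VEC_SIZE) {
    // the _maybe variant is told the offset, but this implementation
    // simply always performs an unaligned load
    vec4_store(dst[i], k4sqrt(vec4_loadu_maybe(i, src[i])));
  }

  int const nleft = N - i; // here: 3 trailing elements
  if (nleft > 0) {
    // hoist the value: the partial-store macro evaluates x more than once
    CCTK_REAL4_VEC const v = k4sqrt(vec4_loadu(src[i]));
    vec4_store_nta_partial_lo(dst[i], v, nleft); // stores lowest nleft elements
  }

  for (int j=0; j<N; ++j) printf("%g ", (double)dst[j]);
  printf("\n");
  return 0;
}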