diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-06-06 10:11:44 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-06-06 10:11:44 +0000 |
commit | 2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch) | |
tree | 6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5 /src/vectors-4-SSE.h | |
parent | 5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff) |
Introduce Cactus options for vectorisation
Introduce configuration-time options for vectorisation, including
options to allow architecture-specific choices that may influence
performance.
Introduce "middle" masked stores for large vector sizes and small
loops.
Clean up and simplify some of the implementation code.
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@10 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-4-SSE.h')
-rw-r--r-- | src/vectors-4-SSE.h | 295 |
1 file changed, 208 insertions, 87 deletions
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h index bc50e68..e6dc735 100644 --- a/src/vectors-4-SSE.h +++ b/src/vectors-4-SSE.h @@ -6,6 +6,10 @@ #include <xmmintrin.h> +#ifdef __SSE4A__ +// AMD's SSE 4a +# include <ammintrin.h> +#endif @@ -22,56 +26,66 @@ #define vec4_set1(a) (_mm_set1_ps(a)) #define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments -#if defined(__PGI) && defined (__amd64__) +// original order is 0123 +#define vec4_swap1032(x_) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \ + }) +#define vec4_swap2301(x_) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \ + }) +#define vec4_swap3210(x_) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,1,2,3)); \ + }) + +#if defined(__PGI) // _mm_cvtss_f32 does not exist on PGI compilers # define vec4_elt0(x) \ -({ \ - CCTK_REAL4 aelt0; \ - asm ("" : "=x" (aelt0) : "0" (x)); \ - aelt0; \ -}) + ({ \ + CCTK_REAL4 a; \ + asm ("" : "=x" (a) : "0" (x)); \ + a; \ + }) #else # define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op #endif -#define vec4_elt1(x) \ -({ \ - CCTK_REAL4_VEC const xelt1=(x); \ - vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,0,3,2))); \ -}) -#define vec4_elt2(x) \ -({ \ - CCTK_REAL4_VEC const xelt2=(x); \ - vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2)); \ -}) -#define vec4_elt3(x) \ -({ \ - CCTK_REAL4_VEC const xelt3=(x); \ - vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,2,1,0))); \ -}) -#if defined(__PGI) && defined (__amd64__) -# define vec4_elt(x,d) \ -({ \ - CCTK_REAL4_VEC const xelt=(x); \ - CCTK_REAL4 aelt; \ - if (d==0) aelt=vec4_elt0(xelt); \ - else if (d==1) aelt=vec4_elt1(xelt); \ - else if (d==2) aelt=vec4_elt2(xelt); \ - else if (d==3) aelt=vec4_elt3(xelt); \ - aelt; \ -}) +#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x)) +#define vec4_elt2(x) 
vec4_elt0(vec4_swap2301(x)) +#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x)) +#if defined(__PGI) +# define vec4_elt(x_,d) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + CCTK_REAL4 a; \ + if (d==0) a=vec4_elt0(x); \ + else if (d==1) a=vec4_elt1(x); \ + else if (d==2) a=vec4_elt2(x); \ + else if (d==3) a=vec4_elt3(x); \ + a; \ + }) #else -# define vec4_elt(x,d) \ -({ \ - CCTK_REAL4_VEC const xelt=(x); \ - CCTK_REAL4 aelt; \ - switch (d) { \ - case 0: aelt=vec4_elt0(xelt); break; \ - case 1: aelt=vec4_elt1(xelt); break; \ - case 2: aelt=vec4_elt2(xelt); break; \ - case 3: aelt=vec4_elt3(xelt); break; \ - } \ - aelt; \ -}) +# define vec4_elt(x_,d) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + CCTK_REAL4 a; \ + switch (d) { \ + case 0: a=vec4_elt0(x); break; \ + case 1: a=vec4_elt1(x); break; \ + case 2: a=vec4_elt2(x); break; \ + case 3: a=vec4_elt3(x); break; \ + } \ + a; \ + }) #endif @@ -82,37 +96,133 @@ // a reference to a scalar #define vec4_load(p) (_mm_load_ps(&(p))) #define vec4_loadu(p) (_mm_loadu_ps(&(p))) +#if ! 
VECTORISE_ALWAYS_USE_ALIGNED_LOADS +# define vec4_load_off1(p) vec4_loadu(p) +# define vec4_load_off2(p) vec4_loadu(p) +# define vec4_load_off3(p) vec4_loadu(p) +#else +# define vec4_load_off1(p_) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \ + CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \ + assert(0); \ + CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \ + _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \ + }) +# define vec4_load_off2(p_) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \ + CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \ + _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \ + }) +# define vec4_load_off3(p_) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \ + CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \ + assert(0); \ + CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \ + _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \ + }) +#endif // Load a vector from memory that may or may not be aligned, as // decided by the offset off and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS // Implementation: Always use unaligned load -#define vec4_loadu_maybe(off,p) (vec4_loadu(p)) -#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p)) - -// Store a vector to memory (aligned and non-temporal); this stores to -// a reference to a scalar -#define vec4_store(p,x) (_mm_store_ps(&(p),x)) -#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x)) -#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x)) +# define vec4_loadu_maybe(off,p) vec4_loadu(p) +# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p) +#else +# define vec4_loadu_maybe(off,p_) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + (off) % CCTK_REAL4_VEC_SIZE == 0 ?
\ + vec4_load(p) : \ + vec4_loadu(p); \ + }) +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +# define vec4_loadu_maybe3(off1,off2,off3,p) \ + vec4_loadu_maybe(off1,p) +# else +# define vec4_loadu_maybe3(off1,off2,off3,p) \ + vec4_loadu_maybe((off1)|(off2)|(off3),p) +# endif +#endif // Store a lower or higher partial vector (aligned and non-temporal); // the non-temporal hint is probably ignored -#define vec4_store_nta_partial_lo(p,x,n) \ -({ \ - switch (n) { \ - case 3: (&(p))[2]=vec_elt2(p); \ - case 2: _mm_storel_pi(&(p),x); break; \ - case 1: (&(p))[0]=vec_elt0(p); \ - } \ -}) -#define vec4_store_nta_partial_hi(p,x,n) \ -({ \ - switch (n) { \ - case 3: (&(p))[1]=vec_elt1(p); \ - case 2: _mm_storeh_pi(&(p)+2,x); break; \ - case 1: (&(p))[3]=vec_elt3(p); \ - } \ -}) +#if ! VECTORISE_STREAMING_STORES || ! defined(__SSE4A__) +# define vec4_store_nta_partial_lo(p_,x_,n) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + switch (n) { \ + case 1: (&p)[0]=vec4_elt0(x); break; \ + case 2: _mm_storel_pi((__m64*)&p,x); break; \ + case 3: _mm_storel_pi((__m64*)&p,x); (&p)[2]=vec4_elt2(x); break; \ + } \ + }) +# define vec4_store_nta_partial_hi(p_,x_,n) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + switch (n) { \ + case 1: (&p)[3]=vec4_elt3(x); break; \ + case 2: _mm_storeh_pi((__m64*)(&p+2),x); break; \ + case 3: _mm_storeh_pi((__m64*)(&p+2),x); (&p)[1]=vec4_elt1(x); break; \ + } \ + }) +#else +# define vec4_store_nta_partial_lo(p_,x_,n) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + switch (n) { \ + case 1: \ + _mm_stream_ss(&p,x); \ + break; \ + case 2: \ + _mm_storel_pi((__m64*)&p,x); \ + break; \ + case 3: \ + _mm_storel_pi((__m64*)&p,x); \ + _mm_stream_ss(&p+2, vec4_swap2301(x)); \ + break; \ + } \ + }) +# define 
vec4_store_nta_partial_hi(p_,x_,n) \ + ({ \ + CCTK_REAL4 const& pp=(p_); \ + CCTK_REAL4 const& p=pp; \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + switch (n) { \ + case 1: \ + _mm_stream_ss(&p+3, vec4_swap3210(x)); \ + break; \ + case 2: \ + _mm_storeh_pi((__m64*)(&p+2),x); \ + break; \ + case 3: \ + _mm_storeh_pi((__m64*)(&p+2),x); \ + _mm_stream_ss(&p+1, vec4_swap1032(x)); \ + break; \ + } \ + }) +#endif @@ -132,10 +242,15 @@ static const union { // Operators #define k4pos(x) (x) #define k4neg(x) (_mm_xor_ps(x,k4sign_mask)) +// #define k4inv(x) +// TODO: provide k4inv via rcp and Newton-Raphson +// This is described in AMD's publication 47414. +// This should apply for AVX as well. #define k4add(x,y) (_mm_add_ps(x,y)) #define k4sub(x,y) (_mm_sub_ps(x,y)) #define k4mul(x,y) (_mm_mul_ps(x,y)) +// TODO: use k4inv and k4mul instead #define k4div(x,y) (_mm_div_ps(x,y)) // Fused multiply-add, defined as [+-]x*y[+-]z @@ -149,25 +264,31 @@ static const union { #define k4fmax(x,y) (_mm_max_ps(x,y)) #define k4fmin(x,y) (_mm_min_ps(x,y)) #define k4fnabs(x) (_mm_or_ps(x,k4sign_mask)) +// TODO: maybe use rsqrt and Newton-Raphson #define k4sqrt(x) (_mm_sqrt_ps(x)) // Expensive functions -#define k4exp(x) \ -({ \ - CCTK_REAL4_VEC const xexp=(x); \ - vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \ - exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \ -}) -#define k4log(x) \ -({ \ - CCTK_REAL4_VEC const xlog=(x); \ - vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \ - log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \ -}) -#define k4pow(x,a) \ -({ \ - CCTK_REAL4_VEC const xpow=(x); \ - CCTK_REAL4 const apow=(a); \ - vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \ - pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \ -}) +#define K4REPL(f,x_) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + vec4_set(f(vec4_elt0(x)), \ + f(vec4_elt1(x)), \ + f(vec4_elt2(x)), \ + f(vec4_elt3(x))); \ + }) +#define K4REPL2(f,x_,a_) \ + ({ \ + 
CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + CCTK_REAL4 const aa=(a_); \ + CCTK_REAL4 const a=aa; \ + vec4_set(f(vec4_elt0(x),a), \ + f(vec4_elt1(x),a), \ + f(vec4_elt2(x),a), \ + f(vec4_elt3(x),a)); \ + }) + +#define k4exp(x) K4REPL(exp,x) +#define k4log(x) K4REPL(log,x) +#define k4pow(x,a) K4REPL2(pow,x,a) |