Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r-- | src/vectors-8-SSE2.h | 216
1 file changed, 153 insertions, 63 deletions
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 34aa24f..4a3f4e2 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -6,6 +6,14 @@
 #include <emmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
@@ -22,43 +30,17 @@
 #define vec8_set1(a) (_mm_set1_pd(a))
 #define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
-#if defined(__PGI) && defined (__amd64__)
-// _mm_cvtsd_f64 does not exist on PGI 9 compilers
-# define vec8_elt0(x) \
-({ \
-  CCTK_REAL8 aelt0; \
-  asm ("" : "=x" (aelt0) : "0" (x)); \
-  aelt0; \
-})
-#else
-# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
-#endif
-#define vec8_elt1(x) \
-({ \
-  CCTK_REAL8_VEC const xelt1=(x); \
-  vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \
-})
-#if defined(__PGI) && defined (__amd64__)
-# define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  if (d==0) aelt=vec8_elt0(xelt); \
-  else if (d==1) aelt=vec8_elt1(xelt); \
-  aelt; \
-})
-#else
-# define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  switch (d) { \
-  case 0: aelt=vec8_elt0(xelt); break; \
-  case 1: aelt=vec8_elt1(xelt); break; \
-  } \
-  aelt; \
-})
-#endif
+// original order is 01
+#define vec8_swap10(x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \
+  })
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
@@ -68,29 +50,96 @@
 // a reference to a scalar
 #define vec8_load(p) (_mm_load_pd(&(p)))
 #define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# define vec8_load_off1(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    _mm_shuffle_pd(vec8_load((&p)[-1]), \
+                   vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \
+  })
+#endif
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
 // Implementation: Always use unaligned load
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+# define vec8_loadu_maybe(off,p) vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+      vec8_load(p) : \
+      vec8_load_off1(p); \
+  })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) \
+  vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+     (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+      vec8_loadu(p) : \
+      vec8_loadu_maybe(off1,p); \
+  })
+# endif
+#endif
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
-#define vec8_store(p,x) (_mm_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
-#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#define vec8_store(p,x)  (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) vec8_store(p,x)
+#else
+# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#endif
-// Store a lower or higher partial vector (aligned and non-temporal);
-// the non-temporal hint is probably ignored
-#if 1
+// Store a lower or higher partial vector (aligned and non-temporal)
+#if ! VECTORISE_STREAMING_STORES
 # define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
 # define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
 #else
+# if defined(__SSE4A__)
+# define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+  (_mm_stream_sd(&(p)+1, vec8_swap10(x)))
+# else
+// TODO: use clflush once a whole cache line has been written (cache
+// lines are usually larger than the CPU vector size)
+# define vec8_store_nta_partial_lo(p_,x,n) \
+  ({ \
+    CCTK_REAL8& pp=(p_); \
+    CCTK_REAL8& p=pp; \
+    _mm_storel_pd(&p,x); \
+    /* _mm_clflush(&p); */ \
+  })
+# define vec8_store_nta_partial_hi(p_,x,n) \
+  ({ \
+    CCTK_REAL8& pp=(p_); \
+    CCTK_REAL8& p=pp; \
+    _mm_storeh_pd(&p+1,x); \
+    /* _mm_clflush(&p+1); */ \
+  })
+# endif
+#endif
+#if 0
 // This is slower; we would need a non-temporal read
-# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1)))
-# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p))))
+#define vec8_store_nta_partial_lo(p,x,n) \
+  vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1))
+#define vec8_store_nta_partial_hi(p,x,n) \
+  vec8_store_nta(p, _mm_loadl_pd(x,&(p)))
 #endif
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
@@ -107,6 +156,43 @@ static const union {
 } k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};
 #define k8abs_mask (k8sign_mask_union.v)
+// Choice [sign(x)>0 ? y : z]
+#ifdef __SSE4_1__
+# define k8ifthen(x,y,z) (_mm_blendv_pd(y,z,x))
+#elif 0
+# define k8ifthen(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    int const m = _mm_movemask_pd(x); \
+    CCTK_REAL8_VEC r; \
+    switch (m) { \
+    case 0: r = y; break; \
+    case 1: r = _mm_move_sd(y,z); break; \
+    case 2: r = _mm_move_sd(z,y); break; \
+    case 3: r = z; break; \
+    } \
+    r; \
+  })
+#else
+# define k8ifthen(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    CCTK_REAL8_VEC const c = _mm_and_pd(x,k8sign_mask); \
+    vec8_set(not vec8_elt0(c) ? vec8_elt0(y) : vec8_elt0(z), \
+             not vec8_elt1(c) ? vec8_elt1(y) : vec8_elt1(z)); \
+  })
+#endif
+
 // Operators
 #define k8pos(x) (x)
 #define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
@@ -130,19 +216,23 @@ static const union {
 #define k8sqrt(x) (_mm_sqrt_pd(x))
 // Expensive functions
-#define k8exp(x) \
-({ \
-  CCTK_REAL8_VEC const xexp=(x); \
-  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
-  CCTK_REAL8_VEC const xlog=(x); \
-  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
-  CCTK_REAL8_VEC const xpow=(x); \
-  CCTK_REAL8 const apow=(a); \
-  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
+#define K8REPL(f,x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    vec8_set(f(vec8_elt0(x)), \
+             f(vec8_elt1(x))); \
+  })
+#define K8REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8 const aa=(a_); \
+    CCTK_REAL8 const a=aa; \
+    vec8_set(f(vec8_elt0(x),a), \
+             f(vec8_elt1(x),a)); \
+  })
+
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
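For readers unfamiliar with the macros this commit touches, the following standalone sketch (not part of the commit, and independent of the Cactus CCTK_REAL8_VEC types) illustrates the SSE2 idiom the vec8_*/k8* macros wrap: load two doubles, select per lane with a comparison mask (what k8ifthen expresses; with SSE 4.1 the header uses _mm_blendv_pd instead, and its plain-SSE2 fallback extracts the lanes as scalars), and write the result with a streaming store as vec8_store_nta does when VECTORISE_STREAMING_STORES is enabled. All array and variable names here are illustrative only; compile e.g. with gcc -msse2 -O2.

// Standalone illustration (hypothetical), mirroring the macros above:
// clamp negative entries of x[] to zero, two doubles at a time.
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
  // 16-byte alignment so the aligned load and the streaming store are legal
  double x[4] __attribute__((aligned(16))) = { -1.0, 2.0, -3.0, 4.0 };
  double y[4] __attribute__((aligned(16)));
  __m128d const zero = _mm_set1_pd(0.0);            // cf. vec8_set1(0.0)

  for (int i = 0; i < 4; i += 2) {
    __m128d v    = _mm_load_pd(&x[i]);              // cf. vec8_load(x[i])
    __m128d mask = _mm_cmplt_pd(v, zero);           // all-ones lanes where v < 0
    // Per-lane select without SSE 4.1 _mm_blendv_pd:
    // (mask & zero) | (~mask & v); the header's own SSE2 fallback
    // instead extracts the lanes as scalars and rebuilds with vec8_set.
    __m128d r = _mm_or_pd(_mm_and_pd(mask, zero),
                          _mm_andnot_pd(mask, v));
    _mm_stream_pd(&y[i], r);                        // cf. vec8_store_nta(y[i], r)
  }
  _mm_sfence();                                     // order the streaming stores

  printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);  // expected: 0 2 0 4
  return 0;
}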