author    eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>  2011-06-06 10:11:44 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>  2011-06-06 10:11:44 +0000
commit    2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch)
tree      6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5 /src/vectors-4-SSE.h
parent    5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff)
Introduce Cactus options for vectorisation
Introduce configuration-time options for vectorisation, including options for architecture-specific choices that may influence performance. Introduce "middle" masked stores for large vector sizes and small loops. Clean up and simplify some of the implementation code.

git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@10 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-4-SSE.h')
-rw-r--r--  src/vectors-4-SSE.h  |  295
1 file changed, 208 insertions(+), 87 deletions(-)
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index bc50e68..e6dc735 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -6,6 +6,10 @@
#include <xmmintrin.h>
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
@@ -22,56 +26,66 @@
#define vec4_set1(a) (_mm_set1_ps(a))
#define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
-#if defined(__PGI) && defined (__amd64__)
+// original order is 0123
+#define vec4_swap1032(x_) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \
+ })
+#define vec4_swap2301(x_) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \
+ })
+#define vec4_swap3210(x_) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,1,2,3)); \
+ })
+
+#if defined(__PGI)
// _mm_cvtss_f32 does not exist on PGI compilers
# define vec4_elt0(x) \
-({ \
- CCTK_REAL4 aelt0; \
- asm ("" : "=x" (aelt0) : "0" (x)); \
- aelt0; \
-})
+ ({ \
+ CCTK_REAL4 a; \
+ asm ("" : "=x" (a) : "0" (x)); \
+ a; \
+ })
#else
# define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
#endif
-#define vec4_elt1(x) \
-({ \
- CCTK_REAL4_VEC const xelt1=(x); \
- vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,0,3,2))); \
-})
-#define vec4_elt2(x) \
-({ \
- CCTK_REAL4_VEC const xelt2=(x); \
- vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2)); \
-})
-#define vec4_elt3(x) \
-({ \
- CCTK_REAL4_VEC const xelt3=(x); \
- vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,2,1,0))); \
-})
-#if defined(__PGI) && defined (__amd64__)
-# define vec4_elt(x,d) \
-({ \
- CCTK_REAL4_VEC const xelt=(x); \
- CCTK_REAL4 aelt; \
- if (d==0) aelt=vec4_elt0(xelt); \
- else if (d==1) aelt=vec4_elt1(xelt); \
- else if (d==2) aelt=vec4_elt2(xelt); \
- else if (d==3) aelt=vec4_elt3(xelt); \
- aelt; \
-})
+#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x))
+#define vec4_elt2(x) vec4_elt0(vec4_swap2301(x))
+#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x))
+#if defined(__PGI)
+# define vec4_elt(x_,d) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ CCTK_REAL4 a; \
+ if (d==0) a=vec4_elt0(x); \
+ else if (d==1) a=vec4_elt1(x); \
+ else if (d==2) a=vec4_elt2(x); \
+ else if (d==3) a=vec4_elt3(x); \
+ a; \
+ })
#else
-# define vec4_elt(x,d) \
-({ \
- CCTK_REAL4_VEC const xelt=(x); \
- CCTK_REAL4 aelt; \
- switch (d) { \
- case 0: aelt=vec4_elt0(xelt); break; \
- case 1: aelt=vec4_elt1(xelt); break; \
- case 2: aelt=vec4_elt2(xelt); break; \
- case 3: aelt=vec4_elt3(xelt); break; \
- } \
- aelt; \
-})
+# define vec4_elt(x_,d) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ CCTK_REAL4 a; \
+ switch (d) { \
+ case 0: a=vec4_elt0(x); break; \
+ case 1: a=vec4_elt1(x); break; \
+ case 2: a=vec4_elt2(x); break; \
+ case 3: a=vec4_elt3(x); break; \
+ } \
+ a; \
+ })
#endif
@@ -82,37 +96,133 @@
// a reference to a scalar
#define vec4_load(p) (_mm_load_ps(&(p)))
#define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec4_load_off1(p) vec4_loadu(p)
+# define vec4_load_off2(p) vec4_loadu(p)
+# define vec4_load_off3(p) vec4_loadu(p)
+#else
+# define vec4_load_off1(p_) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \
+ CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \
+ assert(0); \
+ CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
+ _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \
+ })
+# define vec4_load_off2(p_) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \
+ CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \
+ _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \
+ })
+# define vec4_load_off3(p_) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const lo=vec4_load((&p)[-3]); \
+ CCTK_REAL4_VEC const hi=vec4_load((&p)[+1]); \
+ assert(0); \
+ CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
+ _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \
+ })
+#endif
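
[Editor's note: the assert(0) calls above mark the offset-1 and offset-3 shuffle sequences as known-incorrect placeholders. For reference, a shuffle sequence that does yield p[0..3] from the two aligned loads, a paper-verified sketch and not code from this commit, with lo and hi as loaded above:

// offset 1: lo={p[-1],p[0],p[1],p[2]}, hi={p[3],p[4],p[5],p[6]}
CCTK_REAL4_VEC const t1=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,0,3,3)); // {p[2],p[2],p[3],p[3]}
_mm_shuffle_ps(lo,t1, _MM_SHUFFLE(2,0,2,1));                         // {p[0],p[1],p[2],p[3]}
// offset 3: lo={p[-3],p[-2],p[-1],p[0]}, hi={p[1],p[2],p[3],p[4]}
CCTK_REAL4_VEC const t3=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,0,3,3)); // {p[0],p[0],p[1],p[1]}
_mm_shuffle_ps(t3,hi, _MM_SHUFFLE(2,1,2,0));                         // {p[0],p[1],p[2],p[3]}
]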
// Load a vector from memory that may or may not be aligned, as
// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
-#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
-#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
-
-// Store a vector to memory (aligned and non-temporal); this stores to
-// a reference to a scalar
-#define vec4_store(p,x) (_mm_store_ps(&(p),x))
-#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
-#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+# define vec4_loadu_maybe(off,p) vec4_loadu(p)
+# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p)
+#else
+# define vec4_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ (off) % CCTK_REAL4_VEC_SIZE == 0 ? \
+ vec4_load(p) : \
+ vec4_loadu(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume that all arrays' x-sizes are multiples of the vector size
+# define vec4_loadu_maybe3(off1,off2,off3,p) \
+ vec4_loadu_maybe(off1,p)
+# else
+# define vec4_loadu_maybe3(off1,off2,off3,p) \
+ vec4_loadu_maybe((off1)|(off2)|(off3),p)
+# endif
+#endif
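
[Editor's note: a usage sketch, with hypothetical names u, i, and di that are not from this commit. A stencil kernel passes the compile-time offset of each neighbour access, so the aligned load is chosen whenever the offset is a multiple of the vector size:

CCTK_REAL4_VEC const uc=vec4_load(u[i]);               // base point, known aligned
CCTK_REAL4_VEC const up=vec4_loadu_maybe(di, u[i+di]); // aligned iff di%4 == 0
]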
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
-#define vec4_store_nta_partial_lo(p,x,n) \
-({ \
- switch (n) { \
- case 3: (&(p))[2]=vec_elt2(p); \
- case 2: _mm_storel_pi(&(p),x); break; \
- case 1: (&(p))[0]=vec_elt0(p); \
- } \
-})
-#define vec4_store_nta_partial_hi(p,x,n) \
-({ \
- switch (n) { \
- case 3: (&(p))[1]=vec_elt1(p); \
- case 2: _mm_storeh_pi(&(p)+2,x); break; \
- case 1: (&(p))[3]=vec_elt3(p); \
- } \
-})
+#if ! VECTORISE_STREAMING_STORES || ! defined(__SSE4A__)
+# define vec4_store_nta_partial_lo(p_,x_,n) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ switch (n) { \
+ case 1: (&p)[0]=vec4_elt0(x); break; \
+ case 2: _mm_storel_pi((__m64*)&p,x); break; \
+ case 3: _mm_storel_pi((__m64*)&p,x); (&p)[2]=vec4_elt2(x); break; \
+ } \
+ })
+# define vec4_store_nta_partial_hi(p_,x_,n) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ switch (n) { \
+ case 1: (&p)[3]=vec4_elt3(x); break; \
+ case 2: _mm_storeh_pi((__m64*)(&p+2),x); break; \
+ case 3: _mm_storeh_pi((__m64*)(&p+2),x); (&p)[1]=vec4_elt1(x); break; \
+ } \
+ })
+#else
+# define vec4_store_nta_partial_lo(p_,x_,n) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ switch (n) { \
+ case 1: \
+ _mm_stream_ss(&p,x); \
+ break; \
+ case 2: \
+ _mm_storel_pi((__m64*)&p,x); \
+ break; \
+ case 3: \
+ _mm_storel_pi((__m64*)&p,x); \
+ _mm_stream_ss(&p+2, vec4_swap2301(x)); \
+ break; \
+ } \
+ })
+# define vec4_store_nta_partial_hi(p_,x_,n) \
+ ({ \
+ CCTK_REAL4 const& pp=(p_); \
+ CCTK_REAL4 const& p=pp; \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ switch (n) { \
+ case 1: \
+ _mm_stream_ss(&p+3, vec4_swap3210(x)); \
+ break; \
+ case 2: \
+ _mm_storeh_pi((__m64*)(&p+2),x); \
+ break; \
+ case 3: \
+ _mm_storeh_pi((__m64*)(&p+2),x); \
+ _mm_stream_ss(&p+1, vec4_swap1032(x)); \
+ break; \
+ } \
+ })
+#endif
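
[Editor's note: a usage sketch for the partial stores, with assumed names a, x, and n that are not from this commit. After a vectorised loop over n points, the trailing n mod 4 elements can be written without touching memory beyond the array:

ptrdiff_t const nv = n / CCTK_REAL4_VEC_SIZE * CCTK_REAL4_VEC_SIZE;
if (n > nv) vec4_store_nta_partial_lo(a[nv], x, n-nv);
]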
@@ -132,10 +242,15 @@ static const union {
// Operators
#define k4pos(x) (x)
#define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+// #define k4inv(x)
+// TODO: provide k4inv via rcp and Newton-Raphson iteration.
+// This is described in AMD's publication 47414.
+// The same approach applies to AVX as well.
#define k4add(x,y) (_mm_add_ps(x,y))
#define k4sub(x,y) (_mm_sub_ps(x,y))
#define k4mul(x,y) (_mm_mul_ps(x,y))
+// TODO: use k4inv and k4mul instead
#define k4div(x,y) (_mm_div_ps(x,y))
// Fused multiply-add, defined as [+-]x*y[+-]z
@@ -149,25 +264,31 @@ static const union {
#define k4fmax(x,y) (_mm_max_ps(x,y))
#define k4fmin(x,y) (_mm_min_ps(x,y))
#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+// TODO: maybe use rsqrt and Newton-Raphson
#define k4sqrt(x) (_mm_sqrt_ps(x))
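
[Editor's note: similarly, a sketch of the rsqrt-based alternative hinted at above, not part of this commit. Note that x*rsqrt(x) yields NaN for x==0, so this is a speed/robustness trade-off rather than a drop-in replacement:

#define k4sqrt_fast(x_) \
  ({ \
    CCTK_REAL4_VEC const xx=(x_); \
    CCTK_REAL4_VEC const x=xx; \
    CCTK_REAL4_VEC const y0=_mm_rsqrt_ps(x); /* ~12-bit 1/sqrt estimate */ \
    /* one NR step: y1 = y0*(1.5 - 0.5*x*y0*y0) */ \
    CCTK_REAL4_VEC const y1= \
      k4mul(y0, k4sub(vec4_set1(1.5f), \
                      k4mul(vec4_set1(0.5f), k4mul(x, k4mul(y0,y0))))); \
    k4mul(x,y1); /* sqrt(x) = x * (1/sqrt(x)) */ \
  })
]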
// Expensive functions
-#define k4exp(x) \
-({ \
- CCTK_REAL4_VEC const xexp=(x); \
- vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \
- exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \
-})
-#define k4log(x) \
-({ \
- CCTK_REAL4_VEC const xlog=(x); \
- vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \
- log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \
-})
-#define k4pow(x,a) \
-({ \
- CCTK_REAL4_VEC const xpow=(x); \
- CCTK_REAL4 const apow=(a); \
- vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \
- pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \
-})
+#define K4REPL(f,x_) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ vec4_set(f(vec4_elt0(x)), \
+ f(vec4_elt1(x)), \
+ f(vec4_elt2(x)), \
+ f(vec4_elt3(x))); \
+ })
+#define K4REPL2(f,x_,a_) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ CCTK_REAL4 const aa=(a_); \
+ CCTK_REAL4 const a=aa; \
+ vec4_set(f(vec4_elt0(x),a), \
+ f(vec4_elt1(x),a), \
+ f(vec4_elt2(x),a), \
+ f(vec4_elt3(x),a)); \
+ })
+
+#define k4exp(x) K4REPL(exp,x)
+#define k4log(x) K4REPL(log,x)
+#define k4pow(x,a) K4REPL2(pow,x,a)
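
[Editor's note: K4REPL and K4REPL2 apply a scalar function to each of the four lanes and reassemble the vector, so further libm functions could be wrapped the same way, e.g. (illustrative, not definitions from this commit):

#define k4sin(x) K4REPL(sin,x)
#define k4cos(x) K4REPL(cos,x)
]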