author     eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>   2011-06-06 10:11:44 +0000
committer  eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>   2011-06-06 10:11:44 +0000
commit     2ab4d61cd4b632c0e991c781f3c15f3b054d1bbd (patch)
tree       6664b1e9ee360ee0abf9df6b9a5562eb5bdc88c5
parent     5d4858e0736a0c0881c65b9e9ac0983d3b5bb24b (diff)
Introduce Cactus options for vectorisation
Introduce configuration-time options for vectorisation, including
options to allow architecture-specific choices that may influence
performance.
Introduce "middle" masked stores for large vector sizes and small
loops.
Clean up and simplify some of the implementation code.
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@10 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r--   README                       |  47
-rw-r--r--   configuration.ccl            |  10
-rw-r--r--   configure.sh                 | 123
-rw-r--r--   src/vectors-4-Altivec.h      | 134
-rw-r--r--   src/vectors-4-SSE.h          | 295
-rw-r--r--   src/vectors-4-default.h      |   1
-rw-r--r--   src/vectors-8-AVX.h          | 127
-rw-r--r--   src/vectors-8-DoubleHummer.h | 180
-rw-r--r--   src/vectors-8-SSE2.h         | 216
-rw-r--r--   src/vectors-8-VSX.h          |  66
-rw-r--r--   src/vectors-8-default.h      |   1
-rw-r--r--   src/vectors.h                |  52
12 files changed, 902 insertions(+), 350 deletions(-)
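The "middle" masked stores introduced by this commit handle a loop whose vectorised extent is so short that a single vector straddles both the lower and the upper boundary, so only interior lanes may be written. The following scalar sketch shows the intended semantics for a 4-element vector; it is illustrative plain C with hypothetical names, not the thorn's code (the real implementation is the vec4_store_nta_partial_mid macro in the diff below):

    #include <assert.h>

    /* Scalar model of vec4_store_nta_partial_mid(p,x,nlo,nhi): write
       only the lanes that are among the first nlo AND among the last
       nhi, i.e. lanes i with 4-nhi <= i < nlo.  For nlo==nhi==3 this
       writes lanes 1 and 2, matching the macro in the diff. */
    static void store_partial_mid4(float *p, const float x[4],
                                   int nlo, int nhi)
    {
      assert(nlo >= 0 && nlo <= 4 && nhi >= 0 && nhi <= 4);
      for (int i = 4 - nhi; i < nlo; ++i)
        p[i] = x[i];
    }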
diff --git a/README b/README
--- a/README
+++ b/README
@@ -6,4 +6,49 @@ Licence : GPL
 
 1. Purpose
 
-Provide a C++ class template that helps vectorisation.
+Provide C macro definitions and a C++ class template that help
+vectorisation.
+
+
+
+2. Build-time choices
+
+Several choices can be made via configuration options, which can be
+set to "yes" or "no":
+
+VECTORISE (default "no"): Vectorise.  Otherwise, scalar code is
+generated, and the other options have no effect.
+
+
+
+VECTORISE_ALIGNED_ARRAYS (default "no", experimental): Assume that all
+arrays have an extent in the x direction that is a multiple of the
+vector size.  This allows aligned load operations e.g. for finite
+differencing operators in the y and z directions.  (Setting this
+produces faster code, but may lead to segfaults if the assumption is
+not true.)
+
+VECTORISE_ALWAYS_USE_UNALIGNED_LOADS (default "no", experimental):
+Replace all aligned load operations with unaligned load operations.
+This may simplify some code where alignment is unknown at compile
+time.  This should never lead to better code, since the default is to
+use aligned load operations iff the alignment is known to permit this
+at build time.  This option is probably useless.
+
+VECTORISE_ALWAYS_USE_ALIGNED_LOADS (default "no", experimental):
+Replace all unaligned load operations by (multiple) aligned load
+operations and corresponding vector-gather operations.  This may be
+beneficial if unaligned load operations are slow and vector-gather
+operations are fast.
+
+VECTORISE_INLINE (default "yes"): Inline functions into the loop body
+as much as possible.  (Disabling this may reduce code size, which can
+improve performance if the instruction cache is small.)
+
+VECTORISE_STREAMING_STORES (default "yes"): Use streaming stores,
+i.e. store operations that bypass the cache.  (Disabling this
+produces slower code.)
+
+VECTORISE_EMULATE_AVX (default "no", experimental): Emulate AVX
+instructions with SSE2 instructions.  This produces slower code, but
+can be used to test AVX code on systems that don't support AVX.
diff --git a/configuration.ccl b/configuration.ccl
index ec50e58..54500d7 100644
--- a/configuration.ccl
+++ b/configuration.ccl
@@ -2,4 +2,14 @@
 PROVIDES Vectors
 {
+  SCRIPT configure.sh
+  LANG bash
+  OPTIONS \
+    VECTORISE \
+    VECTORISE_ALIGNED_ARRAYS \
+    VECTORISE_ALWAYS_USE_UNALIGNED_LOADS \
+    VECTORISE_ALWAYS_USE_ALIGNED_LOADS \
+    VECTORISE_INLINE \
+    VECTORISE_STREAMING_STORES \
+    VECTORISE_EMULATE_AVX
 }
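The configure script (next file) normalises each yes/no option to 0 or 1 and exports it through Cactus's BEGIN DEFINE block, so the vector headers can test the macros arithmetically rather than with #ifdef. A minimal sketch of the consuming side; the guard names are the real ones from this commit, the bodies are placeholders:

    /* Sketch: how code built against this thorn sees the options once
       configure.sh has defined each of them to 0 or 1. */
    #if VECTORISE
    # if ! VECTORISE_STREAMING_STORES
       /* non-temporal stores fall back to ordinary cached stores */
    # endif
    #else
       /* scalar code is generated; the remaining options have no effect */
    #endif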
+case $(echo "x$OPTIMISE" | tr '[:upper:]' '[:lower:]') in + (xyes) ;; # do nothing + (xno) VECTORISE=0 ;; # disable vectorisation + (*) echo "BEGIN ERROR" + echo "Illegal value of option CCTK_OPTIMISE_MODE" + echo "END ERROR" + exit 1 +esac + +case $(echo "x$VECTORISE_ALIGNED_ARRAYS" | tr '[:upper:]' '[:lower:]') in + (xyes) VECTORISE_ALIGNED_ARRAYS=1 ;; + (xno) VECTORISE_ALIGNED_ARRAYS=0 ;; + (x) VECTORISE_ALIGNED_ARRAYS=0 ;; # default + (*) echo "BEGIN ERROR" + echo "Illegal value of option VECTORISE_ALIGNED_ARRAYS" + echo "END ERROR" + exit 1 +esac + +case $(echo "x$VECTORISE_ALWAYS_USE_UNALIGNED_LOADS" | tr '[:upper:]' '[:lower:]') in + (xyes) VECTORISE_ALWAYS_USE_UNALIGNED_LOADS=1 ;; + (xno) VECTORISE_ALWAYS_USE_UNALIGNED_LOADS=0 ;; + (x) VECTORISE_ALWAYS_USE_UNALIGNED_LOADS=0 ;; # default + (*) echo "BEGIN ERROR" + echo "Illegal value of option VECTORISE_ALWAYS_USE_UNALIGNED_LOADS" + echo "END ERROR" + exit 1 +esac + +case $(echo "x$VECTORISE_ALWAYS_USE_ALIGNED_LOADS" | tr '[:upper:]' '[:lower:]') in + (xyes) VECTORISE_ALWAYS_USE_ALIGNED_LOADS=1 ;; + (xno) VECTORISE_ALWAYS_USE_ALIGNED_LOADS=0 ;; + (x) VECTORISE_ALWAYS_USE_ALIGNED_LOADS=0 ;; # default + (*) echo "BEGIN ERROR" + echo "Illegal value of option VECTORISE_ALWAYS_USE_ALIGNED_LOADS" + echo "END ERROR" + exit 1 +esac + +case $(echo "x$VECTORISE_STREAMING_STORES" | tr '[:upper:]' '[:lower:]') in + (xyes) VECTORISE_STREAMING_STORES=1 ;; + (xno) VECTORISE_STREAMING_STORES=0 ;; + (x) VECTORISE_STREAMING_STORES=1 ;; # default + (*) echo "BEGIN ERROR" + echo "Illegal value of option VECTORISE_STREAMING_STORES" + echo "END ERROR" + exit 1 +esac + +case $(echo "x$VECTORISE_INLINE" | tr '[:upper:]' '[:lower:]') in + (xyes) VECTORISE_INLINE=1 ;; + (xno) VECTORISE_INLINE=0 ;; + (x) VECTORISE_INLINE=1 ;; # default + (*) echo "BEGIN ERROR" + echo "Illegal value of option VECTORISE_INLINE" + echo "END ERROR" + exit 1 +esac + +case $(echo "x$VECTORISE_EMULATE_AVX" | tr '[:upper:]' '[:lower:]') in + (xyes) VECTORISE_EMULATE_AVX=1 ;; + (xno) VECTORISE_EMULATE_AVX=0 ;; + (x) VECTORISE_EMULATE_AVX=0 ;; # default + (*) echo "BEGIN ERROR" + echo "Illegal value of option VECTORISE_EMULATE_AVX" + echo "END ERROR" + exit 1 +esac + + + +################################################################################ +# Configure Cactus +################################################################################ + +# Pass options to Cactus +echo "BEGIN DEFINE" +echo "VECTORISE $VECTORISE" +echo "VECTORISE_ALIGNED_ARRAYS $VECTORISE_ALIGNED_ARRAYS" +echo "VECTORISE_ALWAYS_USE_UNALIGNED_LOADS $VECTORISE_ALWAYS_USE_UNALIGNED_LOADS" +echo "VECTORISE_ALWAYS_USE_ALIGNED_LOADS $VECTORISE_ALWAYS_USE_ALIGNED_LOADS" +echo "VECTORISE_INLINE $VECTORISE_INLINE" +echo "VECTORISE_STREAMING_STORES $VECTORISE_STREAMING_STORES" +echo "VECTORISE_EMULATE_AVX $VECTORISE_EMULATE_AVX" +echo "END DEFINE" + +echo "BEGIN MAKE_DEFINITION" +echo "VECTORISE = $VECTORISE" +echo "VECTORISE_ALIGNED_ARRAYS = $VECTORISE_ALIGNED_ARRAYS" +echo "VECTORISE_ALWAYS_USE_UNALIGNED_LOADS = $VECTORISE_ALWAYS_USE_UNALIGNED_LOADS" +echo "VECTORISE_ALWAYS_USE_ALIGNED_LOADS = $VECTORISE_ALWAYS_USE_ALIGNED_LOADS" +echo "VECTORISE_INLINE = $VECTORISE_INLINE" +echo "VECTORISE_STREAMING_STORES = $VECTORISE_STREAMING_STORES" +echo "VECTORISE_EMULATE_AVX = $VECTORISE_EMULATE_AVX" +echo "END MAKE_DEFINITION" diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h index 06cea58..b44b492 100644 --- a/src/vectors-4-Altivec.h +++ b/src/vectors-4-Altivec.h @@ -21,14 +21,14 @@ #define vec4_set1(a) 
diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h
index 06cea58..b44b492 100644
--- a/src/vectors-4-Altivec.h
+++ b/src/vectors-4-Altivec.h
@@ -21,14 +21,14 @@
 #define vec4_set1(a) (vec_splats(a))
 #define vec4_set(a,b,c,d) \
-({ \
-  CCTK_REAL4_VEC x; \
-  x[0]=(a); \
-  x[1]=(b); \
-  x[2]=(c); \
-  x[3]=(d); \
-  x; \
-})
+  ({ \
+    CCTK_REAL4_VEC x; \
+    x[0]=(a); \
+    x[1]=(b); \
+    x[2]=(c); \
+    x[3]=(d); \
+    x; \
+  })
 
 #define vec4_elt0(x) ((x)[0])
 #define vec4_elt1(x) ((x)[1])
@@ -52,10 +52,10 @@
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
-#define vec4_store(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
-#define vec4_storeu(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
-#if 0
-# define vec4_store_nta(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+#define vec4_store(p,x)  (*(CCTK_REAL4_VEC*)&(p)=(x))
+#define vec4_storeu(p,x) (*(CCTK_REAL4_VEC*)&(p)=(x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec4_store_nta(p,x) (vec4_store(p,x))
 #else
 // use stvxl instruction
 # define vec4_store_nta(p,x) (vec_stl(x,0,(CCTK_REAL4_VEC*)&(p)))
@@ -63,22 +63,49 @@
 // Store a lower or higher partial vector (aligned and non-temporal);
 // the non-temporal hint is probably ignored
-#define vec4_store_nta_partial_lo(p,x,n) \
-({ \
-  switch (n) { \
-  case 3: ((&(p))[2]=(x)[2]); \
-  case 2: ((&(p))[1]=(x)[1]); \
-  case 1: ((&(p))[0]=(x)[0]); \
-  } \
-})
-#define vec4_store_nta_partial_hi(p,x,n) \
-({ \
-  switch (n) { \
-  case 3: ((&(p))[1]=(x)[1]); \
-  case 2: ((&(p))[2]=(x)[2]); \
-  case 1: ((&(p))[3]=(x)[3]); \
-  } \
-})
+#define vec4_store_nta_partial_lo(p_,x_,n) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    switch (n) { \
+    case 3: (&p)[2]=x[2]; \
+    case 2: (&p)[1]=x[1]; \
+    case 1: (&p)[0]=x[0]; \
+    } \
+  })
+#define vec4_store_nta_partial_hi(p_,x_,n) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    switch (n) { \
+    case 3: (&p)[1]=x[1]; \
+    case 2: (&p)[2]=x[2]; \
+    case 1: (&p)[3]=x[3]; \
+    } \
+  })
+#define vec4_store_nta_partial_mid(p_,x_,nlo_,nhi_) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    int const nnlo=(nlo_); \
+    int const nlo=nnlo; \
+    int const nnhi=(nhi_); \
+    int const nhi=nnhi; \
+    if (nlo==3 and nhi==3) { \
+      (&p)[1]=x[1]; \
+      (&p)[2]=x[2]; \
+    } else if (nlo==2 and nhi==3) { \
+      (&p)[1]=x[1]; \
+    } else if (nlo==3 and nhi==2) { \
+      (&p)[2]=x[2]; \
+    } \
+  })
@@ -105,28 +132,29 @@
 #define k4fmin(x,y) (vec_min(x,y))
 #define k4fnabs(x) (vec_nabs(x))
 
-#define k4exp(x) \
-({ \
-  CCTK_REAL4_VEC const xexp=(x); \
-  vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \
-           exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \
-})
-#define k4log(x) \
-({ \
-  CCTK_REAL4_VEC const xlog=(x); \
-  vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \
-           log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \
-})
-#define k4pow(x,a) \
-({ \
-  CCTK_REAL4_VEC const xpow=(x); \
-  CCTK_REAL4 const apow=(a); \
-  vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \
-           pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \
-})
-#define k4sqrt(x) \
-({ \
-  CCTK_REAL4_VEC const xsqrt=(x); \
-  vec4_set(sqrt(vec4_elt0(xsqrt)), sqrt(vec4_elt1(xsqrt)), \
-           sqrt(vec4_elt2(xsqrt)), sqrt(vec4_elt3(xsqrt))); \
-})
+// Expensive functions
+#define K4REPL(f,x_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    vec4_set(f(vec4_elt0(x)), \
+             f(vec4_elt1(x)), \
+             f(vec4_elt2(x)), \
+             f(vec4_elt3(x))); \
+  })
+#define K4REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    CCTK_REAL4 const aa=(a_); \
+    CCTK_REAL4 const a=aa; \
+    vec4_set(f(vec4_elt0(x),a), \
+             f(vec4_elt1(x),a), \
+             f(vec4_elt2(x),a), \
+             f(vec4_elt3(x),a)); \
+  })
+
+#define k4exp(x)   K4REPL(exp,x)
+#define k4log(x)   K4REPL(log,x)
+#define k4pow(x,a) K4REPL2(pow,x,a)
+#define k4sqrt(x)  K4REPL(sqrt,x)
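K4REPL applies a scalar function to each lane because Altivec and SSE have no vector exp/log/pow; the macro evaluates its argument once and reassembles the per-lane results. A plain-C model of what K4REPL(exp,x) expands to, with the vector type simplified to a 4-float struct (illustrative only, not the thorn's types):

    #include <math.h>

    /* Model of the K4REPL replication pattern: apply the scalar
       function lane by lane and rebuild a vector from the results. */
    typedef struct { float v[4]; } real4_vec;

    static real4_vec k4exp_model(real4_vec x)
    {
      real4_vec r;
      for (int i = 0; i < 4; ++i)
        r.v[i] = expf(x.v[i]);
      return r;
    }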
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index bc50e68..e6dc735 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -6,6 +6,10 @@
 #include <xmmintrin.h>
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
@@ -22,56 +26,66 @@
 #define vec4_set1(a) (_mm_set1_ps(a))
 #define vec4_set(a,b,c,d) (_mm_set_ps(d,c,b,a)) // note reversed arguments
 
-#if defined(__PGI) && defined (__amd64__)
+// original order is 0123
+#define vec4_swap1032(x_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,3,0,1)); \
+  })
+#define vec4_swap2301(x_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,0,3,2)); \
+  })
+#define vec4_swap3210(x_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,1,2,3)); \
+  })
+
+#if defined(__PGI)
 // _mm_cvtss_f32 does not exist on PGI compilers
 # define vec4_elt0(x) \
-({ \
-  CCTK_REAL4 aelt0; \
-  asm ("" : "=x" (aelt0) : "0" (x)); \
-  aelt0; \
-})
+  ({ \
+    CCTK_REAL4 a; \
+    asm ("" : "=x" (a) : "0" (x)); \
+    a; \
+  })
 #else
 # define vec4_elt0(x) (_mm_cvtss_f32(x)) // this is a no-op
 #endif
-#define vec4_elt1(x) \
-({ \
-  CCTK_REAL4_VEC const xelt1=(x); \
-  vec4_elt0(_mm_shuffle_ps(xelt1,xelt1,_MM_SHUFFLE(1,0,3,2))); \
-})
-#define vec4_elt2(x) \
-({ \
-  CCTK_REAL4_VEC const xelt2=(x); \
-  vec4_elt0(_mm_unpackhi_ps(xelt2,xelt2)); \
-})
-#define vec4_elt3(x) \
-({ \
-  CCTK_REAL4_VEC const xelt3=(x); \
-  vec4_elt0(_mm_shuffle_ps(xelt3,xelt3,_MM_SHUFFLE(3,2,1,0))); \
-})
-#if defined(__PGI) && defined (__amd64__)
-# define vec4_elt(x,d) \
-({ \
-  CCTK_REAL4_VEC const xelt=(x); \
-  CCTK_REAL4 aelt; \
-  if (d==0) aelt=vec4_elt0(xelt); \
-  else if (d==1) aelt=vec4_elt1(xelt); \
-  else if (d==2) aelt=vec4_elt2(xelt); \
-  else if (d==3) aelt=vec4_elt3(xelt); \
-  aelt; \
-})
+#define vec4_elt1(x) vec4_elt0(vec4_swap1032(x))
+#define vec4_elt2(x) vec4_elt0(vec4_swap2301(x))
+#define vec4_elt3(x) vec4_elt0(vec4_swap3210(x))
+#if defined(__PGI)
+# define vec4_elt(x_,d) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    CCTK_REAL4 a; \
+    if (d==0) a=vec4_elt0(x); \
+    else if (d==1) a=vec4_elt1(x); \
+    else if (d==2) a=vec4_elt2(x); \
+    else if (d==3) a=vec4_elt3(x); \
+    a; \
+  })
 #else
-# define vec4_elt(x,d) \
-({ \
-  CCTK_REAL4_VEC const xelt=(x); \
-  CCTK_REAL4 aelt; \
-  switch (d) { \
-  case 0: aelt=vec4_elt0(xelt); break; \
-  case 1: aelt=vec4_elt1(xelt); break; \
-  case 2: aelt=vec4_elt2(xelt); break; \
-  case 3: aelt=vec4_elt3(xelt); break; \
-  } \
-  aelt; \
-})
+# define vec4_elt(x_,d) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    CCTK_REAL4 a; \
+    switch (d) { \
+    case 0: a=vec4_elt0(x); break; \
+    case 1: a=vec4_elt1(x); break; \
+    case 2: a=vec4_elt2(x); break; \
+    case 3: a=vec4_elt3(x); break; \
+    } \
+    a; \
+  })
 #endif
@@ -82,37 +96,133 @@
 // Load a vector from memory (aligned and unaligned); this loads from
 // a reference to a scalar
 #define vec4_load(p)  (_mm_load_ps(&(p)))
 #define vec4_loadu(p) (_mm_loadu_ps(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec4_load_off1(p) vec4_loadu(p)
+# define vec4_load_off2(p) vec4_loadu(p)
+# define vec4_load_off3(p) vec4_loadu(p)
+#else
+# define vec4_load_off1(p_) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    CCTK_REAL4_VEC const lo=vec4_load((&p)[-1]); \
+    CCTK_REAL4_VEC const hi=vec4_load((&p)[+3]); \
+    assert(0); \
+    CCTK_REAL4_VEC const hi2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
+    _mm_shuffle_ps(lo,hi2, _MM_SHUFFLE(2,1,3,0)); \
+  })
+# define vec4_load_off2(p_) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    CCTK_REAL4_VEC const lo=vec4_load((&p)[-2]); \
+    CCTK_REAL4_VEC const hi=vec4_load((&p)[+2]); \
+    _mm_shuffle_ps(lo,hi, _MM_SHUFFLE(1,0,3,2)); \
+  })
+# define vec4_load_off3(p_) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    CCTK_REAL4_VEC const lo=vec4_load((&p)[-3]); \
+    CCTK_REAL4_VEC const hi=vec4_load((&p)[+1]); \
+    assert(0); \
+    CCTK_REAL4_VEC const lo2=_mm_shuffle_ps(lo,hi, _MM_SHUFFLE(0,1,2,3)); \
+    _mm_shuffle_ps(lo2,hi, _MM_SHUFFLE(3,0,2,1)); \
+  })
+#endif
 
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
 // Implementation: Always use unaligned load
-#define vec4_loadu_maybe(off,p) (vec4_loadu(p))
-#define vec4_loadu_maybe3(off1,off2,off3,p) (vec4_loadu(p))
-
-// Store a vector to memory (aligned and non-temporal); this stores to
-// a reference to a scalar
-#define vec4_store(p,x) (_mm_store_ps(&(p),x))
-#define vec4_storeu(p,x) (_mm_storeu_ps(&(p),x))
-#define vec4_store_nta(p,x) (_mm_stream_ps(&(p),x))
+# define vec4_loadu_maybe(off,p)             vec4_loadu(p)
+# define vec4_loadu_maybe3(off1,off2,off3,p) vec4_loadu(p)
+#else
+# define vec4_loadu_maybe(off,p_) \
+  ({ \
+    CCTK_REAL4 const& pp=(p_); \
+    CCTK_REAL4 const& p=pp; \
+    (off) % CCTK_REAL4_VEC_SIZE == 0 ? \
+      vec4_load(p) : \
+      vec4_loadu(p); \
+  })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+#  define vec4_loadu_maybe3(off1,off2,off3,p) \
+   vec4_loadu_maybe(off1,p)
+# else
+#  define vec4_loadu_maybe3(off1,off2,off3,p) \
+   vec4_loadu_maybe((off1)|(off2)|(off3),p)
+# endif
+#endif
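vec4_loadu_maybe picks an aligned load whenever the compile-time offset of the access is a multiple of the vector size, and loadu_maybe3 ORs the three direction offsets so one test covers them all (under VECTORISE_ALIGNED_ARRAYS only the x offset matters). A function-form sketch of the decision, with a hypothetical helper name:

    #include <xmmintrin.h>

    #define VEC_SIZE 4  /* plays the role of CCTK_REAL4_VEC_SIZE */

    /* Model of vec4_loadu_maybe: 'off' is the known offset of this
       access from an aligned base; if it preserves alignment, the
       cheaper aligned load can be used. */
    static __m128 loadu_maybe(int off, const float *p)
    {
      return off % VEC_SIZE == 0 ? _mm_load_ps(p) : _mm_loadu_ps(p);
    }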
 // Store a lower or higher partial vector (aligned and non-temporal);
 // the non-temporal hint is probably ignored
-#define vec4_store_nta_partial_lo(p,x,n) \
-({ \
-  switch (n) { \
-  case 3: (&(p))[2]=vec_elt2(p); \
-  case 2: _mm_storel_pi(&(p),x); break; \
-  case 1: (&(p))[0]=vec_elt0(p); \
-  } \
-})
-#define vec4_store_nta_partial_hi(p,x,n) \
-({ \
-  switch (n) { \
-  case 3: (&(p))[1]=vec_elt1(p); \
-  case 2: _mm_storeh_pi(&(p)+2,x); break; \
-  case 1: (&(p))[3]=vec_elt3(p); \
-  } \
-})
+#if ! VECTORISE_STREAMING_STORES || ! defined(__SSE4A__)
+# define vec4_store_nta_partial_lo(p_,x_,n) \
+  ({ \
+    CCTK_REAL4& pp=(p_); \
+    CCTK_REAL4& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    switch (n) { \
+    case 1: (&p)[0]=vec4_elt0(x); break; \
+    case 2: _mm_storel_ps(&p,x); break; \
+    case 3: _mm_storel_ps(&p,x); (&p)[2]=vec4_elt2(x); break; \
+    } \
+  })
+# define vec4_store_nta_partial_hi(p_,x_,n) \
+  ({ \
+    CCTK_REAL4& pp=(p_); \
+    CCTK_REAL4& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    switch (n) { \
+    case 1: (&p)[3]=vec4_elt3(x); break; \
+    case 2: _mm_storeh_ps(&p+2,x); break; \
+    case 3: _mm_storeh_ps(&p+2,x); (&p)[1]=vec4_elt1(x); break; \
+    } \
+  })
+#else
+# define vec4_store_nta_partial_lo(p_,x_,n) \
+  ({ \
+    CCTK_REAL4& pp=(p_); \
+    CCTK_REAL4& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    switch (n) { \
+    case 1: \
+      _mm_stream_ss(&p,x); \
+      break; \
+    case 2: \
+      _mm_storel_ps(&p,x); \
+      break; \
+    case 3: \
+      _mm_storel_ps(&p,x); \
+      _mm_stream_ss(&p+2, vec4_swap2301(x)); \
+      break; \
+    } \
+  })
+# define vec4_store_nta_partial_hi(p_,x_,n) \
+  ({ \
+    CCTK_REAL4& pp=(p_); \
+    CCTK_REAL4& p=pp; \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    switch (n) { \
+    case 1: \
+      _mm_stream_ss(&p+3, vec4_swap3210(x)); \
+      break; \
+    case 2: \
+      _mm_storeh_ps(&p+2,x); \
+      break; \
+    case 3: \
+      _mm_storeh_ps(&p+2,x); \
+      _mm_stream_ss(&p+1, vec4_swap1032(x)); \
+      break; \
+    } \
+  })
+#endif
@@ -132,10 +242,15 @@ static const union {
 // Operators
 #define k4pos(x) (x)
 #define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+// #define k4inv(x)
+// TODO: provide k4inv via rcp and Newton-Raphson
+// This is described in AMD's publication 47414.
+// This should apply for AVX as well.
 
 #define k4add(x,y) (_mm_add_ps(x,y))
 #define k4sub(x,y) (_mm_sub_ps(x,y))
 #define k4mul(x,y) (_mm_mul_ps(x,y))
+// TODO: use k4inv and k4mul instead
 #define k4div(x,y) (_mm_div_ps(x,y))
 
 // Fused multiply-add, defined as [+-]x*y[+-]z
@@ -149,25 +264,31 @@
 #define k4fmax(x,y) (_mm_max_ps(x,y))
 #define k4fmin(x,y) (_mm_min_ps(x,y))
 #define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+// TODO: maybe use rsqrt and Newton-Raphson
 #define k4sqrt(x) (_mm_sqrt_ps(x))
 
 // Expensive functions
-#define k4exp(x) \
-({ \
-  CCTK_REAL4_VEC const xexp=(x); \
-  vec4_set(exp(vec4_elt0(xexp)), exp(vec4_elt1(xexp)), \
-           exp(vec4_elt2(xexp)), exp(vec4_elt3(xexp))); \
-})
-#define k4log(x) \
-({ \
-  CCTK_REAL4_VEC const xlog=(x); \
-  vec4_set(log(vec4_elt0(xlog)), log(vec4_elt1(xlog)), \
-           log(vec4_elt2(xlog)), log(vec4_elt3(xlog))); \
-})
-#define k4pow(x,a) \
-({ \
-  CCTK_REAL4_VEC const xpow=(x); \
-  CCTK_REAL4 const apow=(a); \
-  vec4_set(pow(vec4_elt0(xpow),apow), pow(vec4_elt1(xpow),apow), \
-           pow(vec4_elt2(xpow),apow), pow(vec4_elt3(xpow),apow)); \
-})
+#define K4REPL(f,x_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    vec4_set(f(vec4_elt0(x)), \
+             f(vec4_elt1(x)), \
+             f(vec4_elt2(x)), \
+             f(vec4_elt3(x))); \
+  })
+#define K4REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    CCTK_REAL4 const aa=(a_); \
+    CCTK_REAL4 const a=aa; \
+    vec4_set(f(vec4_elt0(x),a), \
+             f(vec4_elt1(x),a), \
+             f(vec4_elt2(x),a), \
+             f(vec4_elt3(x),a)); \
+  })
+
+#define k4exp(x)   K4REPL(exp,x)
+#define k4log(x)   K4REPL(log,x)
+#define k4pow(x,a) K4REPL2(pow,x,a)
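The k4inv/k4div TODOs above refer to the standard scheme from AMD's optimisation guide (publication 47414): seed with the hardware reciprocal estimate, then refine with Newton-Raphson, r' = r*(2 - x*r). A sketch of that scheme with SSE intrinsics; this is the method the comment names, not code from the thorn:

    #include <xmmintrin.h>

    /* Sketch of the proposed k4inv: _mm_rcp_ps gives a ~12-bit
       reciprocal estimate; one Newton-Raphson step roughly doubles
       the number of correct bits, which is usually enough for
       single precision. */
    static __m128 k4inv_sketch(__m128 x)
    {
      __m128 r = _mm_rcp_ps(x);                            /* estimate */
      __m128 two = _mm_set1_ps(2.0f);
      r = _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(x, r))); /* refine */
      return r;
    }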
diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h
index e20109d..1277040 100644
--- a/src/vectors-4-default.h
+++ b/src/vectors-4-default.h
@@ -50,6 +50,7 @@
 // the vector elements into memory locations as if element 0 were
 // stored at p.
 #define vec4_store_nta_partial_hi(p,x,n) (assert(0))
+#define vec4_store_nta_partial_mid(p,x,nlo,nhi) (assert(0))
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 78c00d4..0f08096 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -5,7 +5,7 @@
-#if defined(EMULATE_AVX)
+#if VECTORISE_EMULATE_AVX
 # include "avxintrin_emu.h"
 #else
 # include <immintrin.h>
@@ -39,31 +39,11 @@ union k8const_t {
 #define vec8_set1(a) (_mm256_set1_pd(a))
 #define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments
 
-#define vec8_elt0(x) (_mm_cvtsd_f64(_mm256_extractf128_pd(x,0)))
-#define vec8_elt1(x) \
-({ \
-  __m128d const xelt1=_mm256_extractf128_pd(x,0); \
-  _mm_cvtsd_f64(_mm_unpackhi_pd(xelt1,xelt1)); \
-})
-#define vec8_elt2(x) (_mm_cvtsd_f64(_mm256_extractf128_pd(x,1)))
-#define vec8_elt3(x) \
-({ \
-  __m128d const xelt3=_mm256_extractf128_pd(x,1); \
-  _mm_cvtsd_f64(_mm_unpackhi_pd(xelt3,xelt3)); \
-})
-
-#define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  switch (d) { \
-  case 0: aelt=vec8_elt0(xelt); break; \
-  case 1: aelt=vec8_elt1(xelt); break; \
-  case 2: aelt=vec8_elt2(xelt); break; \
-  case 3: aelt=vec8_elt3(xelt); break; \
-  } \
-  aelt; \
-})
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2])
+#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
@@ -73,18 +53,53 @@ union k8const_t {
 // a reference to a scalar
 #define vec8_load(p)  (_mm256_load_pd(&(p)))
 #define vec8_loadu(p) (_mm256_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
+#endif
 
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
 // Implementation: Always use unaligned load
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+# define vec8_loadu_maybe(off,p)             (vec8_loadu(p))
+# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+#else
+# define vec8_loadu_maybe(off,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+      vec8_load(p) : \
+      vec8_load_off1(p); \
+  })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+#  define vec8_loadu_maybe3(off1,off2,off3,p) \
+   vec8_loadu_maybe(off1,p)
+# else
+#  define vec8_loadu_maybe3(off1,off2,off3,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+     (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+      vec8_loadu(p) : \
+      vec8_loadu_maybe(off1,p); \
+  })
+# endif
+#endif
 
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
-#define vec8_store(p,x) (_mm256_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x))
-#define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
+#define vec8_store(p,x)  (_mm256_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) (vec8_store(p,x))
+#else
+# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
+#endif
 
 // Store a lower or higher partial vector (aligned and non-temporal);
 // the non-temporal hint is probably ignored
@@ -96,8 +111,6 @@ static const k8const_t k8store_lo_union[5] =
   {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_ZERO, }},
   {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }},
 };
-#define vec8_store_nta_partial_lo(p,x,n) \
-  (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x))
 
 static const k8const_t k8store_hi_union[5] =
 {
   {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
@@ -106,8 +119,28 @@ static const k8const_t k8store_hi_union[5] =
   {{ K8_ZERO, K8_IMIN, K8_IMIN, K8_IMIN, }},
   {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }},
 };
-#define vec8_store_nta_partial_hi(p,x,n) \
+#if defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
+# define vec8_store_nta_partial_lo(p,x,n) \
+  (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+  (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi_union[n].vi),x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+  (_mm256_maskstore_pd \
+   (&(p), \
+    _mm256_castsi256_pd(k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi), \
+    x))
+#else
+# define vec8_store_nta_partial_lo(p,x,n) \
+  (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x))
+# define vec8_store_nta_partial_hi(p,x,n) \
   (_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+  (_mm256_maskstore_pd \
+   (&(p), \
+    k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi, \
+    x))
+#endif
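On AVX the partial stores need no switch: _mm256_maskstore_pd writes exactly the lanes whose mask lane has its top bit set, and the "mid" mask is the AND of a "first nlo lanes" mask with a "last nhi lanes" mask. A standalone sketch of the same idea; the LO/HI tables here use all-ones lanes rather than the header's sign-bit-only K8_IMIN constants (maskstore only inspects the top bit, so both work), and note the gcc 4.4 prototype caveat mentioned in the diff:

    #include <immintrin.h>

    /* Count-indexed mask tables: LO[n] selects the first n lanes,
       HI[n] the last n lanes. */
    static const long long LO[5][4] = {
      {0,0,0,0}, {-1,0,0,0}, {-1,-1,0,0}, {-1,-1,-1,0}, {-1,-1,-1,-1},
    };
    static const long long HI[5][4] = {
      {0,0,0,0}, {0,0,0,-1}, {0,0,-1,-1}, {0,-1,-1,-1}, {-1,-1,-1,-1},
    };

    /* Write only the interior lanes of x to p (AVX-only integer AND
       done through the pd domain, as plain AVX lacks 256-bit _mm256_and_si256). */
    static void store_mid(double *p, __m256d x, int nlo, int nhi)
    {
      __m256d lo = _mm256_castsi256_pd(_mm256_loadu_si256((const __m256i *)LO[nlo]));
      __m256d hi = _mm256_castsi256_pd(_mm256_loadu_si256((const __m256i *)HI[nhi]));
      _mm256_maskstore_pd(p, _mm256_castpd_si256(_mm256_and_pd(lo, hi)), x);
    }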
@@ -141,23 +174,23 @@ static const k8const_t k8abs_mask_union =
 #define k8sqrt(x) (_mm256_sqrt_pd(x))
 
 // Expensive functions
-#define K8REPL(x,func) \
+#define K8REPL(f,x) \
 ({ \
   CCTK_REAL8_VEC const xfunc=(x); \
-  vec8_set((vec8_elt0(xfunc)), \
-           (vec8_elt1(xfunc)), \
-           (vec8_elt2(xfunc)), \
-           (vec8_elt3(xfunc))); \
+  vec8_set(f(vec8_elt0(xfunc)), \
+           f(vec8_elt1(xfunc)), \
+           f(vec8_elt2(xfunc)), \
+           f(vec8_elt3(xfunc))); \
 })
-#define K8REPL2(x,a,func) \
+#define K8REPL2(f,x,a) \
 ({ \
   CCTK_REAL8_VEC const xfunc=(x); \
   CCTK_REAL8 const afunc=(a); \
-  vec8_set((vec8_elt0(xfunc),afunc), \
-           (vec8_elt1(xfunc),afunc), \
-           (vec8_elt2(xfunc),afunc), \
-           (vec8_elt3(xfunc),afunc)); \
+  vec8_set(f(vec8_elt0(xfunc),afunc), \
+           f(vec8_elt1(xfunc),afunc), \
+           f(vec8_elt2(xfunc),afunc), \
+           f(vec8_elt3(xfunc),afunc)); \
 })
 
-#define k8exp(x) K8REPL(x,exp)
-#define k8log(x) K8REPL(x,log)
-#define k8pow(x,a) K8REPL2(x,a,exp)
+#define k8exp(x)   K8REPL(exp,x)
+#define k8log(x)   K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h
index 9311f62..16ed8a0 100644
--- a/src/vectors-8-DoubleHummer.h
+++ b/src/vectors-8-DoubleHummer.h
@@ -24,16 +24,17 @@
 #define vec8_elt0(x) (__creal(x))
 #define vec8_elt1(x) (__cimag(x))
-#define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  switch (d) { \
-  case 0: aelt=vec8_elt0(xelt); break; \
-  case 1: aelt=vec8_elt1(xelt); break; \
-  } \
-  aelt; \
-})
+#define vec8_elt(x_,d) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8 a; \
+    switch (d) { \
+    case 0: a=vec8_elt0(x); break; \
+    case 1: a=vec8_elt1(x); break; \
+    } \
+    a; \
+  })
@@ -41,13 +42,72 @@
 // Load a vector from memory (aligned and unaligned); this loads from
 // a reference to a scalar
-#define vec8_load(p) (__lfpd((double *)&(p)))
-#define vec8_loadu(p) (__lfpd((double *)&(p))) // this may not work
+#define vec8_load(p) (__lfpd((CCTK_REAL8 *)&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    vec8_set((&p)[0],(&p)[1]); \
+  })
+#else
+#if 0
+# define vec8_load_off1(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    CCTK_REAL8_VEC const lo = __lfxd((CCTK_REAL8 *)(&p-1)); \
+    CCTK_REAL8_VEC const hi = __lfxd((CCTK_REAL8 *)(&p+1)); \
+    __fpsel(vec8_set(-1.0,+1.0),lo,hi); \
+  })
+#endif
+# define vec8_load_off1(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    CCTK_REAL8_VEC const lo = vec8_load((&p)[-1]); \
+    CCTK_REAL8_VEC const hi = vec8_load((&p)[+1]); \
+    __fxmr(__fpsel(vec8_set(+1.0,-1.0),lo,hi)); \
+  })
+#endif
+#define vec8_loadu(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    int const off = (ptrdiff_t)&p & 0xf; \
+    off==0 ? vec8_load(p) : vec8_load_off1(p); \
+  })
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset and the vector size
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec8_loadu_maybe(off,p)             vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+      vec8_load(p) : \
+      vec8_load_off1(p); \
+  })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+#  define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p)
+# else
+#  define vec8_loadu_maybe3(off1,off2,off3,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    ((off2) % CCTK_REAL8_VEC_SIZE != 0 || \
+     (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+      vec8_loadu(p) : \
+      vec8_loadu_maybe(off1,p); \
+  })
+# endif
+#endif
 
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
@@ -59,6 +119,7 @@
 // the non-temporal hint is probably ignored
 #define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=vec8_elt0(x))
 #define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec8_elt1(x))
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
@@ -71,7 +132,29 @@
 #define k8add(x,y) (__fpadd(x,y))
 #define k8sub(x,y) (__fpsub(x,y))
 #define k8mul(x,y) (__fpmul(x,y))
-#define k8div(x,y) (__fpmul(x,__fpre(y)))
+// Estimate for reciprocal
+#define k8inv_init(x) (__fpre(x))
+// One Newton iteration for reciprocal
+#define k8inv_iter(x_,r_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const rr=(r_); \
+    CCTK_REAL8_VEC const r=rr; \
+    /* r + r * (vec8_set1(1.0) - x*r); */ \
+    k8madd(r, k8nmsub(x, r, vec8_set1(1.0)), r); \
+  })
+// Reciprocal: First estimate, then apply two Newton iterations
+#define k8inv(x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const r0 = k8inv_init(x); \
+    CCTK_REAL8_VEC const r1 = k8inv_iter(x,r0); \
+    CCTK_REAL8_VEC const r2 = k8inv_iter(x,r1); \
+    r2; \
+  })
+#define k8div(x,y) (__fpmul(x,k8inv(y)))
 
 // Fused multiply-add, defined as [+-]x*y[+-]z
 #define k8madd(x,y,z) (__fpmadd(z,x,y))
@@ -80,29 +163,44 @@
 #define k8nmsub(x,y,z) (__fpnmsub(z,x,y))
 
 // Cheap functions
-#define k8fabs(x) (__fpabs(x))
-#define k8fmax(x,y) (__fpsel(__fpsub(y,x),x,y))
-#define k8fmin(x,y) (__fpsel(__fpsub(x,y),x,y))
-#define k8fnabs(x) (__fpnabs(x))
-
-#define k8exp(x) \
-({ \
-  CCTK_REAL8_VEC const xexp=(x); \
-  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
-  CCTK_REAL8_VEC const xlog=(x); \
-  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
-  CCTK_REAL8_VEC const xpow=(x); \
-  CCTK_REAL8 const apow=(a); \
-  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
-#define k8sqrt(x) \
-({ \
-  CCTK_REAL8_VEC const xsqrt=(x); \
-  vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \
-})
+#define k8fabs(x) (__fpabs(x))
+#define k8fmax(x_,y_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    __fpsel(k8sub(y,x),x,y); \
+  })
+#define k8fmin(x_,y_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    __fpsel(k8sub(x,y),x,y); \
+  })
+#define k8fnabs(x) (__fpnabs(x))
+
+// Expensive functions
+#define K8REPL(f,x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    vec8_set(f(vec8_elt0(x)), \
+             f(vec8_elt1(x))); \
+  })
+#define K8REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8 const aa=(a_); \
+    CCTK_REAL8 const a=aa; \
+    vec8_set(f(vec8_elt0(x),a), \
+             f(vec8_elt1(x),a)); \
+  })
+
+#define k8exp(x)   K8REPL(exp,x)
+#define k8log(x)   K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
+#define k8sqrt(x)  K8REPL(sqrt,x)
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 34aa24f..4a3f4e2 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -6,6 +6,14 @@
 #include <emmintrin.h>
+#ifdef __SSE4_1__
+// Intel's SSE 4.1
+# include <smmintrin.h>
+#endif
+#ifdef __SSE4A__
+// AMD's SSE 4a
+# include <ammintrin.h>
+#endif
@@ -22,43 +30,17 @@
 #define vec8_set1(a) (_mm_set1_pd(a))
 #define vec8_set(a,b) (_mm_set_pd(b,a)) // note reversed arguments
 
-#if defined(__PGI) && defined (__amd64__)
-// _mm_cvtsd_f64 does not exist on PGI 9 compilers
-# define vec8_elt0(x) \
-({ \
-  CCTK_REAL8 aelt0; \
-  asm ("" : "=x" (aelt0) : "0" (x)); \
-  aelt0; \
-})
-#else
-# define vec8_elt0(x) (_mm_cvtsd_f64(x)) // this is a no-op
-#endif
-#define vec8_elt1(x) \
-({ \
-  CCTK_REAL8_VEC const xelt1=(x); \
-  vec8_elt0(_mm_unpackhi_pd(xelt1,xelt1)); \
-})
-#if defined(__PGI) && defined (__amd64__)
-# define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  if (d==0) aelt=vec8_elt0(xelt); \
-  else if (d==1) aelt=vec8_elt1(xelt); \
-  aelt; \
-})
-#else
-# define vec8_elt(x,d) \
-({ \
-  CCTK_REAL8_VEC const xelt=(x); \
-  CCTK_REAL8 aelt; \
-  switch (d) { \
-  case 0: aelt=vec8_elt0(xelt); break; \
-  case 1: aelt=vec8_elt1(xelt); break; \
-  } \
-  aelt; \
-})
-#endif
+// original order is 01
+#define vec8_swap10(x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    _mm_shuffle_pd(x,x, _MM_SHUFFLE2(0,1)); \
+  })
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
@@ -68,29 +50,96 @@
 // a reference to a scalar
 #define vec8_load(p)  (_mm_load_pd(&(p)))
 #define vec8_loadu(p) (_mm_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# define vec8_load_off1(p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    _mm_shuffle_pd(vec8_load((&p)[-1]), \
+                   vec8_load((&p)[+1]), _MM_SHUFFLE2(0,1)); \
+  })
+#endif
 
 // Load a vector from memory that may or may not be aligned, as
 // decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
 // Implementation: Always use unaligned load
-#define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-#define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+# define vec8_loadu_maybe(off,p)             vec8_loadu(p)
+# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
+#else
+# define vec8_loadu_maybe(off,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+      vec8_load(p) : \
+      vec8_load_off1(p); \
+  })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+#  define vec8_loadu_maybe3(off1,off2,off3,p) \
+   vec8_loadu_maybe(off1,p)
+# else
+#  define vec8_loadu_maybe3(off1,off2,off3,p_) \
+  ({ \
+    CCTK_REAL8 const& pp=(p_); \
+    CCTK_REAL8 const& p=pp; \
+    ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+     (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+      vec8_loadu(p) : \
+      vec8_loadu_maybe(off1,p); \
+  })
+# endif
+#endif
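Under VECTORISE_ALWAYS_USE_ALIGNED_LOADS, an odd-offset double-precision access is served by two aligned loads plus a shuffle that picks the high lane of the lower load and the low lane of the upper one. A runnable model of that vec8_load_off1 trick (the helper name is illustrative; _Alignas requires C11):

    #include <emmintrin.h>
    #include <stdio.h>

    /* Model of vec8_load_off1: p points one double past a 16-byte
       boundary; combine the aligned loads at p-1 and p+1 so that
       lane0 = p[0] and lane1 = p[1]. */
    static __m128d load_off1(const double *p)
    {
      __m128d lo = _mm_load_pd(p - 1);   /* { p[-1], p[0] } */
      __m128d hi = _mm_load_pd(p + 1);   /* { p[+1], p[+2] } */
      return _mm_shuffle_pd(lo, hi, _MM_SHUFFLE2(0, 1));
    }

    int main(void)
    {
      _Alignas(16) double a[4] = {0, 1, 2, 3};
      double out[2];
      _mm_storeu_pd(out, load_off1(&a[1]));
      printf("%g %g\n", out[0], out[1]);   /* prints: 1 2 */
      return 0;
    }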
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
-#define vec8_store(p,x) (_mm_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
-#define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#define vec8_store(p,x)  (_mm_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) vec8_store(p,x)
+#else
+# define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
+#endif
 
-// Store a lower or higher partial vector (aligned and non-temporal);
-// the non-temporal hint is probably ignored
-#if 1
+// Store a lower or higher partial vector (aligned and non-temporal)
+#if ! VECTORISE_STREAMING_STORES
 # define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
 # define vec8_store_nta_partial_hi(p,x,n) (_mm_storeh_pd(&(p)+1,x))
 #else
+# if defined(__SSE4A__)
+#  define vec8_store_nta_partial_lo(p,x,n) (_mm_stream_sd(&(p),x))
+#  define vec8_store_nta_partial_hi(p,x,n) \
+   (_mm_stream_sd(&(p)+1, vec8_swap10(x)))
+# else
+// TODO: use clflush once a whole cache line has been written (cache
+// lines are usually larger than the CPU vector size)
+#  define vec8_store_nta_partial_lo(p_,x,n) \
+  ({ \
+    CCTK_REAL8& pp=(p_); \
+    CCTK_REAL8& p=pp; \
+    _mm_storel_pd(&p,x); \
+    /* _mm_clflush(&p); */ \
+  })
+#  define vec8_store_nta_partial_hi(p_,x,n) \
+  ({ \
+    CCTK_REAL8& pp=(p_); \
+    CCTK_REAL8& p=pp; \
+    _mm_storeh_pd(&p+1,x); \
+    /* _mm_clflush(&p+1); */ \
+  })
+# endif
+#endif
+#if 0
 // This is slower; we would need a non-temporal read
-# define vec8_store_nta_partial_lo(p,x,n) (vec8_store_nta(p,_mm_loadh_pd(x,&(p)+1)))
-# define vec8_store_nta_partial_hi(p,x,n) (vec8_store_nta(p,_mm_loadl_pd(x,&(p))))
+#define vec8_store_nta_partial_lo(p,x,n) \
+  vec8_store_nta(p, _mm_loadh_pd(x,&(p)+1))
+#define vec8_store_nta_partial_hi(p,x,n) \
+  vec8_store_nta(p, _mm_loadl_pd(x,&(p)))
 #endif
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)
@@ -107,6 +156,43 @@ static const union {
 } k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};
 #define k8abs_mask (k8sign_mask_union.v)
 
+// Choice [sign(x)>0 ? y : z]
+#ifdef __SSE4_1__
+# define k8ifthen(x,y,z) (_mm_blendv_pd(y,z,x))
+#elif 0
+# define k8ifthen(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    int const m = _mm_movemask_pd(x); \
+    CCTK_REAL8_VEC r; \
+    switch (m) { \
+    case 0: r = y; break; \
+    case 1: r = _mm_move_sd(y,z); break; \
+    case 2: r = _mm_move_sd(z,y); break; \
+    case 3: r = z; break; \
+    } \
+    r; \
+  })
+#else
+# define k8ifthen(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    CCTK_REAL8_VEC const c = _mm_and_pd(x,k8sign_mask); \
+    vec8_set(not vec8_elt0(c) ? vec8_elt0(y) : vec8_elt0(z), \
+             not vec8_elt1(c) ? vec8_elt1(y) : vec8_elt1(z)); \
+  })
+#endif
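Without SSE 4.1's blendv, a lane select can still be done branchlessly with the classic and/andnot/or blend; the fallback above instead extracts lanes scalar-wise. A sketch of the branchless alternative (standard SSE2 intrinsics; same convention as k8ifthen, lanes with the sign bit of x set take z, others take y):

    #include <emmintrin.h>

    /* Branchless select: build an all-ones mask where x < 0, then
       blend.  (Differs from blendv for -0.0 and NaN in x, which is
       fine when x is the result of a comparison.) */
    static __m128d k8ifthen_sse2(__m128d x, __m128d y, __m128d z)
    {
      __m128d m = _mm_cmplt_pd(x, _mm_setzero_pd());
      return _mm_or_pd(_mm_and_pd(m, z), _mm_andnot_pd(m, y));
    }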
 // Operators
 #define k8pos(x) (x)
 #define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
@@ -130,19 +216,23 @@
 #define k8sqrt(x) (_mm_sqrt_pd(x))
 
 // Expensive functions
-#define k8exp(x) \
-({ \
-  CCTK_REAL8_VEC const xexp=(x); \
-  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
-  CCTK_REAL8_VEC const xlog=(x); \
-  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
-  CCTK_REAL8_VEC const xpow=(x); \
-  CCTK_REAL8 const apow=(a); \
-  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
+#define K8REPL(f,x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    vec8_set(f(vec8_elt0(x)), \
+             f(vec8_elt1(x))); \
+  })
+#define K8REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8 const aa=(a_); \
+    CCTK_REAL8 const a=aa; \
+    vec8_set(f(vec8_elt0(x),a), \
+             f(vec8_elt1(x),a)); \
+  })
+
+#define k8exp(x)   K8REPL(exp,x)
+#define k8log(x)   K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h
index 9d7c17c..f47d1df 100644
--- a/src/vectors-8-VSX.h
+++ b/src/vectors-8-VSX.h
@@ -21,12 +21,12 @@
 #define vec8_set1(a) (vec_splats(a))
 #define vec8_set(a,b) \
-({ \
-  CCTK_REAL8_VEC x; \
-  x[0]=(a); \
-  x[1]=(b); \
-  x; \
-})
+  ({ \
+    CCTK_REAL8_VEC x; \
+    x[0]=(a); \
+    x[1]=(b); \
+    x; \
+  })
 
 #define vec8_elt0(x) ((x)[0])
 #define vec8_elt1(x) ((x)[1])
@@ -48,19 +48,16 @@
 // Store a vector to memory (aligned and non-temporal); this stores to
 // a reference to a scalar
-#define vec8_store(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
-#define vec8_storeu(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
-#if 1
-# define vec8_store_nta(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
-#else
+#define vec8_store(p,x)  (*(CCTK_REAL8_VEC*)&(p)=(x))
+#define vec8_storeu(p,x) (*(CCTK_REAL8_VEC*)&(p)=(x))
 // stvxl instruction doesn't exist for double precision
-# define vec8_store_nta(p,x) (vec_stl(x,0,(CCTK_REAL8_VEC*)&(p)))
-#endif
+#define vec8_store_nta(p,x) vec8_store(p,x)
 
 // Store a lower or higher partial vector (aligned and non-temporal);
 // the non-temporal hint is probably ignored
 #define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=(x)[0])
 #define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=(x)[1])
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) (assert(0))
@@ -87,24 +84,25 @@
 #define k8fmin(x,y) (vec_min(x,y))
 #define k8fnabs(x) (vec_nabs(x))
 
-#define k8exp(x) \
-({ \
-  CCTK_REAL8_VEC const xexp=(x); \
-  vec8_set(exp(vec8_elt0(xexp)), exp(vec8_elt1(xexp))); \
-})
-#define k8log(x) \
-({ \
-  CCTK_REAL8_VEC const xlog=(x); \
-  vec8_set(log(vec8_elt0(xlog)), log(vec8_elt1(xlog))); \
-})
-#define k8pow(x,a) \
-({ \
-  CCTK_REAL8_VEC const xpow=(x); \
-  CCTK_REAL8 const apow=(a); \
-  vec8_set(pow(vec8_elt0(xpow),apow), pow(vec8_elt1(xpow),apow)); \
-})
-#define k8sqrt(x) \
-({ \
-  CCTK_REAL8_VEC const xsqrt=(x); \
-  vec8_set(sqrt(vec8_elt0(xsqrt)), sqrt(vec8_elt1(xsqrt))); \
-})
+// Expensive functions
+#define K8REPL(f,x_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    vec8_set(f(vec8_elt0(x)), \
+             f(vec8_elt1(x))); \
+  })
+#define K8REPL2(f,x_,a_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8 const aa=(a_); \
+    CCTK_REAL8 const a=aa; \
+    vec8_set(f(vec8_elt0(x),a), \
+             f(vec8_elt1(x),a)); \
+  })
+
+#define k8exp(x)   K8REPL(exp,x)
+#define k8log(x)   K8REPL(log,x)
+#define k8pow(x,a) K8REPL2(pow,x,a)
+#define k8sqrt(x)  K8REPL(sqrt,x)
diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h
index 8ea3ac8..bbe0150 100644
--- a/src/vectors-8-default.h
+++ b/src/vectors-8-default.h
@@ -50,6 +50,7 @@
 // the vector elements into memory locations as if element 0 were
 // stored at p.
 #define vec8_store_nta_partial_hi(p,x,n) (assert(0))
+#define vec8_store_nta_partial_mid(p,x,nlo,nhi) (assert(0))
diff --git a/src/vectors.h b/src/vectors.h
index a3cad46..03296e7 100644
--- a/src/vectors.h
+++ b/src/vectors.h
@@ -5,22 +5,19 @@
-#undef EMULATE_AVX
-
-
-
-#if defined(KRANC_VECTORS)
+#if VECTORISE
+/* TODO: support AVX */
 # if defined(__SSE__)           // Intel SSE
 #  include "vectors-4-SSE.h"
 # elif defined(__ALTIVEC__)     // Power Altivec
 #  include "vectors-4-Altivec.h"
 # endif
-# if defined(__AVX__)           // Intel AVX
+# if defined(__AVX__)           // Intel AVX
 #  include "vectors-8-AVX.h"
-# elif defined(__SSE2__)        // Intel SSE2
-#  if defined(EMULATE_AVX)
+# elif defined(__SSE2__)        // Intel SSE2
+#  if VECTORISE_EMULATE_AVX
 #   include "vectors-8-AVX.h"
 #  else
 #   include "vectors-8-SSE2.h"
@@ -56,14 +53,15 @@
 # define vec_elt0 vec4_elt0
 # define vec_elt  vec4_elt
 
-# define vec_load  vec4_load
-# define vec_loadu vec4_loadu
-# define vec_loadu_maybe vec4_loadu_maybe
-# define vec_loadu_maybe3 vec4_loadu_maybe3
-# define vec_store vec4_store
-# define vec_store_nta vec4_store_nta
-# define vec_store_nta_partial_lo vec4_store_nta_partial_lo
-# define vec_store_nta_partial_hi vec4_store_nta_partial_hi
+# define vec_load                  vec4_load
+# define vec_loadu                 vec4_loadu
+# define vec_loadu_maybe           vec4_loadu_maybe
+# define vec_loadu_maybe3          vec4_loadu_maybe3
+# define vec_store                 vec4_store
+# define vec_store_nta             vec4_store_nta
+# define vec_store_nta_partial_lo  vec4_store_nta_partial_lo
+# define vec_store_nta_partial_hi  vec4_store_nta_partial_hi
+# define vec_store_nta_partial_mid vec4_store_nta_partial_mid
 
 # define kpos k4pos
 # define kneg k4neg
@@ -98,14 +96,15 @@
 # define vec_elt0 vec8_elt0
 # define vec_elt  vec8_elt
 
-# define vec_load  vec8_load
-# define vec_loadu vec8_loadu
-# define vec_loadu_maybe vec8_loadu_maybe
-# define vec_loadu_maybe3 vec8_loadu_maybe3
-# define vec_store vec8_store
-# define vec_store_nta vec8_store_nta
-# define vec_store_nta_partial_lo vec8_store_nta_partial_lo
-# define vec_store_nta_partial_hi vec8_store_nta_partial_hi
+# define vec_load                  vec8_load
+# define vec_loadu                 vec8_loadu
+# define vec_loadu_maybe           vec8_loadu_maybe
+# define vec_loadu_maybe3          vec8_loadu_maybe3
+# define vec_store                 vec8_store
+# define vec_store_nta             vec8_store_nta
+# define vec_store_nta_partial_lo  vec8_store_nta_partial_lo
+# define vec_store_nta_partial_hi  vec8_store_nta_partial_hi
+# define vec_store_nta_partial_mid vec8_store_nta_partial_mid
 
 # define kpos k8pos
 # define kneg k8neg
@@ -285,6 +284,11 @@ struct vecprops<CCTK_REAL8> {
 
 // For Kranc
 
+#undef KRANC_DIFF_FUNCTIONS
+#if ! VECTORISE_INLINE
+# define KRANC_DIFF_FUNCTIONS
+#endif
+
 #undef ToReal
 #define ToReal(x) (vec_set1((CCTK_REAL)(x)))
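Finally, a sketch of how a Kranc-generated kernel is meant to use the width-agnostic vec_*/k* layer that vectors.h assembles from the per-width headers. It assumes this thorn's header is in scope; CCTK_REAL, CCTK_REAL_VEC, vec_set1, vec_load, and vec_store_nta appear in the diff, while CCTK_REAL_VEC_SIZE and kmul are assumed to be provided by the same mapping (mirroring CCTK_REAL4/8_VEC_SIZE and k4mul/k8mul):

    #include "vectors.h"   /* this thorn's header; pulls in the cctk types */

    /* Multiply a grid function by a scalar over [imin,imax), assuming
       the bounds are already rounded to multiples of the vector size;
       ragged edges would use the vec_store_nta_partial_* macros. */
    static void scale(CCTK_REAL *restrict out, CCTK_REAL const *restrict in,
                      CCTK_REAL const a, int const imin, int const imax)
    {
      CCTK_REAL_VEC const va = vec_set1(a);
      for (int i = imin; i < imax; i += CCTK_REAL_VEC_SIZE) {
        vec_store_nta(out[i], kmul(va, vec_load(in[i])));
      }
    }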