Diffstat (limited to 'src/macros/vectors-8-AVX.h')
-rw-r--r--  src/macros/vectors-8-AVX.h  325
1 files changed, 325 insertions, 0 deletions
diff --git a/src/macros/vectors-8-AVX.h b/src/macros/vectors-8-AVX.h
new file mode 100644
index 0000000..6882523
--- /dev/null
+++ b/src/macros/vectors-8-AVX.h
@@ -0,0 +1,325 @@
+// Vectorise using Intel's or AMD's AVX
+
+// Use the type __m256d directly, without introducing a wrapper class
+// Use macros instead of inline functions
+
+
+
+#if VECTORISE_EMULATE_AVX
+# include "avxintrin_emu.h"
+#else
+# include <immintrin.h>
+#endif
+#ifdef __FMA4__
+# include <fma4intrin.h>
+#endif
+
+
+
+#ifdef __FMA4__
+# define vec8_architecture_FMA4 "+FMA4"
+#else
+# define vec8_architecture_FMA4 ""
+#endif
+#define vec8_architecture "AVX" vec8_architecture_FMA4 " (64-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+#define CCTK_REAL8_VEC __m256d
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL8_VEC_SIZE 4
+
+// Integer and boolean types corresponding to this real type
+#define CCTK_INTEGER8 CCTK_REAL8
+#define CCTK_BOOLEAN8 CCTK_REAL8
+#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
+#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
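+// Note (explanatory, not part of the interface): a CCTK_REAL8_VEC
+// holds 4 doubles in one 256-bit ymm register.  Integers and booleans
+// reuse the same register type; boolean "true" is the all-ones bit
+// pattern and "false" is all zeros (see k8ltrue/k8lfalse below), so
+// the AVX blend/maskstore instructions, which test the sign bit of
+// each element, handle them correctly.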
+
+
+
+union k8const_t {
+ unsigned long long i[4];
+ double f[4];
+ __m256i vi;
+ __m256d vf;
+};
+
+#define K8_ZERO 0x0000000000000000ULL
+#define K8_NOTZERO 0xffffffffffffffffULL
+#define K8_IMIN 0x8000000000000000ULL
+#define K8_IMAX 0x7fffffffffffffffULL
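+// For illustration only: k8const_t lets a 256-bit constant be written
+// as four 64-bit integer bit patterns and then be used either as an
+// integer vector (.vi) or as a floating-point vector (.vf).  A
+// hypothetical example (not used below):
+//   static const k8const_t k8allbits = {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }};
+//   __m256d m = k8allbits.vf;   // same bits, viewed as doubles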
+
+
+
+// Create vectors, extract vector elements
+
+#define vec8_set1(a) (_mm256_set1_pd(a))
+#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments
+
+#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
+#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
+#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2])
+#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3])
+#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
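+// Usage sketch (illustrative only; v and y are placeholder names):
+//   CCTK_REAL8_VEC v = vec8_set(1.0, 2.0, 3.0, 4.0);  // element 0 is 1.0
+//   CCTK_REAL8     y = vec8_elt2(v);                  // extracts 3.0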
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+#define vec8_load(p) (_mm256_load_pd(&(p)))
+#define vec8_loadu(p) (_mm256_loadu_pd(&(p)))
+#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# define vec8_load_off1(p) vec8_loadu(p)
+#else
+# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
+#endif
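+// Usage sketch (illustrative only; a and i are placeholder names):
+//   CCTK_REAL8_VEC v = vec8_load (a[i]);  // &a[i] must be 32-byte aligned
+//   CCTK_REAL8_VEC w = vec8_loadu(a[i]);  // no alignment requirement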
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+# define vec8_loadu_maybe(off,p) (vec8_loadu(p))
+# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+#else
+# define vec8_loadu_maybe(off,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
+ vec8_load(p) : \
+ vec8_load_off1(p); \
+ })
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume that the x extent of all arrays is a multiple of the vector size
+# define vec8_loadu_maybe3(off1,off2,off3,p) \
+ vec8_loadu_maybe(off1,p)
+# else
+# define vec8_loadu_maybe3(off1,off2,off3,p_) \
+ ({ \
+ CCTK_REAL8 const& p__=(p_); \
+ CCTK_REAL8 const& p=p__; \
+ ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
+ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
+ vec8_loadu(p) : \
+ vec8_loadu_maybe(off1,p); \
+ })
+# endif
+#endif
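+// Usage sketch (illustrative only): in a stencil access, the
+// compile-time offsets in the three directions decide whether the
+// aligned load may be used; off1 is the offset in the contiguous (x)
+// direction.  With hypothetical names u, ijk, di, dj, dk, ni, nj:
+//   CCTK_REAL8_VEC v =
+//     vec8_loadu_maybe3(di, dj, dk, u[ijk + di + ni*(dj + nj*dk)]);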
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+#define vec8_store(p,x) (_mm256_store_pd(&(p),x))
+#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x))
+#if ! VECTORISE_STREAMING_STORES
+# define vec8_store_nta(p,x) (vec8_store(p,x))
+#else
+# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
+#endif
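+// Usage sketch (illustrative only; a, i, x are placeholder names):
+//   vec8_store    (a[i], x);  // aligned store
+//   vec8_store_nta(a[i], x);  // aligned store, streaming if enabled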
+
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin_,imax_) \
+ bool v8stp_all; \
+ __m256i v8stp_mask; \
+ ({ \
+ ptrdiff_t const imin__=(imin_); \
+ ptrdiff_t const imin=imin__; \
+ ptrdiff_t const imax__=(imax_); \
+ ptrdiff_t const imax=imax__; \
+ \
+    v8stp_all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1<imax; \
+ \
+ if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ /* \
+ __m256i const v8stp_mask = \
+ _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \
+ vec_index), \
+ _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \
+ vec_index)); \
+ */ \
+ __m128i const termlo0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0)); \
+ __m128i const termup0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0)); \
+ __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
+ __m128i const termlo1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2)); \
+ __m128i const termup1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2)); \
+ __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
+ v8stp_mask = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
+ } \
+ })
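+// (Explanatory note: element j of the vector, j=0..3, must be stored
+// iff imin <= i+j < imax.  The andnot of (i-imin+j) and (i-imax+j)
+// has its sign bit set exactly in that case, and _mm256_maskstore_pd
+// examines only the sign bit of each 64-bit mask element.)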
+
+#define vec8_store_nta_partial(p,x) \
+ ({ \
+ if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ vec8_store_nta(p,x); \
+ } else { \
+      _mm256_maskstore_pd(&(p),v8stp_mask,x); \
+ } \
+ })
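+// Usage sketch (illustrative only; a, i, x, imin, imax are placeholder
+// names), storing only the in-range elements at a loop boundary:
+//   vec8_store_partial_prepare(i, imin, imax);
+//   vec8_store_nta_partial(a[i], x);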
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+// Masks indicating which vector elements should be stored:
+static const k8const_t k8store_lo[5] =
+ {
+ {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
+ {{ K8_NOTZERO, K8_ZERO , K8_ZERO , K8_ZERO , }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_ZERO , K8_ZERO , }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_ZERO , }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ };
+static const k8const_t k8store_hi[5] =
+ {
+ {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
+ {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_NOTZERO, }},
+ {{ K8_ZERO , K8_ZERO , K8_NOTZERO, K8_NOTZERO, }},
+ {{ K8_ZERO , K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ };
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+// gcc 4.4 declares _mm256_maskstore_pd with an incorrect prototype
+// (the mask argument is __m256d instead of __m256i)
+# define vec8_store_nta_partial_lo(p,x,n) \
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo[n].vi),x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi[n].vi),x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), \
+ x))
+#else
+# define vec8_store_nta_partial_lo(p,x,n) \
+ (_mm256_maskstore_pd(&(p),k8store_lo[n].vi,x))
+# define vec8_store_nta_partial_hi(p,x,n) \
+ (_mm256_maskstore_pd(&(p),k8store_hi[n].vi,x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, \
+ k8store_hi[nhi].vf)), \
+ x))
+#endif
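+// Usage sketch (illustrative only; a, i, x are placeholder names):
+//   vec8_store_nta_partial_lo (a[i], x, 2);     // stores elements 0,1
+//   vec8_store_nta_partial_hi (a[i], x, 1);     // stores element 3
+//   vec8_store_nta_partial_mid(a[i], x, 3, 2);  // stores only element 2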
+
+
+
+// Functions and operators
+
+static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
+
+// Operators
+#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask.vf))
+
+#define k8add(x,y) (_mm256_add_pd(x,y))
+#define k8sub(x,y) (_mm256_sub_pd(x,y))
+#define k8mul(x,y) (_mm256_mul_pd(x,y))
+#define k8div(x,y) (_mm256_div_pd(x,y))
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#ifdef __FMA4__
+# define k8madd(x,y,z) (_mm256_macc_pd(x,y,z))
+# define k8msub(x,y,z) (_mm256_msub_pd(x,y,z))
+# define k8nmadd(x,y,z) (_mm256_nmsub_pd(x,y,z))
+# define k8nmsub(x,y,z) (_mm256_nmacc_pd(x,y,z))
+#else
+# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
+# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
+# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
+# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+#endif
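+// (Explanatory note: k8madd(x,y,z) = x*y+z, k8msub(x,y,z) = x*y-z,
+// k8nmadd(x,y,z) = -x*y-z, k8nmsub(x,y,z) = -x*y+z.  Without FMA4 the
+// fallbacks use a separate multiply and add and hence round twice.)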
+
+// Cheap functions
+#define k8copysign(x,y) \
+ (_mm256_or_pd(_mm256_andnot_pd(k8sign_mask.vf,x), \
+ _mm256_and_pd(k8sign_mask.vf,y)))
+#define k8fabs(x) (_mm256_andnot_pd(k8sign_mask.vf,x))
+#define k8fmax(x,y) (_mm256_max_pd(x,y))
+#define k8fmin(x,y) (_mm256_min_pd(x,y))
+#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask.vf))
+static const k8const_t k8zero = { .f = { 0.0, 0.0, 0.0, 0.0, }};
+static const k8const_t k8one  = { .f = { 1.0, 1.0, 1.0, 1.0, }};
+#define k8sgn(x_) \
+  ({ \
+    CCTK_REAL8_VEC const x__=(x_); \
+    CCTK_REAL8_VEC const x=x__; \
+    CCTK_REAL8_VEC const iszero = _mm256_cmp_pd(x, k8zero.vf, _CMP_EQ_OQ); \
+    CCTK_REAL8_VEC const sign = _mm256_and_pd(k8sign_mask.vf, x); \
+    CCTK_REAL8_VEC const signedone = _mm256_or_pd(sign, k8one.vf); \
+    k8ifthen(iszero, k8zero.vf, signedone); \
+  })
+#define k8sqrt(x) (_mm256_sqrt_pd(x))
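+// Usage sketch (illustrative only; x and y are placeholder names):
+//   k8copysign(x,y)  // magnitude of x, sign of y, per element
+//   k8fnabs(x)       // -|x| per element
+//   k8sgn(x)         // -1.0, 0.0, or +1.0 per element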
+
+// Expensive functions
+#define K8REPL(f,x_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const x=x__; \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)), \
+ f(vec8_elt2(x)), \
+ f(vec8_elt3(x))); \
+ })
+#define K8REPL2S(f,x_,a_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8 const a__=(a_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8 const a=a__; \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a), \
+ f(vec8_elt2(x),a), \
+ f(vec8_elt3(x),a)); \
+ })
+#define K8REPL2(f,x_,y_) \
+ ({ \
+ CCTK_REAL8_VEC const x__=(x_); \
+ CCTK_REAL8_VEC const y__=(y_); \
+ CCTK_REAL8_VEC const x=x__; \
+ CCTK_REAL8_VEC const y=y__; \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)), \
+ f(vec8_elt2(x),vec8_elt2(y)), \
+ f(vec8_elt3(x),vec8_elt3(y))); \
+ })
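+// (Explanatory note: these helpers apply the scalar function f to each
+// vector element and re-assemble the results with vec8_set; e.g.
+// k8exp(x) below expands to four calls to the scalar exp().)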
+
+#define k8acos(x) K8REPL(acos,x)
+#define k8acosh(x) K8REPL(acosh,x)
+#define k8asin(x) K8REPL(asin,x)
+#define k8asinh(x) K8REPL(asinh,x)
+#define k8atan(x) K8REPL(atan,x)
+#define k8atan2(x,y) K8REPL2(atan2,x,y)
+#define k8atanh(x) K8REPL(atanh,x)
+#define k8cos(x) K8REPL(cos,x)
+#define k8cosh(x) K8REPL(cosh,x)
+#define k8exp(x) K8REPL(exp,x)
+#define k8log(x) K8REPL(log,x)
+#define k8pow(x,a) K8REPL2S(pow,x,a)
+#define k8sin(x) K8REPL(sin,x)
+#define k8sinh(x) K8REPL(sinh,x)
+#define k8tan(x) K8REPL(tan,x)
+#define k8tanh(x) K8REPL(tanh,x)
+
+static const k8const_t k8lfalse_ =
+ {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }};
+static const k8const_t k8ltrue_ =
+ {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }};
+#define k8lfalse (k8lfalse_.vf)
+#define k8ltrue (k8ltrue_.vf)
+#define k8lnot(x) (_mm256_xor_pd(k8ltrue,x))
+#define k8land(x,y) (_mm256_and_pd(x,y))
+#define k8lor(x,y) (_mm256_or_pd(x,y))
+#define k8lxor(x,y) (_mm256_xor_pd(x,y))
+#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x))
+
+#define k8cmpeq(x,y) (_mm256_cmp_pd(x,y,_CMP_EQ_OQ))
+#define k8cmpne(x,y) (_mm256_cmp_pd(x,y,_CMP_NEQ_OQ))
+#define k8cmpgt(x,y) (_mm256_cmp_pd(x,y,_CMP_GT_OQ))
+#define k8cmpge(x,y) (_mm256_cmp_pd(x,y,_CMP_GE_OQ))
+#define k8cmplt(x,y) (_mm256_cmp_pd(x,y,_CMP_LT_OQ))
+#define k8cmple(x,y) (_mm256_cmp_pd(x,y,_CMP_LE_OQ))
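+// Usage sketch (illustrative only; x is a placeholder name): a
+// branch-free clamp of negative values to zero:
+//   CCTK_REAL8_VEC mask = k8cmplt(x, vec8_set1(0.0));
+//   CCTK_REAL8_VEC y    = k8ifthen(mask, vec8_set1(0.0), x);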