about summary refs log tree commit diff
path: root/src/vectors-8-AVX.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/vectors-8-AVX.h')
-rw-r--r--  src/vectors-8-AVX.h  752
1 file changed, 521 insertions, 231 deletions
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 6882523..ce43542 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -1,17 +1,16 @@
// Vectorise using Intel's or AMD's AVX
// Use the type __m256d directly, without introducing a wrapper class
-// Use macros instead of inline functions
-#if VECTORISE_EMULATE_AVX
-# include "avxintrin_emu.h"
-#else
-# include <immintrin.h>
-#endif
+#include <cstdlib>
+
+
+
+#include <immintrin.h>
#ifdef __FMA4__
-# include <fma4intrin.h>
+# include <x86intrin.h>
#endif
@@ -26,43 +25,80 @@
// Vector type corresponding to CCTK_REAL
-#define CCTK_REAL8_VEC __m256d
+typedef __m256d CCTK_REAL8_VEC;
+typedef __m256i CCTK_INTEGER8_VEC;
+typedef __m256d CCTK_BOOLEAN8_VEC;
// Number of vector elements in a CCTK_REAL_VEC
#define CCTK_REAL8_VEC_SIZE 4
+vec_static_assert(sizeof(CCTK_REAL8_VEC) ==
+ sizeof(CCTK_REAL8) * CCTK_REAL8_VEC_SIZE);
+
// Integer and boolean types corresponding to this real type
-#define CCTK_INTEGER8 CCTK_REAL8
-#define CCTK_BOOLEAN8 CCTK_REAL8
-#define CCTK_INTEGER8_VEC CCTK_REAL8_VEC
-#define CCTK_BOOLEAN8_VEC CCTK_REAL8_VEC
+typedef CCTK_INT8 CCTK_INTEGER8;
+typedef CCTK_REAL8 CCTK_BOOLEAN8;
union k8const_t {
- unsigned long long i[4];
- double f[4];
- __m256i vi;
- __m256d vf;
+ CCTK_INTEGER8 i[CCTK_REAL8_VEC_SIZE];
+ CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE];
+ CCTK_INTEGER8_VEC vi;
+ CCTK_REAL8_VEC vf;
};
-#define K8_ZERO 0x0000000000000000ULL
-#define K8_NOTZERO 0xffffffffffffffffULL
-#define K8_IMIN 0x8000000000000000ULL
-#define K8_IMAX 0x7fffffffffffffffULL
+#define k8sign (vec8_set1i( (CCTK_INTEGER8)(1ULL << 63ULL)))
+#define k8notsign (vec8_set1i(~ (CCTK_INTEGER8)(1ULL << 63ULL)))
// Create vectors, extract vector elements
-#define vec8_set1(a) (_mm256_set1_pd(a))
-#define vec8_set(a,b,c,d) (_mm256_set_pd(d,c,b,a)) // note reversed arguments
-
-#define vec8_elt0(x) (((CCTK_REAL8 const*)&(x))[0])
-#define vec8_elt1(x) (((CCTK_REAL8 const*)&(x))[1])
-#define vec8_elt2(x) (((CCTK_REAL8 const*)&(x))[2])
-#define vec8_elt3(x) (((CCTK_REAL8 const*)&(x))[3])
-#define vec8_elt(x,d) (((CCTK_REAL8 const*)&(x))[d])
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1(CCTK_REAL8 const a)
+{
+ return _mm256_set1_pd(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set1i(CCTK_INT8 const a)
+{
+ return _mm256_castsi256_pd(_mm256_set1_epi64x(a));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_set(CCTK_REAL8 const a,
+ CCTK_REAL8 const b,
+ CCTK_REAL8 const c,
+ CCTK_REAL8 const d)
+{
+ return _mm256_set_pd(d,c,b,a); // note reversed arguments
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt0(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[0];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt1(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[1];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt2(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[2];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt3(CCTK_REAL8_VEC const x)
+{
+ return ((CCTK_REAL8 const*)&x)[3];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8 vec8_elt(CCTK_REAL8_VEC const x, std::ptrdiff_t const d)
+{
+ return ((CCTK_REAL8 const*)&x)[d];
+}
@@ -70,11 +106,17 @@ union k8const_t {
// Load a vector from memory (aligned and unaligned); this loads from
// a reference to a scalar
-#define vec8_load(p) (_mm256_load_pd(&(p)))
-#define vec8_loadu(p) (_mm256_loadu_pd(&(p)))
-#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
-# define vec8_load_off1(p) vec_loadu(p)
-#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_load(CCTK_REAL8 const& p)
+{
+ return _mm256_load_pd(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu(CCTK_REAL8 const& p)
+{
+ return _mm256_loadu_pd(&p);
+}
+#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS
# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
#endif
@@ -82,244 +124,492 @@ union k8const_t {
// decided by the offset off and the vector size
#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
-# define vec8_loadu_maybe(off,p) (vec8_loadu(p))
-# define vec8_loadu_maybe3(off1,off2,off3,p) (vec8_loadu(p))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu(p);
+}
#else
-# define vec8_loadu_maybe(off,p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- (off) % CCTK_REAL8_VEC_SIZE == 0 ? \
- vec8_load(p) : \
- vec8_load_off1(p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL8 const& p)
+{
+ return off % CCTK_REAL8_VEC_SIZE == 0 ? vec8_load(p) : vec8_loadu(p);
+}
# if VECTORISE_ALIGNED_ARRAYS
// Assume all array x sizes are multiples of the vector size
-# define vec8_loadu_maybe3(off1,off2,off3,p) \
- vec8_loadu_maybe(off1,p)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return vec8_loadu_maybe(off1, p);
+}
# else
-# define vec8_loadu_maybe3(off1,off2,off3,p_) \
- ({ \
- CCTK_REAL8 const& p__=(p_); \
- CCTK_REAL8 const& p=p__; \
- ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \
- (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \
- vec8_loadu(p) : \
- vec8_loadu_maybe(off1,p); \
- })
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC vec8_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL8 const& p)
+{
+ return
+ off2 % CCTK_REAL8_VEC_SIZE != 0 or
+ off3 % CCTK_REAL8_VEC_SIZE != 0 ?
+ vec8_loadu(p) :
+ vec8_loadu_maybe(off1, p);
+}
# endif
#endif
// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
-#define vec8_store(p,x) (_mm256_store_pd(&(p),x))
-#define vec8_storeu(p,x) (_mm256_storeu_pd(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm256_store_pd(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_storeu(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm256_storeu_pd(&p, x);
+}
#if ! VECTORISE_STREAMING_STORES
-# define vec8_store_nta(p,x) (vec8_store(p,x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ vec8_store(p, x);
+}
#else
-# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta(CCTK_REAL8& p, CCTK_REAL8_VEC const x)
+{
+ _mm256_stream_pd(&p, x);
+}
#endif
// Store a partial vector (aligned and non-temporal)
-#define vec8_store_partial_prepare(i,imin_,imax_) \
+#define vec8_store_partial_prepare(i, imin,imax) \
bool v8stp_all; \
__m256i v8stp_mask; \
- ({ \
- ptrdiff_t const imin__=(imin_); \
- ptrdiff_t const imin=imin__; \
- ptrdiff_t const imax__=(imax_); \
- ptrdiff_t const imax=imax__; \
- \
- v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1<imax; \
- \
- if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
- /* \
- __m256i const v8stp_mask = \
- _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \
- vec_index), \
- _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \
- vec_index)); \
- */ \
- __m128i const termlo0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0)); \
- __m128i const termup0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0)); \
- __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
- __m128i const termlo1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2)); \
- __m128i const termup1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2)); \
- __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
- v8stp_mask = \
- _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
- } \
- })
-
-#define vec8_store_nta_partial(p,x) \
- ({ \
- if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
- vec8_store_nta(p,x); \
- } else { \
- _mm256_maskstore_pd(&p,v8stp_mask,x); \
- } \
- })
+ vec8_store_partial_prepare_(v8stp_all, v8stp_mask, i, imin, imax)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_partial_prepare_(bool& all, __m256i& mask,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ all = i>=imin and i+CCTK_REAL8_VEC_SIZE-1<imax;
+
+ if (not CCTK_BUILTIN_EXPECT(all, true)) {
+ /* __m256i const v8stp_mask = */
+ /* _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), */
+ /* vec_index), */
+ /* _mm256_add_epi64(_mm256_set1_epi64x(i-imax), */
+ /* vec_index)); */
+ __m128i const termlo01 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0));
+ __m128i const termup01 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0));
+ __m128i const term01 = _mm_andnot_si128(termlo01, termup01);
+ __m128i const termlo23 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2));
+ __m128i const termup23 =
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2));
+ __m128i const term23 = _mm_andnot_si128(termlo23, termup23);
+ mask = _mm256_insertf128_si256(_mm256_castsi128_si256(term01), term23, 1);
+ }
+}
+
+#define vec8_store_nta_partial(p, x) \
+ vec8_store_nta_partial_(v8stp_all, v8stp_mask, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_(bool const all, __m256i const mask,
+ CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(all, true)) {
+ vec8_store_nta(p, x);
+ } else {
+ _mm256_maskstore_pd(&p, mask, x);
+ }
+}
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
// Masks indicating which vector element should be stored:
-static const k8const_t k8store_lo[5] =
+/*static*/ k8const_t const k8store_lo[5] =
{
- {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
- {{ K8_NOTZERO, K8_ZERO , K8_ZERO , K8_ZERO , }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_ZERO , K8_ZERO , }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_ZERO , }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ { i: { 0, 0, 0, 0, }},
+ { i: { ~0, 0, 0, 0, }},
+ { i: { ~0, ~0, 0, 0, }},
+ { i: { ~0, ~0, ~0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, }},
};
-static const k8const_t k8store_hi[5] =
+/*static*/ k8const_t const k8store_hi[5] =
{
- {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_ZERO , }},
- {{ K8_ZERO , K8_ZERO , K8_ZERO , K8_NOTZERO, }},
- {{ K8_ZERO , K8_ZERO , K8_NOTZERO, K8_NOTZERO, }},
- {{ K8_ZERO , K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }},
+ { i: { 0, 0, 0, 0, }},
+ { i: { 0, 0, 0, ~0, }},
+ { i: { 0, 0, ~0, ~0, }},
+ { i: { 0, ~0, ~0, ~0, }},
+ { i: { ~0, ~0, ~0, ~0, }},
};
#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
-# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo[n].vi),x))
-# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi[n].vi),x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), \
- x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, _mm256_castsi256_pd(k8store_lo[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, _mm256_castsi256_pd(k8store_hi[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_mid(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_pd
+ (&p,
+ _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi),
+ x);
+}
#else
-# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_lo[n].vi,x))
-# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_hi[n].vi,x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, \
- k8store_hi[nhi].vf)), \
- x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_lo(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, k8store_lo[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_hi(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_pd(&p, k8store_hi[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec8_store_nta_partial_mid(CCTK_REAL8& p,
+ CCTK_REAL8_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_pd
+ (&p,
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, k8store_hi[nhi].vf)),
+ x);
+}
#endif
// Functions and operators
-static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
-
// Operators
-#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask.vf))
-
-#define k8add(x,y) (_mm256_add_pd(x,y))
-#define k8sub(x,y) (_mm256_sub_pd(x,y))
-#define k8mul(x,y) (_mm256_mul_pd(x,y))
-#define k8div(x,y) (_mm256_div_pd(x,y))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8neg(CCTK_REAL8_VEC const x)
+{
+ return _mm256_xor_pd(k8sign, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8add(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_add_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sub(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_sub_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8mul(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_mul_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8div(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_div_pd(x, y);
+}
// Fused multiply-add, defined as [+-]x*y[+-]z
#ifdef __FMA4__
-# define k8madd(x,y,z) (_mm256_macc_pd(x,y,z))
-# define k8msub(x,y,z) (_mm256_msub_pd(x,y,z))
-# define k8nmadd(x,y,z) (_mm256_nmsub_pd(x,y,z))
-# define k8nmsub(x,y,z) (_mm256_nmacc_pd(x,y,z))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_macc_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_msub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_nmsub_pd(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_nmacc_pd(x, y, z);
+}
#else
-# define k8madd(x,y,z) (k8add(k8mul(x,y),z))
-# define k8msub(x,y,z) (k8sub(k8mul(x,y),z))
-# define k8nmadd(x,y,z) (k8sub(k8neg(z),k8mul(x,y)))
-# define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8madd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8add(k8mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8msub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(k8mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmadd(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(k8neg(z), k8mul(x, y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8nmsub(CCTK_REAL8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return k8sub(z, k8mul(x, y));
+}
#endif
// Cheap functions
-#define k8copysign(x,y) \
- (_mm256_or_pd(_mm256_andnot_pd(k8sign_mask.vf,x), \
- _mm256_and_pd(k8sign_mask.vf,y)))
-#define k8fabs(x) (_mm256_andnot_pd(k8sign_mask.vf,x))
-#define k8fmax(x,y) (_mm256_max_pd(x,y))
-#define k8fmin(x,y) (_mm256_min_pd(x,y))
-#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask.vf))
-static const k8const_t k8zero = { f: { 0.0, 0.0, 0.0, 0.0, }};
-static const k8const_t k8one = { f: { 1.0, 1.0, 1.0, 1.0, }};
-#define k8sgn(x_) \
- ({ \
- CCTK_REAL_VEC x__=(x_); \
- CCTK_REAL_VEC x=x__; \
- CCTK_REAL_VEC iszero = _mm256_cmp_pd(x, k8zero.vf, _CMP_EQ_OQ); \
- CCTK_REAL_VEC sign = _mm256_and_pd(k8sign_mask.vf, x); \
- CCTK_REAL_VEC signedone = _mm256_or_pd(sign, k8one.vf); \
- k8ifthen(iszero, k8zero.vf, signedone); \
- })
-#define k8sqrt(x) (_mm256_sqrt_pd(x))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8copysign(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_or_pd(_mm256_and_pd(k8notsign, x),
+ _mm256_and_pd(k8sign , y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fabs(CCTK_REAL8_VEC const x)
+{
+ return _mm256_and_pd(k8notsign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmax(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_max_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fmin(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_min_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8fnabs(CCTK_REAL8_VEC const x)
+{
+ return _mm256_or_pd(k8sign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sqrt(CCTK_REAL8_VEC const x)
+{
+ return _mm256_sqrt_pd(x);
+}
// Expensive functions
-#define K8REPL(f,x_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const x=x__; \
- vec8_set(f(vec8_elt0(x)), \
- f(vec8_elt1(x)), \
- f(vec8_elt2(x)), \
- f(vec8_elt3(x))); \
- })
-#define K8REPL2S(f,x_,a_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8 const a__=(a_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8 const a=a__; \
- vec8_set(f(vec8_elt0(x),a), \
- f(vec8_elt1(x),a), \
- f(vec8_elt2(x),a), \
- f(vec8_elt3(x),a)); \
- })
-#define K8REPL2(f,x_,y_) \
- ({ \
- CCTK_REAL8_VEC const x__=(x_); \
- CCTK_REAL8_VEC const y__=(y_); \
- CCTK_REAL8_VEC const x=x__; \
- CCTK_REAL8_VEC const y=y__; \
- vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
- f(vec8_elt1(x),vec8_elt1(y)), \
- f(vec8_elt2(x),vec8_elt2(y)), \
- f(vec8_elt3(x),vec8_elt3(y))); \
- })
-
-#define k8acos(x) K8REPL(acos,x)
-#define k8acosh(x) K8REPL(acosh,x)
-#define k8asin(x) K8REPL(asin,x)
-#define k8asinh(x) K8REPL(asinh,x)
-#define k8atan(x) K8REPL(atan,x)
-#define k8atan2(x,y) K8REPL2(atan2,x,y)
-#define k8atanh(x) K8REPL(atanh,x)
-#define k8cos(x) K8REPL(cos,x)
-#define k8cosh(x) K8REPL(cosh,x)
-#define k8exp(x) K8REPL(exp,x)
-#define k8log(x) K8REPL(log,x)
-#define k8pow(x,a) K8REPL2S(pow,x,a)
-#define k8sin(x) K8REPL(sin,x)
-#define k8sinh(x) K8REPL(sinh,x)
-#define k8tan(x) K8REPL(tan,x)
-#define k8tanh(x) K8REPL(tanh,x)
-
-static const k8const_t k8lfalse_ =
- {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }};
-static const k8const_t k8ltrue_ =
- {{ K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, K8_NOTZERO, }};
-#define k8lfalse (k8lfalse_.vf)
-#define k8ltrue (k8ltrue_.vf)
-#define k8lnot(x) (_mm256_xor_pd(k8ltrue,x))
-#define k8land(x,y) (_mm256_and_pd(x,y))
-#define k8lor(x,y) (_mm256_or_pd(x,y))
-#define k8lxor(x,y) (_mm256_xor_pd(x,y))
-#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x))
-
-#define k8cmpeq(x,y) (_mm256_cmp_pd(x,y,_CMP_EQ_OQ))
-#define k8cmpne(x,y) (_mm256_cmp_pd(x,y,_CMP_NEQ_OQ))
-#define k8cmpgt(x,y) (_mm256_cmp_pd(x,y,_CMP_GT_OQ))
-#define k8cmpge(x,y) (_mm256_cmp_pd(x,y,_CMP_GE_OQ))
-#define k8cmplt(x,y) (_mm256_cmp_pd(x,y,_CMP_LT_OQ))
-#define k8cmple(x,y) (_mm256_cmp_pd(x,y,_CMP_LE_OQ))
+#define K8REPL(f,x) \
+ vec8_set(f(vec8_elt0(x)), \
+ f(vec8_elt1(x)), \
+ f(vec8_elt2(x)), \
+ f(vec8_elt3(x)));
+#define K8REPL2S(f,x,a) \
+ vec8_set(f(vec8_elt0(x),a), \
+ f(vec8_elt1(x),a), \
+ f(vec8_elt2(x),a), \
+ f(vec8_elt3(x),a));
+#define K8REPL2(f,x,y) \
+ vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \
+ f(vec8_elt1(x),vec8_elt1(y)), \
+ f(vec8_elt2(x),vec8_elt2(y)), \
+ f(vec8_elt3(x),vec8_elt3(y)));
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8acosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8asinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atan2(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return K8REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8atanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cos(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8cosh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8exp(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8log(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8pow(CCTK_REAL8_VEC const x, CCTK_REAL8 const a)
+{
+ return K8REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sin(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sinh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tan(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8tanh(CCTK_REAL8_VEC const x)
+{
+ return K8REPL(tanh,x);
+}
+
+
+
+#define k8lfalse (vec8_set1i( 0))
+#define k8ltrue (vec8_set1i(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lnot(CCTK_BOOLEAN8_VEC const x)
+{
+ return _mm256_xor_pd(k8ltrue, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8land(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm256_and_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm256_or_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8lxor(CCTK_BOOLEAN8_VEC const x, CCTK_BOOLEAN8_VEC const y)
+{
+ return _mm256_xor_pd(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8ifthen(CCTK_BOOLEAN8_VEC const x,
+ CCTK_REAL8_VEC const y,
+ CCTK_REAL8_VEC const z)
+{
+ return _mm256_blendv_pd(z, y, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpeq(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_EQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpne(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_NEQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpgt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_GT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmpge(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_GE_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmplt(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_LT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN8_VEC k8cmple(CCTK_REAL8_VEC const x, CCTK_REAL8_VEC const y)
+{
+ return _mm256_cmp_pd(x, y, _CMP_LE_OQ);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL8_VEC k8sgn(CCTK_REAL8_VEC const x)
+{
+ CCTK_BOOLEAN8_VEC const iszero = k8cmpeq(x, vec8_set1(0.0));
+ CCTK_REAL8_VEC const sign = _mm256_and_pd(k8sign, x);
+ CCTK_REAL8_VEC const signedone = _mm256_or_pd(sign, vec8_set1(1.0));
+ return k8ifthen(iszero, vec8_set1(0.0), signedone);
+}