aboutsummaryrefslogtreecommitdiff
path: root/src/vectors-4-AVX.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/vectors-4-AVX.h')
-rw-r--r--src/vectors-4-AVX.h659
1 files changed, 659 insertions, 0 deletions
diff --git a/src/vectors-4-AVX.h b/src/vectors-4-AVX.h
new file mode 100644
index 0000000..641a74b
--- /dev/null
+++ b/src/vectors-4-AVX.h
@@ -0,0 +1,659 @@
+// Vectorise using Intel's or AMD's AVX
+
+// Use the type __m256 directly, without introducing a wrapper class
+
+
+
+#include <cstdlib>
+
+
+
+#include <immintrin.h>
+#ifdef __FMA4__
+# include <x86intrin.h>
+#endif
+
+
+
+#ifdef __FMA4__
+# define vec4_architecture_FMA4 "+FMA4"
+#else
+# define vec4_architecture_FMA4 ""
+#endif
+#define vec4_architecture "AVX" vec4_architecture_FMA4 " (32-bit precision)"
+
+
+
+// Vector type corresponding to CCTK_REAL
+typedef __m256 CCTK_REAL4_VEC;
+typedef __m256i CCTK_INTEGER4_VEC;
+typedef __m256 CCTK_BOOLEAN4_VEC;
+
+// Number of vector elements in a CCTK_REAL_VEC
+#define CCTK_REAL4_VEC_SIZE 8
+
+vec_static_assert(sizeof(CCTK_REAL4_VEC) ==
+ sizeof(CCTK_REAL4) * CCTK_REAL4_VEC_SIZE);
+
+// Integer and boolean types corresponding to this real type
+typedef CCTK_INT4 CCTK_INTEGER4;
+typedef CCTK_REAL4 CCTK_BOOLEAN4;
+
+
+
+union k4const_t {
+ CCTK_INTEGER4 i[CCTK_REAL4_VEC_SIZE];
+ CCTK_REAL4 f[CCTK_REAL4_VEC_SIZE];
+ CCTK_INTEGER4_VEC vi;
+ CCTK_REAL4_VEC vf;
+};
+
+#define k4sign (vec4_set1i( (CCTK_INTEGER4)(1UL << 31UL)))
+#define k4notsign (vec4_set1i(~ (CCTK_INTEGER4)(1UL << 31UL)))
+
+
+
+// Create vectors, extract vector elements
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set1(CCTK_REAL4 const a)
+{
+ return _mm256_set1_ps(a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set1i(CCTK_INT4 const a)
+{
+ return _mm256_castsi256_ps(_mm256_set1_epi32(a));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_set(CCTK_REAL4 const a,
+ CCTK_REAL4 const b,
+ CCTK_REAL4 const c,
+ CCTK_REAL4 const d,
+ CCTK_REAL4 const e,
+ CCTK_REAL4 const f,
+ CCTK_REAL4 const g,
+ CCTK_REAL4 const h)
+{
+ return _mm256_set_ps(h,g,f,e,d,c,b,a); // note reversed arguments
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt0(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[0];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt1(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[1];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt2(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[2];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt3(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[3];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt4(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[4];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt5(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[5];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt6(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[6];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt7(CCTK_REAL4_VEC const x)
+{
+ return ((CCTK_REAL4 const*)&x)[7];
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4 vec4_elt(CCTK_REAL4_VEC const x, std::ptrdiff_t const d)
+{
+ return ((CCTK_REAL4 const*)&x)[d];
+}
+
+
+
+// Load and store vectors
+
+// Load a vector from memory (aligned and unaligned); this loads from
+// a reference to a scalar
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_load(CCTK_REAL4 const& p)
+{
+ return _mm256_load_ps(&p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu(CCTK_REAL4 const& p)
+{
+ return _mm256_loadu_ps(&p);
+}
+#if VECTORISE_ALWAYS_USE_ALIGNED_LOADS
+# error "VECTORISE_ALWAYS_USE_ALIGNED_LOADS not yet supported"
+#endif
+
+// Load a vector from memory that may or may not be aligned, as
+// decided by the offset off and the vector size
+#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
+// Implementation: Always use unaligned load
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return vec4_loadu(p);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe(std::ptrdiff_t const off, CCTK_REAL4 const& p)
+{
+ return off % CCTK_REAL4_VEC_SIZE == 0 ? vec4_load(p) : vec4_loadu(p);
+}
+# if VECTORISE_ALIGNED_ARRAYS
+// Assume all array x sizes are multiples of the vector size
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return vec4_loadu_maybe(off1, p);
+}
+# else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC vec4_loadu_maybe3(std::ptrdiff_t const off1,
+ std::ptrdiff_t const off2,
+ std::ptrdiff_t const off3,
+ CCTK_REAL4 const& p)
+{
+ return
+ off2 % CCTK_REAL4_VEC_SIZE != 0 or
+ off3 % CCTK_REAL4_VEC_SIZE != 0 ?
+ vec4_loadu(p) :
+ vec4_loadu_maybe(off1, p);
+}
+# endif
+#endif
+
+// Store a vector to memory (aligned and non-temporal); this stores to
+// a reference to a scalar
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return _mm256_store_ps(&p, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_storeu(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return _mm256_storeu_ps(&p, x);
+}
+#if ! VECTORISE_STREAMING_STORES
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return vec4_store(p, x);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta(CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ return _mm256_stream_ps(&p, x);
+}
+#endif
+
+// Store a partial vector (aligned and non-temporal)
+#define vec4_store_partial_prepare(i,imin,imax) \
+ bool v4stp_all; \
+ __m256i v4stp_mask; \
+ vec4_store_partial_prepare_(v4stp_all, v4stp_mask, i, imin, imax);
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_partial_prepare_(bool& all, __m256i& mask,
+ std::ptrdiff_t const i,
+ std::ptrdiff_t const imin,
+ std::ptrdiff_t const imax)
+{
+ all = i>=imin and i+CCTK_REAL4_VEC_SIZE-1<imax;
+
+ if (not CCTK_BUILTIN_EXPECT(all, true)) {
+ /* __m256i const v4stp_mask = */
+ /* _mm256_andnot_ps(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), */
+ /* vec_index), */
+ /* _mm256_add_epi64(_mm256_set1_epi64x(i-imax), */
+ /* vec_index)); */
+ __m128i const termlo0123 =
+ _mm_add_epi32(_mm_set1_epi32(i-imin), _mm_set_epi32(3, 2, 1, 0));
+ __m128i const termup0123 =
+ _mm_add_epi32(_mm_set1_epi32(i-imax), _mm_set_epi32(3, 2, 1, 0));
+ __m128i const term0123 = _mm_andnot_si128(termlo0123, termup0123);
+ __m128i const termlo4567 =
+ _mm_add_epi32(_mm_set1_epi32(i-imin), _mm_set_epi32(7, 6, 5, 4));
+ __m128i const termup4567 =
+ _mm_add_epi32(_mm_set1_epi32(i-imax), _mm_set_epi32(7, 6, 5, 4));
+ __m128i const term4567 = _mm_andnot_si128(termlo4567, termup4567);
+ mask =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(term0123), term4567, 1);
+ }
+}
+
+#define vec4_store_nta_partial(p, x) \
+ vec4_store_nta_partial_(v4stp_all, v4stp_mask, p, x)
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_(bool const all, __m256i const mask,
+ CCTK_REAL4& p, CCTK_REAL4_VEC const x)
+{
+ if (CCTK_BUILTIN_EXPECT(all, true)) {
+ vec4_store_nta(p, x);
+ } else {
+ _mm256_maskstore_ps(&p, mask, x);
+ }
+}
+
+// Store a lower or higher partial vector (aligned and non-temporal);
+// the non-temporal hint is probably ignored
+// Masks indicating which vector element should be stored:
+static k4const_t const k4store_lo[9] =
+ {
+ { i: { 0, 0, 0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, 0, 0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, 0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, 0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, 0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, 0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, ~0, 0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ };
+static k4const_t const k4store_hi[9] =
+ {
+ { i: { 0, 0, 0, 0, 0, 0, 0, 0, }},
+ { i: { 0, 0, 0, 0, 0, 0, 0, ~0, }},
+ { i: { 0, 0, 0, 0, 0, 0, ~0, ~0, }},
+ { i: { 0, 0, 0, 0, 0, ~0, ~0, ~0, }},
+ { i: { 0, 0, 0, 0, ~0, ~0, ~0, ~0, }},
+ { i: { 0, 0, 0, ~0, ~0, ~0, ~0, ~0, }},
+ { i: { 0, 0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ { i: { 0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ { i: { ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, }},
+ };
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+// gcc 4.4 uses a wrong prototype for _mm256_maskstore_ps
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_lo(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, _mm256_castsi256_ps(k4store_lo[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_hi(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, _mm256_castsi256_ps(k4store_hi[n].vi), x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_mid(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_ps
+ (&p,
+ _mm256_castsi256_ps(k4store_lo[nlo].vi & k4store_hi[nhi].vi),
+ x);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_lo(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, k4store_lo[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_hi(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const n)
+{
+ _mm256_maskstore_ps(&p, k4store_hi[n].vi, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+void vec4_store_nta_partial_mid(CCTK_REAL4& p,
+ CCTK_REAL4_VEC const x,
+ ptrdiff_t const nlo,
+ ptrdiff_t const nhi)
+{
+ _mm256_maskstore_ps
+ (&p,
+ _mm256_castps_si256(_mm256_and_ps(k4store_lo[nlo].vf, k4store_hi[nhi].vf)),
+ x);
+}
+#endif
+
+
+
+// Functions and operators
+
+// Operators
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4neg(CCTK_REAL4_VEC const x)
+{
+ return _mm256_xor_ps(x, k4sign);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4add(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_add_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sub(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_sub_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4mul(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_mul_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4div(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_div_ps(x, y);
+}
+
+// Fused multiply-add, defined as [+-]x*y[+-]z
+#ifdef __FMA4__
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_macc_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_msub_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_nmsub_ps(x, y, z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_nmacc_ps(x, y, z);
+}
+#else
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4madd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4add(k4mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4msub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(k4mul(x, y), z);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmadd(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(k4neg(z), k4mul(x, y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4nmsub(CCTK_REAL4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return k4sub(z, k4mul(x, y));
+}
+#endif
+
+// Cheap functions
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4copysign(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_or_ps(_mm256_and_ps(k4notsign, x),
+ _mm256_and_ps(k4sign , y));
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fabs(CCTK_REAL4_VEC const x)
+{
+ return _mm256_and_ps(k4notsign, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fmax(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_max_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fmin(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_min_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4fnabs(CCTK_REAL4_VEC const x)
+{
+ return _mm256_or_ps(x, k4sign);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sqrt(CCTK_REAL4_VEC const x)
+{
+ return _mm256_sqrt_ps(x);
+}
+
+// Expensive functions
+#define K4REPL(f,x) \
+ vec4_set(f(vec4_elt0(x)), \
+ f(vec4_elt1(x)), \
+ f(vec4_elt2(x)), \
+ f(vec4_elt3(x)), \
+ f(vec4_elt4(x)), \
+ f(vec4_elt5(x)), \
+ f(vec4_elt6(x)), \
+ f(vec4_elt7(x)));
+#define K4REPL2S(f,x,a) \
+ vec4_set(f(vec4_elt0(x),a), \
+ f(vec4_elt1(x),a), \
+ f(vec4_elt2(x),a), \
+ f(vec4_elt3(x),a), \
+ f(vec4_elt4(x),a), \
+ f(vec4_elt5(x),a), \
+ f(vec4_elt6(x),a), \
+ f(vec4_elt7(x),a));
+#define K4REPL2(f,x,y) \
+ vec4_set(f(vec4_elt0(x),vec4_elt0(y)), \
+ f(vec4_elt1(x),vec4_elt1(y)), \
+ f(vec4_elt2(x),vec4_elt2(y)), \
+ f(vec4_elt3(x),vec4_elt3(y)), \
+ f(vec4_elt4(x),vec4_elt4(y)), \
+ f(vec4_elt5(x),vec4_elt5(y)), \
+ f(vec4_elt6(x),vec4_elt6(y)), \
+ f(vec4_elt7(x),vec4_elt7(y)));
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4acos(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(acos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4acosh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(acosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4asin(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(asin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4asinh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(asinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atan(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(atan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atan2(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return K4REPL2(atan2,x,y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4atanh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(atanh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4cos(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(cos,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4cosh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(cosh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4exp(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(exp,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4log(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(log,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4pow(CCTK_REAL4_VEC const x, CCTK_REAL4 const a)
+{
+ return K4REPL2S(pow,x,a);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sin(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(sin,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sinh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(sinh,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4tan(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(tan,x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4tanh(CCTK_REAL4_VEC const x)
+{
+ return K4REPL(tanh,x);
+}
+
+
+
+#define k4lfalse (vec4_set1i( 0))
+#define k4ltrue (vec4_set1i(~0))
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lnot(CCTK_BOOLEAN4_VEC const x)
+{
+ return _mm256_xor_ps(k4ltrue, x);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4land(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm256_and_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm256_or_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4lxor(CCTK_BOOLEAN4_VEC const x, CCTK_BOOLEAN4_VEC const y)
+{
+ return _mm256_xor_ps(x, y);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4ifthen(CCTK_BOOLEAN4_VEC const x,
+ CCTK_REAL4_VEC const y,
+ CCTK_REAL4_VEC const z)
+{
+ return _mm256_blendv_ps(z, y, x);
+}
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpeq(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_EQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpne(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpgt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_GT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmpge(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_GE_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmplt(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_LT_OQ);
+}
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_BOOLEAN4_VEC k4cmple(CCTK_REAL4_VEC const x, CCTK_REAL4_VEC const y)
+{
+ return _mm256_cmp_ps(x, y, _CMP_LE_OQ);
+}
+
+
+
+static inline CCTK_ATTRIBUTE_ALWAYS_INLINE
+CCTK_REAL4_VEC k4sgn(CCTK_REAL4_VEC const x)
+{
+ CCTK_BOOLEAN4_VEC const iszero = k4cmpeq(x, vec4_set1(0.0));
+ CCTK_REAL4_VEC const sign = _mm256_and_ps(k4sign, x);
+ CCTK_REAL4_VEC const signedone = _mm256_or_ps(sign, vec4_set1(1.0));
+ return k4ifthen(iszero, vec4_set1(0.0), signedone);
+}