Diffstat (limited to 'src/vectors-8-AVX.h')
 src/vectors-8-AVX.h | 80
 1 file changed, 48 insertions(+), 32 deletions(-)
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 9e1d98b..825f2d3 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -35,9 +35,9 @@
union k8const_t {
unsigned long long i[4];
- double d[4];
+ double f[4];
__m256i vi;
- __m256d vd;
+ __m256d vf;
};
#define K8_ZERO 0x0000000000000000ULL
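As background (not part of the patch): k8const_t exists so that bit-level constants can be written as 64-bit integer literals and then reinterpreted as packed doubles (.vf) or as an integer vector (.vi). A minimal stand-alone sketch of the same idiom, using the hypothetical names example_const and example_one (compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

/* Hypothetical miniature of the k8const_t idiom: the same 256 bits are
   written as four 64-bit integer literals and read back as four doubles. */
union example_const {
  unsigned long long i[4];
  double f[4];
  __m256i vi;
  __m256d vf;
};

static const union example_const example_one =
  {{ 0x3ff0000000000000ULL, 0x3ff0000000000000ULL,
     0x3ff0000000000000ULL, 0x3ff0000000000000ULL }};

int main(void) {
  /* 0x3ff0000000000000 is the IEEE-754 bit pattern of the double 1.0 */
  printf("%g %g %g %g\n",
         example_one.f[0], example_one.f[1],
         example_one.f[2], example_one.f[3]);   /* prints: 1 1 1 1 */
  return 0;
}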
@@ -118,12 +118,12 @@ union k8const_t {
bool v8stp_all; \
__m256i v8stp_mask; \
({ \
- ptrdiff_t const imin1=(imin_); \
- ptrdiff_t const imin=imin1; \
- ptrdiff_t const imax1=(imax_); \
- ptrdiff_t const imax=imax1; \
+ ptrdiff_t const imin__=(imin_); \
+ ptrdiff_t const imin=imin__; \
+ ptrdiff_t const imax__=(imax_); \
+ ptrdiff_t const imax=imax__; \
\
- v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE<imax; \
+ v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE-1<imax; \
\
if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
/* \
@@ -134,14 +134,14 @@ union k8const_t {
vec_index)); \
*/ \
__m128i const termlo0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(0,1)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(1, 0)); \
__m128i const termup0 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(0,1)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(1, 0)); \
__m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
__m128i const termlo1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(2,3)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(3, 2)); \
__m128i const termup1 = \
- _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(2,3)); \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(3, 2)); \
__m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
v8stp_mask = \
_mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
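Two separate fixes are visible in the hunks above. First, the all-inside test: a vector store at index i writes lanes i .. i+CCTK_REAL_VEC_SIZE-1, so it is the last written index, i+CCTK_REAL_VEC_SIZE-1, that must stay below imax; the old test i+CCTK_REAL_VEC_SIZE<imax wrongly rejected stores that in fact fit. Second, _mm_set_epi64x takes its arguments high element first, so producing lanes {0, 1} in memory order requires _mm_set_epi64x(1, 0). A small stand-alone check of that lane order (not from the patch):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  /* _mm_set_epi64x(e1, e0): the first argument is the HIGH lane, so to get
     lanes {0, 1} in memory order the call must be _mm_set_epi64x(1, 0). */
  __m128i v = _mm_set_epi64x(1, 0);
  long long lanes[2];
  _mm_storeu_si128((__m128i *)lanes, v);
  printf("lane0=%lld lane1=%lld\n", lanes[0], lanes[1]);  /* lane0=0 lane1=1 */
  return 0;
}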
@@ -160,7 +160,7 @@ union k8const_t {
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
// Masks indicating which vector element should be stored:
-static const k8const_t k8store_lo_union[5] =
+static const k8const_t k8store_lo[5] =
{
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
{{ K8_IMIN, K8_ZERO, K8_ZERO, K8_ZERO, }},
@@ -168,7 +168,7 @@ static const k8const_t k8store_lo_union[5] =
{{ K8_IMIN, K8_IMIN, K8_IMIN, K8_ZERO, }},
{{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }},
};
-static const k8const_t k8store_hi_union[5] =
+static const k8const_t k8store_hi[5] =
{
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_IMIN, }},
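For context: each table entry is a per-lane mask with K8_IMIN (only the sign bit set) in the lanes that a partial store should write, e.g. k8store_lo[2] enables the first two lanes and k8store_hi[2] the last two. A hedged, stand-alone sketch of what such an entry does when handed to a masked store (hypothetical values, compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Sketch of what an entry like k8store_lo[2] encodes: a mask whose first
     two lanes have the sign bit set, so the masked store touches only
     buf[0] and buf[1] and leaves the remaining elements untouched. */
  double buf[4] = { -1.0, -1.0, -1.0, -1.0 };
  __m256i lo2 = _mm256_set_epi64x(0, 0,
                                  (long long)0x8000000000000000ULL,
                                  (long long)0x8000000000000000ULL);
  _mm256_maskstore_pd(buf, lo2, _mm256_set_pd(4.0, 3.0, 2.0, 1.0));
  printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);  /* 1 2 -1 -1 */
  return 0;
}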
@@ -179,24 +179,24 @@ static const k8const_t k8store_hi_union[5] =
#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x))
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo[n].vi),x))
# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi_union[n].vi),x))
+ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_hi[n].vi),x))
# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
(_mm256_maskstore_pd \
(&(p), \
- _mm256_castsi256_pd(k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi), \
+ _mm256_castsi256_pd(k8store_lo[nlo].vi & k8store_hi[nhi].vi), \
x))
#else
# define vec8_store_nta_partial_lo(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x))
+ (_mm256_maskstore_pd(&(p),k8store_lo[n].vi,x))
# define vec8_store_nta_partial_hi(p,x,n) \
- (_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- _mm256_castpd_si256(_mm256_and_pd(k8store_lo_union[nlo].vd, \
- k8store_hi_union[nhi].vd)), \
+ (_mm256_maskstore_pd(&(p),k8store_hi[n].vi,x))
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo[nlo].vf, \
+ k8store_hi[nhi].vf)), \
x))
#endif
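The non-gcc-4.4 branch builds the mask for the "mid" case by AND-ing a lo mask with a hi mask through the .vf view and casting back to __m256i; only the sign bit of each lane matters to _mm256_maskstore_pd, so the floating-point AND is harmless. A stand-alone sketch of that combination (hypothetical values, not from the patch):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* AND a lo-mask (first 3 lanes enabled) with a hi-mask (last 3 lanes
     enabled) to obtain a mid-mask that stores only lanes 1 and 2. */
  __m256d lo = _mm256_castsi256_pd(_mm256_set_epi64x(0, -1, -1, -1));
  __m256d hi = _mm256_castsi256_pd(_mm256_set_epi64x(-1, -1, -1, 0));
  __m256i mid = _mm256_castpd_si256(_mm256_and_pd(lo, hi));
  double buf[4] = { 0.0, 0.0, 0.0, 0.0 };
  _mm256_maskstore_pd(buf, mid, _mm256_set1_pd(7.0));
  printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);  /* 0 7 7 0 */
  return 0;
}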
@@ -204,13 +204,10 @@ static const k8const_t k8store_hi_union[5] =
// Functions and operators
-static const k8const_t k8sign_mask_union =
- {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
-static const k8const_t k8abs_mask_union =
- {{ K8_IMAX, K8_IMAX, K8_IMAX, K8_IMAX, }};
+static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
// Operators
-#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask_union.vd))
+#define k8neg(x) (_mm256_xor_pd(x,k8sign_mask.vf))
#define k8add(x,y) (_mm256_add_pd(x,y))
#define k8sub(x,y) (_mm256_sub_pd(x,y))
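k8neg works by xor-ing with k8sign_mask, whose lanes contain only the IEEE-754 sign bit, so each lane's sign flips while exponent and mantissa stay untouched (likewise k8fabs clears the bit with andnot and k8fnabs sets it with or). A hedged stand-alone illustration (not from the patch, compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* xor with a mask holding only the sign bit negates every lane */
  const __m256d sign_mask =
    _mm256_castsi256_pd(_mm256_set1_epi64x((long long)0x8000000000000000ULL));
  __m256d x = _mm256_set_pd(4.0, -3.0, 2.0, -1.0);
  __m256d neg = _mm256_xor_pd(x, sign_mask);
  double out[4];
  _mm256_storeu_pd(out, neg);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 -2 3 -4 */
  return 0;
}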
@@ -231,10 +228,24 @@ static const k8const_t k8abs_mask_union =
#endif
// Cheap functions
-#define k8fabs(x) (_mm256_and_pd(x,k8abs_mask_union.vd))
+#define k8copysign(x,y) \
+ (_mm256_or_pd(_mm256_andnot_pd(k8sign_mask.vf,x), \
+ _mm256_and_pd(k8sign_mask.vf,y)))
+#define k8fabs(x) (_mm256_andnot_pd(k8sign_mask.vf,x))
#define k8fmax(x,y) (_mm256_max_pd(x,y))
#define k8fmin(x,y) (_mm256_min_pd(x,y))
-#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask_union.vd))
+#define k8fnabs(x) (_mm256_or_pd(x,k8sign_mask.vf))
+static const k8const_t k8zero = { f: { 0.0, 0.0, 0.0, 0.0, }};
+static const k8const_t k8one = { f: { 1.0, 1.0, 1.0, 1.0, }};
+#define k8sgn(x_) \
+ ({ \
+ CCTK_REAL_VEC x__=(x_); \
+ CCTK_REAL_VEC x=x__; \
+ CCTK_REAL_VEC iszero = _mm256_cmp_pd(x, k8zero.vf, _CMP_EQ_OQ); \
+ CCTK_REAL_VEC sign = _mm256_and_pd(k8sign_mask.vf, x); \
+ CCTK_REAL_VEC signedone = _mm256_or_pd(sign, k8one.vf); \
+ k8ifthen(iszero, k8zero.vf, signedone); \
+ })
#define k8sqrt(x) (_mm256_sqrt_pd(x))
// Expensive functions
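The new k8sgn yields 0.0 for lanes that compare equal to zero (including -0.0) and otherwise a 1.0 carrying the lane's sign bit. A scalar reference for that intended behaviour, using the hypothetical helper sgn_scalar (not part of the patch):

#include <math.h>
#include <stdio.h>

/* Scalar reference for the vector k8sgn above: 0 for (+/-)0, otherwise
   copysign(1.0, x).  The vector code builds the same result with
   cmp/and/or followed by k8ifthen. */
static double sgn_scalar(double x) {
  if (x == 0.0) return 0.0;
  return copysign(1.0, x);
}

int main(void) {
  printf("%g %g %g %g\n",
         sgn_scalar(-3.5), sgn_scalar(0.0), sgn_scalar(-0.0), sgn_scalar(7.0));
  /* prints: -1 0 0 1 */
  return 0;
}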
@@ -287,5 +298,10 @@ static const k8const_t k8abs_mask_union =
#define k8tan(x) K8REPL(tan,x)
#define k8tanh(x) K8REPL(tanh,x)
-// Choice [sign(x)>0 ? y : z]
-#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x))
+static const k8const_t k8lfalse = {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }};
+static const k8const_t k8ltrue = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }};
+#define k8lnot(x) (_mm256_xor_pd(k8sign_mask.vf,x))
+#define k8land(x,y) (_mm256_and_pd(x,y))
+#define k8lor(x,y) (_mm256_or_pd(x,y))
+#define k8lxor(x,y) (_mm256_xor_pd(x,y))
+#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x))
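k8ifthen maps to _mm256_blendv_pd, which selects per lane from its second value operand wherever the mask lane's sign bit is set; hence the (z, y, x) argument order, so that a "true" mask lane (sign bit set, as in k8ltrue) picks y. A small stand-alone check (not from the patch, compile with -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* blendv picks from the SECOND value operand where the mask lane's sign
     bit is set, and from the first otherwise. */
  __m256d y = _mm256_set1_pd(1.0);
  __m256d z = _mm256_set1_pd(2.0);
  __m256d mask = _mm256_castsi256_pd(_mm256_set_epi64x(0, -1, 0, -1));
  __m256d r = _mm256_blendv_pd(z, y, mask);
  double out[4];
  _mm256_storeu_pd(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 2 1 2 */
  return 0;
}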