about summary refs log tree commit diff
path: root/src/vectors-8-AVX.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/vectors-8-AVX.h')
-rw-r--r--  src/vectors-8-AVX.h  62
1 file changed, 56 insertions(+), 6 deletions(-)
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h
index 144d3b5..274b376 100644
--- a/src/vectors-8-AVX.h
+++ b/src/vectors-8-AVX.h
@@ -113,8 +113,53 @@ union k8const_t {
# define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x))
#endif
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin_,imax_) \
+ bool v8stp_all; \
+ __m256i v8stp_mask; \
+ ({ \
+ ptrdiff_t const imin1=(imin_); \
+ ptrdiff_t const imin=imin1; \
+ ptrdiff_t const imax1=(imax_); \
+ ptrdiff_t const imax=imax1; \
+ \
+ v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE<imax; \
+ \
+ if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ /* \
+ __m256i const v8stp_mask = \
+ _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \
+ vec_index), \
+ _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \
+ vec_index)); \
+ */ \
+ __m128i const termlo0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(0,1)); \
+ __m128i const termup0 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(0,1)); \
+ __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \
+ __m128i const termlo1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(2,3)); \
+ __m128i const termup1 = \
+ _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(2,3)); \
+ __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \
+ v8stp_mask = \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \
+ } \
+ })
+
+#define vec8_store_nta_partial(p,x) \
+ ({ \
+ if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \
+ vec8_store_nta(p,x); \
+ } else { \
+ _mm256_maskstore_pd(&p,v8stp_mask,x); \
+ } \
+ })
+
// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
+// Masks indicating which vector element should be stored:
static const k8const_t k8store_lo_union[5] =
{
{{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }},
@@ -131,7 +176,7 @@ static const k8const_t k8store_hi_union[5] =
{{ K8_ZERO, K8_IMIN, K8_IMIN, K8_IMIN, }},
{{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }},
};
-#if defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
+#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4
// gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd
# define vec8_store_nta_partial_lo(p,x,n) \
(_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x))
@@ -147,10 +192,11 @@ static const k8const_t k8store_hi_union[5] =
(_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x))
# define vec8_store_nta_partial_hi(p,x,n) \
(_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x))
-# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
- (_mm256_maskstore_pd \
- (&(p), \
- k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi, \
+# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \
+ (_mm256_maskstore_pd \
+ (&(p), \
+ _mm256_castpd_si256(_mm256_and_pd(k8store_lo_union[nlo].vd, \
+ k8store_hi_union[nhi].vd)), \
x))
#endif
@@ -209,8 +255,12 @@ static const k8const_t k8abs_mask_union =
f(vec8_elt2(xfunc),afunc), \
f(vec8_elt3(xfunc),afunc)); \
})
+#define k8cos(x) K8REPL(cos,x)
#define k8exp(x) K8REPL(exp,x)
#define k8log(x) K8REPL(log,x)
#define k8pow(x,a) K8REPL2(pow,x,a)
+#define k8sin(x) K8REPL(sin,x)
+#define k8tan(x) K8REPL(tan,x)
-#define k8ifpos(x,y,z) (_mm256_blendv_pd(y,z,x))
+// Choice [sign(x)>0 ? y : z]
+#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x))