diff options
Diffstat (limited to 'src/vectors-8-AVX.h')
-rw-r--r-- | src/vectors-8-AVX.h | 62 |
1 files changed, 56 insertions, 6 deletions
diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index 144d3b5..274b376 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -113,8 +113,53 @@ union k8const_t { # define vec8_store_nta(p,x) (_mm256_stream_pd(&(p),x)) #endif +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin_,imax_) \ + bool v8stp_all; \ + __m256i v8stp_mask; \ + ({ \ + ptrdiff_t const imin1=(imin_); \ + ptrdiff_t const imin=imin1; \ + ptrdiff_t const imax1=(imax_); \ + ptrdiff_t const imax=imax1; \ + \ + v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE<imax; \ + \ + if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ + /* \ + __m256i const v8stp_mask = \ + _mm256_andnot_pd(_mm256_add_epi64(_mm256_set1_epi64x(i-imin), \ + vec_index), \ + _mm256_add_epi64(_mm256_set1_epi64x(i-imax), \ + vec_index)); \ + */ \ + __m128i const termlo0 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(0,1)); \ + __m128i const termup0 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(0,1)); \ + __m128i const term0 = _mm_andnot_si128(termlo0, termup0); \ + __m128i const termlo1 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imin), _mm_set_epi64x(2,3)); \ + __m128i const termup1 = \ + _mm_add_epi64(_mm_set1_epi64x(i-imax), _mm_set_epi64x(2,3)); \ + __m128i const term1 = _mm_andnot_si128(termlo1, termup1); \ + v8stp_mask = \ + _mm256_insertf128_si256(_mm256_castsi128_si256(term0), term1, 1); \ + } \ + }) + +#define vec8_store_nta_partial(p,x) \ + ({ \ + if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ + vec8_store_nta(p,x); \ + } else { \ + _mm256_maskstore_pd(&p,v8stp_mask,x); \ + } \ + }) + // Store a lower or higher partial vector (aligned and non-temporal); // the non-temporal hint is probably ignored +// Masks indicating which vector element should be stored: static const k8const_t k8store_lo_union[5] = { {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }}, @@ -131,7 +176,7 @@ static const k8const_t k8store_hi_union[5] = {{ K8_ZERO, K8_IMIN, K8_IMIN, K8_IMIN, }}, {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }}, }; -#if defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4 +#if !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<=4 // gcc 4.4 uses a wrong prototype for _mm256_maskstore_pd # define vec8_store_nta_partial_lo(p,x,n) \ (_mm256_maskstore_pd(&(p),_mm256_castsi256_pd(k8store_lo_union[n].vi),x)) @@ -147,10 +192,11 @@ static const k8const_t k8store_hi_union[5] = (_mm256_maskstore_pd(&(p),k8store_lo_union[n].vi,x)) # define vec8_store_nta_partial_hi(p,x,n) \ (_mm256_maskstore_pd(&(p),k8store_hi_union[n].vi,x)) -# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \ - (_mm256_maskstore_pd \ - (&(p), \ - k8store_lo_union[nlo].vi & k8store_hi_union[nhi].vi, \ +# define vec8_store_nta_partial_mid(p,x,nlo,nhi) \ + (_mm256_maskstore_pd \ + (&(p), \ + _mm256_castpd_si256(_mm256_and_pd(k8store_lo_union[nlo].vd, \ + k8store_hi_union[nhi].vd)), \ x)) #endif @@ -209,8 +255,12 @@ static const k8const_t k8abs_mask_union = f(vec8_elt2(xfunc),afunc), \ f(vec8_elt3(xfunc),afunc)); \ }) +#define k8cos(x) K8REPL(cos,x) #define k8exp(x) K8REPL(exp,x) #define k8log(x) K8REPL(log,x) #define k8pow(x,a) K8REPL2(pow,x,a) +#define k8sin(x) K8REPL(sin,x) +#define k8tan(x) K8REPL(tan,x) -#define k8ifpos(x,y,z) (_mm256_blendv_pd(y,z,x)) +// Choice [sign(x)>0 ? y : z] +#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x)) |