diff options
Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r-- | src/vectors-8-SSE2.h | 49 |
1 files changed, 39 insertions, 10 deletions
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 7dc7d31..fe231b7 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -139,6 +139,34 @@
 # define vec8_store_nta(p,x) (_mm_stream_pd(&(p),x))
 #endif
 
+// Store a partial vector (aligned and non-temporal)
+#define vec8_store_partial_prepare(i,imin,imax) \
+  bool const v8stp_lo = (i)>=(imin); \
+  bool const v8stp_hi = (i)+CCTK_REAL_VEC_SIZE<(imax)
+#if VECTORISE_STREAMING_STORES && defined(__SSE4A__)
+# define vec8_store_nta_partial(p,x) \
+  ({ \
+    if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
+      vec8_store_nta(p,x); \
+    } else if (v8stp_lo) { \
+      _mm_stream_sd(&p,x); \
+    } else if (v8stp_hi) { \
+      _mm_stream_sd(&p+1, vec8_swap10(x)); \
+    } \
+  })
+#else
+# define vec8_store_nta_partial(p,x) \
+  ({ \
+    if (CCTK_BUILTIN_EXPECT(v8stp_lo and v8stp_hi, true)) { \
+      vec8_store_nta(p,x); \
+    } else if (v8stp_lo) { \
+      _mm_storel_pd(&p,x); \
+    } else if (v8stp_hi) { \
+      _mm_storeh_pd(&p+1,x); \
+    } \
+  })
+#endif
+
 // Store a lower or higher partial vector (aligned and non-temporal)
 #if ! VECTORISE_STREAMING_STORES
 # define vec8_store_nta_partial_lo(p,x,n) (_mm_storel_pd(&(p),x))
@@ -264,9 +292,10 @@ static const union {
 
 // Choice [sign(x)>0 ? y : z]
 #ifdef __SSE4_1__
-# define k8ifpos(x,y,z) (_mm_blendv_pd(y,z,x))
+# define k8ifmsb(x,y,z) (_mm_blendv_pd(z,y,x))
 #elif 0
-# define k8ifpos(x_,y_,z_) \
+// This is slow
+# define k8ifmsb(x_,y_,z_) \
   ({ \
     CCTK_REAL8_VEC const x__=(x_); \
     CCTK_REAL8_VEC const y__=(y_); \
@@ -278,8 +307,8 @@ static const union {
     CCTK_REAL8_VEC r; \
     switch (m) { \
     case 0: r = y; break; \
-    case 1: r = _mm_move_sd(y,z); break; \
-    case 2: r = _mm_move_sd(z,y); break; \
+    case 1: r = _mm_move_sd(y,z); break; \
+    case 2: r = _mm_move_sd(z,y); break; \
     case 3: r = z; break; \
     } \
     r; \
@@ -290,7 +319,7 @@ static const union {
 # else
 # define k4sgn(x) (signbit(x))
 # endif
-# define k8ifpos(x_,y_,z_) \
+# define k8ifmsb(x_,y_,z_) \
   ({ \
     CCTK_REAL8_VEC const x__=(x_); \
     CCTK_REAL8_VEC const y__=(y_); \
@@ -298,8 +327,8 @@ static const union {
     CCTK_REAL8_VEC const x=x__; \
     CCTK_REAL8_VEC const y=y__; \
     CCTK_REAL8_VEC const z=z__; \
-    vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \
-             k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \
+    vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \
+             k8sgn(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \
   })
 #else
 static const union {
@@ -307,7 +336,7 @@ static const union {
   double d;
 } k8one_union = { 0x1ULL };
 # define k8one (k8one_union.d)
-# define k8ifpos(x_,y_,z_) \
+# define k8ifmsb(x_,y_,z_) \
   ({ \
     CCTK_REAL8_VEC const x__=(x_); \
     CCTK_REAL8_VEC const y__=(y_); \
@@ -319,7 +348,7 @@ static const union {
     CCTK_REAL8_VEC const imask = \
       (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \
                              (__m128i)_mm_set1_pd(k8one)); \
-    /* (y & ~mask) | (z & mask); imask = ~mask */ \
-    _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \
+    /* (z & ~mask) | (y & mask); imask = ~mask */ \
+    _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \
   })
 #endif