diff options
-rw-r--r-- | src/vectors-4-SSE.h | 26 |
-rw-r--r-- | src/vectors-8-SSE2.h | 38 |
2 files changed, 44 insertions, 20 deletions
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index dbf0cce..2bde97e 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -248,15 +248,10 @@ static const union {
   __m128 v;
 } k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
 #define k4sign_mask (k4sign_mask_union.v)
-static const union {
-  unsigned i[4];
-  __m128 v;
-} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }};
-#define k4abs_mask (k4abs_mask_union.v)
 
 // Operators
 #define k4pos(x) (x)
-#define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+#define k4neg(x) (_mm_xor_ps(k4sign_mask,x))
 // #define k4inv(x)
 // TODO: provide k4inv via rcp and Newton-Raphson
 // This is described in AMD's publication 47414.
@@ -275,10 +270,10 @@ static const union {
 #define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
 
 // Cheap functions
-#define k4fabs(x) (_mm_and_ps(x,k4abs_mask))
+#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x))
 #define k4fmax(x,y) (_mm_max_ps(x,y))
 #define k4fmin(x,y) (_mm_min_ps(x,y))
-#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+#define k4fnabs(x) (_mm_or_ps(k4sign_mask,x))
 // TODO: maybe use rsqrt and Newton-Raphson
 #define k4sqrt(x) (_mm_sqrt_ps(x))
 
@@ -311,7 +306,7 @@ static const union {
 // Choice [sign(x)>0 ? y : z]
 #ifdef __SSE4_1__
 # define k4ifpos(x,y,z) (_mm_blendv_ps(y,z,x))
-#else
+#elif 0
 # ifdef __cplusplus
 # define k4sgn(x) ({ using namespace std; signbit(x); })
 # else
@@ -330,4 +325,17 @@ static const union {
              k4sgn(vec4_elt2(x)) ? vec4_elt2(z) : vec4_elt2(y), \
              k4sgn(vec4_elt3(x)) ? vec4_elt3(z) : vec4_elt3(y)); \
   })
+#else
+# define k4ifpos(x_,y_,z_) \
+  ({ \
+    CCTK_REAL4_VEC const xx=(x_); \
+    CCTK_REAL4_VEC const x=xx; \
+    CCTK_REAL4_VEC const yy=(y_); \
+    CCTK_REAL4_VEC const y=yy; \
+    CCTK_REAL4_VEC const zz=(z_); \
+    CCTK_REAL4_VEC const z=zz; \
+    CCTK_REAL4_VEC const mask = (__m128)_mm_srai_epi32((__m128i)x, 31); \
+    /* mask is all ones where the sign bit of x is set: (y & ~mask) | (z & mask) */ \
+    _mm_or_ps(_mm_andnot_ps(mask, y), _mm_and_ps(mask, z)); \
+  })
 #endif
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 3b11990..2da4b11 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -171,15 +171,10 @@ static const union {
   __m128d v;
 } k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }};
 #define k8sign_mask (k8sign_mask_union.v)
-static const union {
-  unsigned long long i[2];
-  __m128d v;
-} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};
-#define k8abs_mask (k8abs_mask_union.v)
 
 // Operators
 #define k8pos(x) (x)
-#define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
+#define k8neg(x) (_mm_xor_pd(k8sign_mask,x))
 
 #define k8add(x,y) (_mm_add_pd(x,y))
 #define k8sub(x,y) (_mm_sub_pd(x,y))
@@ -193,10 +188,10 @@ static const union {
 #define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
 
 // Cheap functions
-#define k8fabs(x) (_mm_and_pd(x,k8abs_mask))
+#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x))
 #define k8fmax(x,y) (_mm_max_pd(x,y))
 #define k8fmin(x,y) (_mm_min_pd(x,y))
-#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask))
+#define k8fnabs(x) (_mm_or_pd(k8sign_mask,x))
 #define k8sqrt(x) (_mm_sqrt_pd(x))
 
 // Expensive functions
@@ -243,7 +238,7 @@ static const union {
     } \
     r; \
   })
-#else
+#elif 0
 # ifdef __cplusplus
 # define k8sgn(x) ({ using namespace std; signbit(x); })
 # else
@@ -257,7 +252,28 @@ static const union {
     CCTK_REAL8_VEC const y=yy; \
     CCTK_REAL8_VEC const zz=(z_); \
     CCTK_REAL8_VEC const z=zz; \
-    vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \
-             k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \
+    vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \
+             k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \
+  })
+#else
+static const union {
+  unsigned long long i;
+  double d;
+} k8one_union = { 0x1ULL };
+# define k8one (k8one_union.d)
+# define k8ifpos(x_,y_,z_) \
+  ({ \
+    CCTK_REAL8_VEC const xx=(x_); \
+    CCTK_REAL8_VEC const x=xx; \
+    CCTK_REAL8_VEC const yy=(y_); \
+    CCTK_REAL8_VEC const y=yy; \
+    CCTK_REAL8_VEC const zz=(z_); \
+    CCTK_REAL8_VEC const z=zz; \
+    /* there is no _mm_srai_epi64(x, 63); compute (x >> 63) - 1 instead, which is ~mask */ \
+    CCTK_REAL8_VEC const imask = \
+      (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \
+                             (__m128i)_mm_set1_pd(k8one)); \
+    /* (y & ~mask) | (z & mask); imask = ~mask */ \
+    _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \
   })
 #endif