diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-09-26 01:47:26 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-09-26 01:47:26 +0000 |
commit | 3c64331b60ffee738e803df02eec5700fea4686a (patch) | |
tree | 804ca8f3cc51d4d1598a6e9946c203b08c6b5372 | |
parent | 802b82837b5b37b7c76ee807939bbffe76f17fdd (diff) |
Use the "andnot" instruction when vectorising
Use the "andnot" instruction to reduce the number of different bit
masks that are required. Using fewer different bit masks may require
fewer registers to hold them, or fewer load instructions to access
them, thus potentially improving performance.
Do not scalarize ifpos when SSE 4.1 is not available; instead, use
logical operations to create a bit mask.
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@31 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r-- | src/vectors-4-SSE.h | 26 | ||||
-rw-r--r-- | src/vectors-8-SSE2.h | 38 |
2 files changed, 44 insertions, 20 deletions
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h index dbf0cce..2bde97e 100644 --- a/src/vectors-4-SSE.h +++ b/src/vectors-4-SSE.h @@ -248,15 +248,10 @@ static const union { __m128 v; } k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }}; #define k4sign_mask (k4sign_mask_union.v) -static const union { - unsigned i[4]; - __m128 v; -} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }}; -#define k4abs_mask (k4abs_mask_union.v) // Operators #define k4pos(x) (x) -#define k4neg(x) (_mm_xor_ps(x,k4sign_mask)) +#define k4neg(x) (_mm_xor_ps(k4sign_mask,x)) // #define k4inv(x) // TODO: provide k4inv via rcp and Newton-Raphson // This is described in AMD's publication 47414. @@ -275,10 +270,10 @@ static const union { #define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y))) // Cheap functions -#define k4fabs(x) (_mm_and_ps(x,k4abs_mask)) +#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x)) #define k4fmax(x,y) (_mm_max_ps(x,y)) #define k4fmin(x,y) (_mm_min_ps(x,y)) -#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask)) +#define k4fnabs(x) (_mm_or_ps(k4sign_mask,x)) // TODO: maybe use rsqrt and Newton-Raphson #define k4sqrt(x) (_mm_sqrt_ps(x)) @@ -311,7 +306,7 @@ static const union { // Choice [sign(x)>0 ? y : z] #ifdef __SSE4_1__ # define k4ifpos(x,y,z) (_mm_blendv_ps(y,z,x)) -#else +#elif 0 # ifdef __cplusplus # define k4sgn(x) ({ using namespace std; signbit(x); }) # else @@ -330,4 +325,17 @@ static const union { k4sgn(vec4_elt2(x)) ? vec4_elt2(z) : vec4_elt2(y), \ k4sgn(vec4_elt3(x)) ? 
vec4_elt3(z) : vec4_elt3(y)); \ }) +#else +# define k4ifpos(x_,y_,z_) \ + ({ \ + CCTK_REAL4_VEC const xx=(x_); \ + CCTK_REAL4_VEC const x=xx; \ + CCTK_REAL4_VEC const yy=(y_); \ + CCTK_REAL4_VEC const y=yy; \ + CCTK_REAL4_VEC const zz=(z_); \ + CCTK_REAL4_VEC const z=zz; \ + CCTK_REAL4_VEC const mask = _mm_srai_epi32(x, 31); \ + /* (y & ~mask) | (z & mask) */ \ + _mm_or_ps(_mm_andnot_ps(mask, y), _mm_and_ps(mask, z)); \ + }) #endif diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index 3b11990..2da4b11 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -171,15 +171,10 @@ static const union { __m128d v; } k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }}; #define k8sign_mask (k8sign_mask_union.v) -static const union { - unsigned long long i[2]; - __m128d v; -} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }}; -#define k8abs_mask (k8abs_mask_union.v) // Operators #define k8pos(x) (x) -#define k8neg(x) (_mm_xor_pd(x,k8sign_mask)) +#define k8neg(x) (_mm_xor_pd(k8sign_mask,x)) #define k8add(x,y) (_mm_add_pd(x,y)) #define k8sub(x,y) (_mm_sub_pd(x,y)) @@ -193,10 +188,10 @@ static const union { #define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y))) // Cheap functions -#define k8fabs(x) (_mm_and_pd(x,k8abs_mask)) +#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x)) #define k8fmax(x,y) (_mm_max_pd(x,y)) #define k8fmin(x,y) (_mm_min_pd(x,y)) -#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask)) +#define k8fnabs(x) (_mm_or_pd(k8sign_mask,x)) #define k8sqrt(x) (_mm_sqrt_pd(x)) // Expensive functions @@ -243,7 +238,7 @@ static const union { } \ r; \ }) -#else +#elif 0 # ifdef __cplusplus # define k8sgn(x) ({ using namespace std; signbit(x); }) # else @@ -257,7 +252,28 @@ static const union { CCTK_REAL8_VEC const y=yy; \ CCTK_REAL8_VEC const zz=(z_); \ CCTK_REAL8_VEC const z=zz; \ - vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \ - k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \ + vec8_set(k8sgn(vec8_elt0(x)) ? 
vec8_elt0(z) : vec8_elt0(y), \ + k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \ + }) +#else +static const union { + unsigned long long i; + double d; +} k8one_union = { 0x1ULL }; +# define k8one (k8one_union.d) +# define k8ifpos(x_,y_,z_) \ + ({ \ + CCTK_REAL8_VEC const xx=(x_); \ + CCTK_REAL8_VEC const x=xx; \ + CCTK_REAL8_VEC const yy=(y_); \ + CCTK_REAL8_VEC const y=yy; \ + CCTK_REAL8_VEC const zz=(z_); \ + CCTK_REAL8_VEC const z=zz; \ + /* there is no _mm_srai_epi64(x, 63) */ \ + CCTK_REAL8_VEC const imask = \ + (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \ + (__m128i)_mm_set1_pd(k8one)); \ + /* (y & ~mask) | (z & mask); imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \ }) #endif |