diff options
author | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-09-26 01:47:26 +0000 |
---|---|---|
committer | eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> | 2011-09-26 01:47:26 +0000 |
commit | 3c64331b60ffee738e803df02eec5700fea4686a (patch) | |
tree | 804ca8f3cc51d4d1598a6e9946c203b08c6b5372 /src/vectors-4-SSE.h | |
parent | 802b82837b5b37b7c76ee807939bbffe76f17fdd (diff) |
Use "andnot" instruction when vectorising
Use the "andnot" instruction to reduce the number of different bit
masks that are required. Using fewer different bit masks may require
fewer registers to hold them, or fewer load instructions to access
them, thus potentially improving performance.
Do not scalarize ifpos when SSE 4.1 is not available; instead, use
logical operations to create a bit mask.
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@31 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-4-SSE.h')
-rw-r--r-- | src/vectors-4-SSE.h | 26 |
1 files changed, 17 insertions, 9 deletions
```diff
diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h
index dbf0cce..2bde97e 100644
--- a/src/vectors-4-SSE.h
+++ b/src/vectors-4-SSE.h
@@ -248,15 +248,10 @@ static const union {
  __m128 v;
 } k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
 #define k4sign_mask (k4sign_mask_union.v)
-static const union {
- unsigned i[4];
- __m128 v;
-} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }};
-#define k4abs_mask (k4abs_mask_union.v)
 
 // Operators
 #define k4pos(x) (x)
-#define k4neg(x) (_mm_xor_ps(x,k4sign_mask))
+#define k4neg(x) (_mm_xor_ps(k4sign_mask,x))
 // #define k4inv(x)
 // TODO: provide k4inv via rcp and Newton-Raphson
 // This is described in AMD's publication 47414.
@@ -275,10 +270,10 @@ static const union {
 #define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
 
 // Cheap functions
-#define k4fabs(x) (_mm_and_ps(x,k4abs_mask))
+#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x))
 #define k4fmax(x,y) (_mm_max_ps(x,y))
 #define k4fmin(x,y) (_mm_min_ps(x,y))
-#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
+#define k4fnabs(x) (_mm_or_ps(k4sign_mask,x))
 // TODO: maybe use rsqrt and Newton-Raphson
 #define k4sqrt(x) (_mm_sqrt_ps(x))
 
@@ -311,7 +306,7 @@ static const union {
 // Choice [sign(x)>0 ? y : z]
 #ifdef __SSE4_1__
 # define k4ifpos(x,y,z) (_mm_blendv_ps(y,z,x))
-#else
+#elif 0
 # ifdef __cplusplus
 # define k4sgn(x) ({ using namespace std; signbit(x); })
 # else
@@ -330,4 +325,17 @@ static const union {
  k4sgn(vec4_elt2(x)) ? vec4_elt2(z) : vec4_elt2(y), \
  k4sgn(vec4_elt3(x)) ? vec4_elt3(z) : vec4_elt3(y)); \
  })
+#else
+# define k4ifpos(x_,y_,z_) \
+ ({ \
+ CCTK_REAL4_VEC const xx=(x_); \
+ CCTK_REAL4_VEC const x=xx; \
+ CCTK_REAL4_VEC const yy=(y_); \
+ CCTK_REAL4_VEC const y=yy; \
+ CCTK_REAL4_VEC const zz=(z_); \
+ CCTK_REAL4_VEC const z=zz; \
+ CCTK_REAL4_VEC const mask = _mm_srai_epi32(x, 31); \
+ /* (y & ~mask) | (z & mask) */ \
+ _mm_or_ps(_mm_andnot_ps(mask, y), _mm_and_ps(mask, z)); \
+ })
 #endif
```