aboutsummaryrefslogtreecommitdiff
path: root/src/vectors-8-SSE2.h
diff options
context:
space:
mode:
authoreschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>2011-09-26 01:47:26 +0000
committereschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a>2011-09-26 01:47:26 +0000
commit3c64331b60ffee738e803df02eec5700fea4686a (patch)
tree804ca8f3cc51d4d1598a6e9946c203b08c6b5372 /src/vectors-8-SSE2.h
parent802b82837b5b37b7c76ee807939bbffe76f17fdd (diff)
Use "andnot" instruction when vectorising
Use the "andnot" instruction to reduce the number of different bit masks that are required. Using fewer different bit masks may require fewer registers to hold them, or fewer load instructions to access them, thus potentially improving performance. Do not scalarize ifpos when SSE 4.1 is not available; instead, use logical operations to create a bit mask. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@31 105869f7-3296-0410-a4ea-f4349344b45a
Diffstat (limited to 'src/vectors-8-SSE2.h')
-rw-r--r--src/vectors-8-SSE2.h38
1 files changed, 27 insertions, 11 deletions
diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h
index 3b11990..2da4b11 100644
--- a/src/vectors-8-SSE2.h
+++ b/src/vectors-8-SSE2.h
@@ -171,15 +171,10 @@ static const union {
__m128d v;
} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }};
#define k8sign_mask (k8sign_mask_union.v)
-static const union {
- unsigned long long i[2];
- __m128d v;
-} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }};
-#define k8abs_mask (k8abs_mask_union.v)
// Operators
#define k8pos(x) (x)
-#define k8neg(x) (_mm_xor_pd(x,k8sign_mask))
+#define k8neg(x) (_mm_xor_pd(k8sign_mask,x))
#define k8add(x,y) (_mm_add_pd(x,y))
#define k8sub(x,y) (_mm_sub_pd(x,y))
@@ -193,10 +188,10 @@ static const union {
#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
// Cheap functions
-#define k8fabs(x) (_mm_and_pd(x,k8abs_mask))
+#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x))
#define k8fmax(x,y) (_mm_max_pd(x,y))
#define k8fmin(x,y) (_mm_min_pd(x,y))
-#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask))
+#define k8fnabs(x) (_mm_or_pd(k8sign_mask,x))
#define k8sqrt(x) (_mm_sqrt_pd(x))
// Expensive functions
@@ -243,7 +238,7 @@ static const union {
} \
r; \
})
-#else
+#elif 0
# ifdef __cplusplus
# define k8sgn(x) ({ using namespace std; signbit(x); })
# else
@@ -257,7 +252,28 @@ static const union {
CCTK_REAL8_VEC const y=yy; \
CCTK_REAL8_VEC const zz=(z_); \
CCTK_REAL8_VEC const z=zz; \
- vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \
- k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \
+ vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(z) : vec8_elt0(y), \
+ k8sgn(vec8_elt1(x)) ? vec8_elt1(z) : vec8_elt1(y)); \
+ })
+#else
+static const union {
+ unsigned long long i;
+ double d;
+} k8one_union = { 0x1ULL };
+# define k8one (k8one_union.d)
+# define k8ifpos(x_,y_,z_) \
+ ({ \
+ CCTK_REAL8_VEC const xx=(x_); \
+ CCTK_REAL8_VEC const x=xx; \
+ CCTK_REAL8_VEC const yy=(y_); \
+ CCTK_REAL8_VEC const y=yy; \
+ CCTK_REAL8_VEC const zz=(z_); \
+ CCTK_REAL8_VEC const z=zz; \
+ /* there is no _mm_srai_epi64(x, 63) */ \
+ CCTK_REAL8_VEC const imask = \
+ (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \
+ (__m128i)_mm_set1_pd(k8one)); \
+ /* (y & ~mask) | (z & mask); imask = ~mask */ \
+ _mm_or_pd(_mm_and_pd(imask, y), _mm_andnot_pd(imask, z)); \
})
#endif