aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2010-12-07 16:03:09 +0000
committer eschnett <eschnett@105869f7-3296-0410-a4ea-f4349344b45a> 2010-12-07 16:03:09 +0000
commit d728013d0b8c0eec323cee76522f77ff70ec8bab (patch)
tree 6d79e2701fc56c2df3916780f21a0db78fd0ff19
parent 421103bf1df43f250a452df460553f72c824f8db (diff)
Correct vectorised fabs() function for Intel
git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@5 105869f7-3296-0410-a4ea-f4349344b45a
-rw-r--r-- src/vectors-intel-4.h 7
-rw-r--r-- src/vectors-intel-8.h 7
2 files changed, 12 insertions, 2 deletions
diff --git a/src/vectors-intel-4.h b/src/vectors-intel-4.h
index 4549a70..73c90be 100644
--- a/src/vectors-intel-4.h
+++ b/src/vectors-intel-4.h
@@ -110,6 +110,11 @@ static const union {
__m128 v;
} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }};
#define k4sign_mask (k4sign_mask_union.v)
+static const union { // lets the integer bit pattern below be read back as a __m128 vector
+ unsigned i[4];
+ __m128 v;
+} k4abs_mask_union = {{ 0x7fffffffU, 0x7fffffffU, 0x7fffffffU, 0x7fffffffU }}; // every bit set except the top bit of each word (assumes 32-bit unsigned -- TODO confirm)
+#define k4abs_mask (k4abs_mask_union.v) // vector view of the mask; k4fabs ANDs its argument with this
// Operators
#define k4pos(x) (x)
@@ -127,7 +132,7 @@ static const union {
#define k4nmsub(x,y,z) (k4sub(z,k4mul(x,y)))
// Cheap functions
-#define k4fabs(x) (_mm_andnot_ps(x,k4sign_mask))
+#define k4fabs(x) (_mm_and_ps(x,k4abs_mask))
#define k4fmax(x,y) (_mm_max_ps(x,y))
#define k4fmin(x,y) (_mm_min_ps(x,y))
#define k4fnabs(x) (_mm_or_ps(x,k4sign_mask))
diff --git a/src/vectors-intel-8.h b/src/vectors-intel-8.h
index a9e4764..35dffa6 100644
--- a/src/vectors-intel-8.h
+++ b/src/vectors-intel-8.h
@@ -74,6 +74,11 @@ static const union {
__m128d v;
} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }};
#define k8sign_mask (k8sign_mask_union.v)
+static const union { // lets the integer bit pattern below be read back as a __m128d vector
+ unsigned long long i[2];
+ __m128d v;
+} k8abs_mask_union = {{ 0x7fffffffffffffffULL, 0x7fffffffffffffffULL }}; // every bit set except the top (sign) bit of each 64-bit word
+#define k8abs_mask (k8abs_mask_union.v) // vector view of the abs mask (not the sign mask); k8fabs ANDs its argument with this to clear the sign bits
// Operators
#define k8pos(x) (x)
@@ -91,7 +96,7 @@ static const union {
#define k8nmsub(x,y,z) (k8sub(z,k8mul(x,y)))
// Cheap functions
-#define k8fabs(x) (_mm_andnot_pd(x,k8sign_mask))
+#define k8fabs(x) (_mm_and_pd(x,k8abs_mask))
#define k8fmax(x,y) (_mm_max_pd(x,y))
#define k8fmin(x,y) (_mm_min_pd(x,y))
#define k8fnabs(x) (_mm_or_pd(x,k8sign_mask))