From 4b4418d0efb5b06784e89f617701c7b7cd5f6059 Mon Sep 17 00:00:00 2001 From: eschnett Date: Sat, 11 Aug 2012 20:55:54 +0000 Subject: Add ksgn function (vectorised version of Kranc's Sign) All architectures: Add copysign and sgn functions. Remove pos function (which does nothing). Add support for Blue Gene/Q (QPX instructions). Correct errors in AVX instructions. git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@62 105869f7-3296-0410-a4ea-f4349344b45a --- src/test.cc | 62 ++++++--- src/vectors-4-Altivec.h | 1 - src/vectors-4-SSE.h | 82 +++++++++--- src/vectors-4-default.h | 57 ++++---- src/vectors-8-AVX.h | 80 ++++++----- src/vectors-8-DoubleHummer.h | 32 ++++- src/vectors-8-QPX.h | 311 +++++++++++++++++++++++++++++++++++++++++++ src/vectors-8-SSE2.h | 101 +++++++++----- src/vectors-8-VSX.h | 12 +- src/vectors-8-default.h | 56 ++++---- src/vectors.h | 117 ++++++++-------- 11 files changed, 689 insertions(+), 222 deletions(-) create mode 100644 src/vectors-8-QPX.h diff --git a/src/test.cc b/src/test.cc index 924e4ec..73dd9e2 100644 --- a/src/test.cc +++ b/src/test.cc @@ -11,6 +11,14 @@ using namespace std; + + +inline CCTK_REAL my_copysign (CCTK_REAL const x, CCTK_REAL const y) +{ + using namespace std; + return copysign(x, y); +} + inline int my_isnan (CCTK_REAL const x) { return std::isnan(x); @@ -21,6 +29,14 @@ inline int my_signbit (CCTK_REAL const x) return std::signbit(x); } +inline CCTK_REAL my_sgn (CCTK_REAL const x) +{ + using namespace std; + return x == (CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : my_copysign((CCTK_REAL)1.0, x); +} + + + #define SCALARTEST(testname, vecexpr, scalarexpr) \ do { \ if (verbose) { \ @@ -67,6 +83,8 @@ inline int my_signbit (CCTK_REAL const x) } \ } while(0) + + extern "C" void Vectors_Test(CCTK_ARGUMENTS) { @@ -215,27 +233,29 @@ void Vectors_Test(CCTK_ARGUMENTS) VECTEST("knmadd", knmadd(av, bv, cv), -a[i] * b[i] - c[i] ); VECTEST("knmsub", knmsub(av, bv, cv), -a[i] * b[i] + c[i] ); - VECTEST("kacos", kacos(xv), acos(x[i]) ); - VECTEST("kacosh", kacosh(zv), acosh(z[i]) ); - VECTEST("kasin", kasin(xv), asin(x[i]) ); - VECTEST("kasinh", kasinh(xv), asinh(x[i]) ); - VECTEST("katan", katan(xv), atan(x[i]) ); - VECTEST("katan2", katan2(xv, yv), atan2(x[i], y[i]) ); - VECTEST("katanh", katanh(xv), atanh(x[i]) ); - VECTEST("kcos", kcos(xv), cos(x[i]) ); - VECTEST("kcosh", kcosh(xv), cosh(x[i]) ); - VECTEST("kexp", kexp(xv), exp(x[i]) ); - VECTEST("kfabs", kfabs(xv), fabs(x[i]) ); - VECTEST("kfmax", kfmax(xv, yv), fmax(x[i], y[i]) ); - VECTEST("kfmin", kfmin(xv, yv), fmin(x[i], y[i]) ); - VECTEST("kfnabs", kfnabs(xv), -fabs(x[i]) ); - VECTEST("klog", klog(yv), log(y[i]) ); - VECTEST("kpow", kpow(yv, x[0]), pow(y[i], x[0]) ); - VECTEST("ksin", ksin(xv), sin(x[i]) ); - VECTEST("ksinh", ksinh(xv), sinh(x[i]) ); - VECTEST("ksqrt", ksqrt(yv), sqrt(y[i]) ); - VECTEST("ktan", ktan(xv), tan(x[i]) ); - VECTEST("ktanh", ktanh(xv), tanh(x[i]) ); + VECTEST("kacos", kacos(xv), acos(x[i]) ); + VECTEST("kacosh", kacosh(zv), acosh(z[i]) ); + VECTEST("kasin", kasin(xv), asin(x[i]) ); + VECTEST("kasinh", kasinh(xv), asinh(x[i]) ); + VECTEST("katan", katan(xv), atan(x[i]) ); + VECTEST("katan2", katan2(xv, yv), atan2(x[i], y[i]) ); + VECTEST("katanh", katanh(xv), atanh(x[i]) ); + VECTEST("kcopysign", kcopysign(xv, yv), copysign(x[i], y[i])); + VECTEST("kcos", kcos(xv), cos(x[i]) ); + VECTEST("kcosh", kcosh(xv), cosh(x[i]) ); + VECTEST("kexp", kexp(xv), exp(x[i]) ); + VECTEST("kfabs", kfabs(xv), fabs(x[i]) ); + VECTEST("kfmax", kfmax(xv, yv), fmax(x[i], y[i]) ); + VECTEST("kfmin", kfmin(xv, yv), fmin(x[i], y[i]) ); + VECTEST("kfnabs", kfnabs(xv), -fabs(x[i]) ); + VECTEST("klog", klog(yv), log(y[i]) ); + VECTEST("kpow", kpow(yv, x[0]), pow(y[i], x[0]) ); + VECTEST("ksin", ksin(xv), sin(x[i]) ); + VECTEST("ksinh", ksinh(xv), sinh(x[i]) ); + VECTEST("ksgn", ksgn(xv), my_sgn(x[i]) ); + VECTEST("ksqrt", ksqrt(yv), sqrt(y[i]) ); + VECTEST("ktan", ktan(xv), tan(x[i]) ); + VECTEST("ktanh", ktanh(xv), tanh(x[i]) ); VECTEST("kifpos positive", kifpos(av, bv, cv), my_signbit(a[i]) ? c[i] : b[i]); diff --git a/src/vectors-4-Altivec.h b/src/vectors-4-Altivec.h index 78c07ca..d6281c8 100644 --- a/src/vectors-4-Altivec.h +++ b/src/vectors-4-Altivec.h @@ -114,7 +114,6 @@ // Functions and operators // Operators -#define k4pos(x) (+(x)) #define k4neg(x) (-(x)) #define k4add(x,y) ((x)+(y)) diff --git a/src/vectors-4-SSE.h b/src/vectors-4-SSE.h index 68388b6..9af7189 100644 --- a/src/vectors-4-SSE.h +++ b/src/vectors-4-SSE.h @@ -50,6 +50,19 @@ +union k4const_t { + unsigned i[4]; + float f[4]; + __m128i vi; + __m128 vf; +}; + +#define K4_ZERO 0x00000000UL +#define K4_IMIN 0x80000000UL +#define K4_IMAX 0x7fffffffUL + + + // Create vectors, extract vector elements #define vec4_set1(a) (_mm_set1_ps(a)) @@ -272,15 +285,10 @@ // Functions and operators -static const union { - unsigned i[4]; - __m128 v; -} k4sign_mask_union = {{ 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }}; -#define k4sign_mask (k4sign_mask_union.v) +static const k4const_t k4sign_mask = {{ K4_IMIN, K4_IMIN, K4_IMIN, K4_IMIN, }}; // Operators -#define k4pos(x) (x) -#define k4neg(x) (_mm_xor_ps(k4sign_mask,x)) +#define k4neg(x) (_mm_xor_ps(k4sign_mask.vf,x)) // #define k4inv(x) // TODO: provide k4inv via rcp and Newton-Raphson // This is described in AMD's publication 47414. @@ -306,10 +314,24 @@ static const union { #endif // Cheap functions -#define k4fabs(x) (_mm_andnot_ps(k4sign_mask,x)) +#define k4copysign(x,y) \ + (_mm_or_ps(_mm_andnot_ps(k4sign_mask.vf,x), \ + _mm_and_ps(k4sign_mask.vf,y))) +#define k4fabs(x) (_mm_andnot_ps(k4sign_mask.vf,x)) #define k4fmax(x,y) (_mm_max_ps(x,y)) #define k4fmin(x,y) (_mm_min_ps(x,y)) -#define k4fnabs(x) (_mm_or_ps(k4sign_mask,x)) +#define k4fnabs(x) (_mm_or_ps(k4sign_mask.vf,x)) +static const k4const_t k4zero = { f: { 0.0f, 0.0f, 0.0f, 0.0f, }}; +static const k4const_t k4one = { f: { 1.0f, 1.0f, 1.0f, 1.0f, }}; +#define k4sgn(x_) \ + ({ \ + CCTK_REAL_VEC const x__=(x_); \ + CCTK_REAL_VEC const x=x__; \ + CCTK_REAL_VEC const iszero = _mm_cmpeq_ps(k4zero.vf, x); \ + CCTK_REAL_VEC const sign = _mm_and_ps(k4sign_mask.vf, x); \ + CCTK_REAL_VEC const signedone = _mm_or_ps(k4one.vf, sign); \ + k4ifthen(iszero, k4zero.vf, signedone); \ + }) // TODO: maybe use rsqrt and Newton-Raphson #define k4sqrt(x) (_mm_sqrt_ps(x)) @@ -363,16 +385,22 @@ static const union { #define k4tan(x) K4REPL(tanf,x) #define k4tanh(x) K4REPL(tanhf,x) -// Choice [sign(x)>0 ? y : z] +static const k4const_t k4lfalse = {{ +0U, +0U, +0U, +0U, }}; +static const k4const_t k4ltrue = {{ -1U, -1U, -1U, -1U, }}; +#define k4lnot(x) (_mm_xor_ps(k4ltrue,x)) +#define k4land(x,y) (_mm_and_ps(x,y)) +#define k4lor(x,y) (_mm_or_ps(x,y)) +#define k4lxor(x,y) (_mm_xor_ps(x,y)) + #ifdef __SSE4_1__ -# define k4ifmsb(x,y,z) (_mm_blendv_ps(z,y,x)) +# define k4ifthen(x,y,z) (_mm_blendv_ps(z,y,x)) #elif 0 # ifdef __cplusplus -# define k4sgn(x) ({ using namespace std; signbit(x); }) +# define k4signbit(x) ({ using namespace std; signbit(x); }) # else -# define k4sgn(x) (signbitf(x)) +# define k4signbit(x) (signbitf(x)) # endif -# define k4ifmsb(x,y,z) \ +# define k4ifthen(x,y,z) \ ({ \ CCTK_REAL4_VEC const x__=(x_); \ CCTK_REAL4_VEC const y__=(y_); \ @@ -380,13 +408,15 @@ static const union { CCTK_REAL4_VEC const x=x__; \ CCTK_REAL4_VEC const y=y__; \ CCTK_REAL4_VEC const z=z__; \ - vec4_set(k4sgn(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \ - k4sgn(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \ - k4sgn(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \ - k4sgn(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \ + vec4_set(k4signbit(vec4_elt0(x)) ? vec4_elt0(y) : vec4_elt0(z), \ + k4signbit(vec4_elt1(x)) ? vec4_elt1(y) : vec4_elt1(z), \ + k4signbit(vec4_elt2(x)) ? vec4_elt2(y) : vec4_elt2(z), \ + k4signbit(vec4_elt3(x)) ? vec4_elt3(y) : vec4_elt3(z)); \ }) -#else -# define k4ifmsb(x_,y_,z_) \ +#elif 0 +// We don't need to shift -- the condition (mask) will be either all +// zeros or all ones +# define k4ifthen(x_,y_,z_) \ ({ \ CCTK_REAL4_VEC const x__=(x_); \ CCTK_REAL4_VEC const y__=(y_); \ @@ -399,4 +429,16 @@ static const union { /* (z & ~mask) | (y & mask) */ \ _mm_or_ps(_mm_andnot_ps(mask, z), _mm_and_ps(mask, y)); \ }) +#else +# define k4ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL4_VEC const x__=(x_); \ + CCTK_REAL4_VEC const y__=(y_); \ + CCTK_REAL4_VEC const z__=(z_); \ + CCTK_REAL4_VEC const x=x__; \ + CCTK_REAL4_VEC const y=y__; \ + CCTK_REAL4_VEC const z=z__; \ + /* (z & ~mask) | (y & mask) where imask = ~mask */ \ + _mm_or_ps(_mm_and_ps(x, y), _mm_andnot_ps(x, z)); \ + }) #endif diff --git a/src/vectors-4-default.h b/src/vectors-4-default.h index ff675f6..d1b2135 100644 --- a/src/vectors-4-default.h +++ b/src/vectors-4-default.h @@ -64,7 +64,6 @@ // Operators -#define k4pos(x) (+(x)) #define k4neg(x) (-(x)) #define k4add(x,y) ((x)+(y)) @@ -79,32 +78,40 @@ #define k4nmsub(x,y,z) (-(x)*(y)+(z)) // Functions -#define k4acos(x) (acosf(x)) -#define k4acosh(x) (acoshf(x)) -#define k4asin(x) (asinf(x)) -#define k4asinh(x) (asinhf(x)) -#define k4atan(x) (atanf(x)) -#define k4atan2(x,y) (atan2f(x,y)) -#define k4atanh(x) (atanhf(x)) -#define k4cos(x) (cosf(x)) -#define k4cosh(x) (coshf(x)) -#define k4exp(x) (expf(x)) -#define k4fabs(x) (fabsf(x)) -#define k4fmax(x,y) (fmaxf(x,y)) -#define k4fmin(x,y) (fminf(x,y)) -#define k4fnabs(x) (-fabsf(x)) -#define k4log(x) (logf(x)) -#define k4pow(x,a) (powf(x,a)) -#define k4sin(x) (sinf(x)) -#define k4sinh(x) (sinhf(x)) -#define k4sqrt(x) (sqrtf(x)) -#define k4tan(x) (tanf(x)) -#define k4tanh(x) (tanhf(x)) +#define k4acos(x) (acosf(x)) +#define k4acosh(x) (acoshf(x)) +#define k4asin(x) (asinf(x)) +#define k4asinh(x) (asinhf(x)) +#define k4atan(x) (atanf(x)) +#define k4atan2(x,y) (atan2f(x,y)) +#define k4atanh(x) (atanhf(x)) +#define k4copysign(x,y) (copysign(x,y)) +#define k4cos(x) (cosf(x)) +#define k4cosh(x) (coshf(x)) +#define k4exp(x) (expf(x)) +#define k4fabs(x) (fabsf(x)) +#define k4fmax(x,y) (fmaxf(x,y)) +#define k4fmin(x,y) (fminf(x,y)) +#define k4fnabs(x) (-fabsf(x)) +#define k4log(x) (logf(x)) +#define k4pow(x,a) (powf(x,a)) +#define k4sin(x) (sinf(x)) +#define k4sinh(x) (sinhf(x)) +#define k4sqrt(x) (sqrtf(x)) +#define k4tan(x) (tanf(x)) +#define k4tanh(x) (tanhf(x)) + +#define k4sgn(x_) \ + ({ \ + CCTK_REAL x__=(x_); \ + CCTK_REAL x=x__; \ + x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : copysign((CCTK_REAL)1.0, x); \ + }) #ifdef __cplusplus -# define k4sgn(x) ({ using namespace std; signbit(x); }) +# define k4signbit(x) ({ using namespace std; signbit(x); }) #else -# define k4sgn(x) (signbit(x)) +# define k4signbit(x) (signbit(x)) #endif -#define k4ifmsb(x,y,z) (k4sgn(x)?(y):(z)) +#define k4ifthen(x,y,z) (k4signbit(x)?(y):(z)) diff --git a/src/vectors-8-AVX.h b/src/vectors-8-AVX.h index 9e1d98b..825f2d3 100644 --- a/src/vectors-8-AVX.h +++ b/src/vectors-8-AVX.h @@ -35,9 +35,9 @@ union k8const_t { unsigned long long i[4]; - double d[4]; + double f[4]; __m256i vi; - __m256d vd; + __m256d vf; }; #define K8_ZERO 0x0000000000000000ULL @@ -118,12 +118,12 @@ union k8const_t { bool v8stp_all; \ __m256i v8stp_mask; \ ({ \ - ptrdiff_t const imin1=(imin_); \ - ptrdiff_t const imin=imin1; \ - ptrdiff_t const imax1=(imax_); \ - ptrdiff_t const imax=imax1; \ + ptrdiff_t const imin__=(imin_); \ + ptrdiff_t const imin=imin__; \ + ptrdiff_t const imax__=(imax_); \ + ptrdiff_t const imax=imax__; \ \ - v8stp_all = i>=imin and i+CCTK_REAL_VEC_SIZE=imin and i+CCTK_REAL_VEC_SIZE-10 ? y : z] -#define k8ifmsb(x,y,z) (_mm256_blendv_pd(z,y,x)) +static const k8const_t k8lfalse = {{ K8_ZERO, K8_ZERO, K8_ZERO, K8_ZERO, }}; +static const k8const_t k8ltrue = {{ K8_IMIN, K8_IMIN, K8_IMIN, K8_IMIN, }}; +#define k8lnot(x) (_mm256_xor_pd(k8sign_mask,x)) +#define k8land(x,y) (_mm256_and_pd(x,y)) +#define k8lor(x,y) (_mm256_or_pd(x,y)) +#define k8lxor(x,y) (_mm256_xor_pd(x,y)) +#define k8ifthen(x,y,z) (_mm256_blendv_pd(z,y,x)) diff --git a/src/vectors-8-DoubleHummer.h b/src/vectors-8-DoubleHummer.h index b0a1adf..ffb04b4 100644 --- a/src/vectors-8-DoubleHummer.h +++ b/src/vectors-8-DoubleHummer.h @@ -23,6 +23,14 @@ +union k8const_t { + double f[2]; + unsigned long long i[2]; + CCTK_REAL8_VEC vf; +}; + + + // Create vectors, extract vector elements #define vec8_set1(a) (__cmplx(a,a)) @@ -107,7 +115,7 @@ ({ \ CCTK_REAL8 const& p__=(p_); \ CCTK_REAL8 const& p=p__; \ - ((off2) % CCTK_REAL8_VEC_SIZE != 0 || \ + ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ vec8_loadu(p) : \ vec8_loadu_maybe(off1,p); \ @@ -205,6 +213,20 @@ __fpsel(k8sub(x,y),x,y); \ }) #define k8fnabs(x) (__fpnabs(x)) +static const k8const_t k8zero = {{ 0.0, 0.0, }}; +static const k8const_t k8one = {{ +1.0, +1.0, }}; +static const k8const_t k8mone = {{ -1.0, -1.0, }}; +#define k8sgn(x_) \ + ({ \ + CCTK_REAL_VEC x__=(x_); \ + CCTK_REAL_VEC x=x__; \ + /* TODO: this assumes that __fpsel says -0>=+0; \ + if this is not so, we need k8abs(x) instead of x for iszero. */ \ + CCTK_REAL_VEC iszero = k8land(__fpsel(x, k8lfalse.vf, k8ltrue.vf), \ + __fpsel(k8neg(x), k8lfalse.vf, k8ltrue.vf)); \ + CCTK_REAL_VEC signedone = __fpsel(x, k8mone.vf, k8one.vf); \ + k8ifthen(iszero, k8zero.vf, signedone); \ + }) // Estimate for reciprocal square root #define k8rsqrt_init(x) (__fprsqrte(x)) // One Newton iteration for reciprocal square root @@ -279,4 +301,10 @@ #define k8tan(x) K8REPL(tan,x) #define k8tanh(x) K8REPL(tanh,x) -#define k8ifmsb(x,y,z) (__fpsel(x,y,z)) +static const k8const_t k8lfalse = {{ -1.0, -1.0, }}; +static const k8const_t k8ltrue = {{ +1.0, +1.0, }}; +#define k8lnot(x) (__fpneg(x)) +#define k8land(x,y) (k8ifthen(x,y,k8lfalse.vf)) +#define k8lor(x,y) (k8ifthen(x,k8ltrue.vf,y)) +#define k8lxor(x,y) (k8ifthen(x,k8lnot(y),y)) +#define k8ifthen(x,y,z) (__fpsel(x,z,y)) diff --git a/src/vectors-8-QPX.h b/src/vectors-8-QPX.h new file mode 100644 index 0000000..80762fe --- /dev/null +++ b/src/vectors-8-QPX.h @@ -0,0 +1,311 @@ +// Vectorise using IBM's Blue Gene/Q QPX (Power) + +// Use the type vector4double directly, without introducing a wrapper class +// Use macros instead of inline functions + +// Note: bgxlC_r does not like const declarations, so we need to cast +// them away and/or omit them everywhere + + + +#include + +#ifdef __cplusplus +# include +#endif + + + +#define vec8_architecture "QPX" + +// Vector type corresponding to CCTK_REAL +#define CCTK_REAL8_VEC vector4double + +// Number of vector elements in a CCTK_REAL_VEC +#define CCTK_REAL8_VEC_SIZE 4 + + + +union k8const_t { + CCTK_REAL8 f[CCTK_REAL8_VEC_SIZE]; + CCTK_REAL8_VEC vf; +}; + + + +// Create vectors, extract vector elements + +#define vec8_set1(a) (vec_splats(a)) +#define vec8_set(a,b,c,d) ((vector4double){a,b,c,d}) + +#define vec8_elt0(x) (vec_extract(x,0)) +#define vec8_elt1(x) (vec_extract(x,1)) +#define vec8_elt2(x) (vec_extract(x,2)) +#define vec8_elt3(x) (vec_extract(x,3)) +#define vec8_elt(x,d) (vec_extract(x,d)) + + + +// Load and store vectors + +// Load a vector from memory (aligned and unaligned); this loads from +// a reference to a scalar +#define vec8_load(p) (vec_lda(0,(CCTK_REAL8*)&(p))) +#define vec8_loadu(p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8& p = *(CCTK_REAL8*)&p__; \ + CCTK_REAL8_VEC v1, v2, vp; \ + v1 = vec_ld(0,&p); /* load the left part of the vector */ \ + v2 = vec_ld(32,&p); /* load the right part of the vector */ \ + vp = vec_lvsl(0,&p); /* generate control value */ \ + vec_perm(v1,v2,vp); /* generate the aligned vector */ \ + }) + +// Load a vector from memory that may or may not be aligned, as +// decided by the offset and the vector size +#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS +// Implementation: Always use unaligned load +# define vec8_loadu_maybe(off,p) vec8_loadu(p) +# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p) +#else +# define vec8_loadu_maybe(off,p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + (off) % CCTK_REAL8_VEC_SIZE == 0 ? \ + vec8_load(p) : \ + vec8_loadu(p); \ + }) +# if VECTORISE_ALIGNED_ARRAYS +// Assume all array x sizes are multiples of the vector size +# define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p) +# else +# define vec8_loadu_maybe3(off1,off2,off3,p_) \ + ({ \ + CCTK_REAL8 const& p__=(p_); \ + CCTK_REAL8 const& p=p__; \ + ((off2) % CCTK_REAL8_VEC_SIZE != 0 or \ + (off3) % CCTK_REAL8_VEC_SIZE != 0) ? \ + vec8_loadu(p) : \ + vec8_loadu_maybe(off1,p); \ + }) +# endif +#endif + +// Store a vector to memory (aligned and non-temporal); this stores to +// a reference to a scalar +#define vec8_store(p,x) (vec_sta(x,0,&(p))) +#define vec8_storeu(p_,x_) \ + ({ \ + CCTK_REAL8 & p__=(p_); \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8 & p=p__; \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC v1, v2, v3, vp, m1, m2, m3; \ + /* generate insert masks */ \ + vp = vec_lvsr(0,&p); \ + m1 = k8lfalse; \ + m2 = k8ltrue; \ + m3 = vec_perm(m1,m2,vp); \ + /* get existing data */ \ + v1 = vec_ld(0,&p); \ + v2 = vec_ld(32,&p); \ + /* permute and insert */ \ + v3 = vec_perm(x,x,vp); \ + v1 = vec_sel(v1,v3,m3); \ + v2 = vec_sel(v3,v2,m3); \ + /* store data back */ \ + vec_st(0,&p,v1); \ + vec_st(32,&p,v2); \ + }) +#define vec8_store_nta(p,x) (vec_sta(x,0,&(p))) // this doesn't avoid the cache + +// Store a partial vector (aligned and non-temporal) +#define vec8_store_partial_prepare(i,imin_,imax_) \ + bool v8stp_all; \ + CCTK_REAL8_VEC v8stp_mask; \ + ({ \ + ptrdiff_t const imin__=(imin_); \ + ptrdiff_t const imax__=(imax_); \ + ptrdiff_t const imin=imin__; \ + ptrdiff_t const imax=imax__; \ + \ + v8stp_all = i-imin>=0 and i-imax<=-CCTK_REAL8_VEC_SIZE; \ + \ + if (not CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ + CCTK_REAL8_VEC vp_lo, vp_hi, mask_lo, mask_hi; \ + vp_lo = vec_lvsl(i-imin, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \ + mask_lo = (i-imin>=0 ? \ + k8ltrue : \ + vec_perm(k8lfalse, k8ltrue, vp_lo)); \ + vp_hi = vec_lvsl(i-imax, (CCTK_REAL*)CCTK_REAL8_VEC_SIZE); \ + mask_hi = (i-imax<=-CCTK_REAL8_VEC_SIZE ? \ + k8ltrue : \ + vec_perm(k8ltrue, k8lfalse, vp_hi)); \ + v8stp_mask = vec_and(mask_lo, mask_hi); \ + } \ + }) +#define vec8_store_nta_partial(p_,x_) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC x=x__; \ + if (CCTK_BUILTIN_EXPECT(v8stp_all, true)) { \ + vec8_store(p, x); \ + } else { \ + vec8_store(p, vec_sel(vec8_load(p), x, v8stp_mask)); \ + } \ + }) + +// Store a lower or higher partial vector (aligned and non-temporal); +// the non-temporal hint is probably ignored +#define vec8_store_nta_partial_lo(p_,x_,n) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC vp_hi, mask_hi; \ + vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-n, (CCTK_REAL*)0); \ + mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \ + vec8_store(p, vec_sel(vec8_load(p), x, mask_hi)); \ + }) +#define vec8_store_nta_partial_hi(p_,x_,n) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8& p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC vp_lo, mask_lo; \ + vp_lo = vec_lvsl(n, (CCTK_REAL*)0); \ + mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \ + vec8_store(p, vec_sel(vec8_load(p), x, mask_lo)); \ + }) +#define vec8_store_nta_partial_mid(p_,x_,nlo,nhi) \ + ({ \ + CCTK_REAL8& p__=(p_); \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8 p=p__; \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC vp_lo, mask_lo; \ + vp_lo = vec_lvsl(nhi, (CCTK_REAL*)0); \ + mask_lo = vec_perm(k8lfalse, k8ltrue, vp_lo); \ + CCTK_REAL8_VEC vp_hi, mask_hi; \ + vp_hi = vec_lvsl(CCTK_REAL8_VEC_SIZE-nlo, (CCTK_REAL*)0); \ + mask_hi = vec_perm(k8ltrue, k8lfalse, vp_hi); \ + CCTK_REAL8_VEC mask; \ + mask = vec_and(mask_lo, mask_hi); \ + vec8_store(p, vec_sel(vec8_load(p), x, mask)); \ + }) + + + +// Functions and operators + +// Operators +#define k8neg(x) (vec_neg(x)) + +#define k8add(x,y) (vec_add(x,y)) +#define k8sub(x,y) (vec_sub(x,y)) +#define k8mul(x,y) (vec_mul(x,y)) +#define k8div(x,y) (vec_swdiv_nochk(x,y)) + +// Fused multiply-add, defined as [+-]x*y[+-]z +#define k8madd(x,y,z) (vec_madd(z,x,y)) +#define k8msub(x,y,z) (vec_msub(z,x,y)) +#define k8nmadd(x,y,z) (vec_nmadd(z,x,y)) +#define k8nmsub(x,y,z) (vec_nmsub(z,x,y)) + +// Cheap functions +#define k8copysign(x,y) (vec_cpsgn(y,x)) +#define k8fabs(x) (vec_abs(x)) +#define k8fmax(x_,y_) \ + ({ \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8_VEC y__=(y_); \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC y=y__; \ + vec_sel(vec_cmpgt(y,x),y,x); \ + }) +#define k8fmin(x_,y_) \ + ({ \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8_VEC y__=(y_); \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC y=y__; \ + vec_sel(vec_cmplt(y,x),y,x); \ + }) +#define k8fnabs(x) (vec_nabs(x)) +#define k8sgn(x_) \ + ({ \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC one, zero, iszero; \ + one = k8ltrue; \ + zero = vec_sub(one, one); \ + iszero = vec_cmpeq(x, zero); \ + k8ifthen(iszero, zero, vec_cpsgn(one, x)); \ + }) +#define k8sqrt(x) (vec_swsqrt_nochk(x)) + +// Expensive functions +#define K8REPL(f,x_) \ + ({ \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8_VEC x=x__; \ + vec8_set(f(vec8_elt0(x)), \ + f(vec8_elt1(x)), \ + f(vec8_elt2(x)), \ + f(vec8_elt3(x))); \ + }) +#define K8REPL2S(f,x_,a_) \ + ({ \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8 a__=(a_); \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8 a=a__; \ + vec8_set(f(vec8_elt0(x),a), \ + f(vec8_elt1(x),a), \ + f(vec8_elt2(x),a), \ + f(vec8_elt3(x),a)); \ + }) +#define K8REPL2(f,x_,y_) \ + ({ \ + CCTK_REAL8_VEC x__=(x_); \ + CCTK_REAL8_VEC y__=(y_); \ + CCTK_REAL8_VEC x=x__; \ + CCTK_REAL8_VEC y=y__; \ + vec8_set(f(vec8_elt0(x),vec8_elt0(y)), \ + f(vec8_elt1(x),vec8_elt1(y)), \ + f(vec8_elt2(x),vec8_elt2(y)), \ + f(vec8_elt3(x),vec8_elt3(y))); \ + }) + +#define k8acos(x) K8REPL(acos,x) +#define k8acosh(x) K8REPL(acosh,x) +#define k8asin(x) K8REPL(asin,x) +#define k8asinh(x) K8REPL(asinh,x) +#define k8atan(x) K8REPL(atan,x) +#define k8atan2(x,y) K8REPL2(atan2,x,y) +#define k8atanh(x) K8REPL(atanh,x) +#define k8cos(x) K8REPL(cos,x) +#define k8cosh(x) K8REPL(cosh,x) +#define k8exp(x) K8REPL(exp,x) +#define k8log(x) K8REPL(log,x) +#define k8pow(x,a) K8REPL2S(pow,x,a) +#define k8sin(x) K8REPL(sin,x) +#define k8sinh(x) K8REPL(sinh,x) +#define k8tan(x) K8REPL(tan,x) +#define k8tanh(x) K8REPL(tanh,x) + +#define k8lfalse \ + ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0x0); }) +#define k8ltrue \ + ({ CCTK_REAL8_VEC dummy; vec_logical(dummy,dummy,0xf); }) +#define k8lnot(x) (vec_not(x)) +#define k8land(x,y) (vec_and(x,y)) +#define k8lor(x,y) (vec_or(x,y)) +#define k8lxor(x,y) (vec_xor(x,y)) +#define k8ifthen(x,y,z) (vec_sel(z,x,y)) diff --git a/src/vectors-8-SSE2.h b/src/vectors-8-SSE2.h index 4138a18..0b301e8 100644 --- a/src/vectors-8-SSE2.h +++ b/src/vectors-8-SSE2.h @@ -60,6 +60,17 @@ +union k8const_t { + long long i[2]; + double f[2]; + __m128i vi; + __m128d vf; +}; + +#define K8_IMIN ((long long)0x8000000000000000ULL) + + + // Create vectors, extract vector elements #define vec8_set1(a) (_mm_set1_pd(a)) @@ -216,16 +227,7 @@ // Functions and operators -// static const union { -// unsigned long long i[2]; -// __m128d v; -// } k8all_mask_union = {{ 0xfffffffffffffffULL, 0xfffffffffffffffULL }}; -// #define k8all_mask (k8all_mask_union.v) -static const union { - unsigned long long i[2]; - __m128d v; -} k8sign_mask_union = {{ 0x8000000000000000ULL, 0x8000000000000000ULL }}; -#define k8sign_mask (k8sign_mask_union.v) +static const k8const_t k8sign_mask = {{ K8_IMIN, K8_IMIN, }}; // Operators @@ -246,7 +248,7 @@ static const union { // #define k8or(x,y) (_mm_or_pd(x,y)) // #define k8xor(x,y) (_mm_xor_pd(x,y)) -#define k8neg(x) (_mm_xor_pd(k8sign_mask,x)) +#define k8neg(x) (_mm_xor_pd(k8sign_mask.vf,x)) #define k8add(x,y) (_mm_add_pd(x,y)) #define k8sub(x,y) (_mm_sub_pd(x,y)) @@ -267,10 +269,24 @@ static const union { #endif // Cheap functions -#define k8fabs(x) (_mm_andnot_pd(k8sign_mask,x)) +#define k8copysign(x,y) \ + (_mm_or_pd(_mm_andnot_pd(k8sign_mask.vf,x), \ + _mm_and_pd(k8sign_mask.vf,y))) +#define k8fabs(x) (_mm_andnot_pd(k8sign_mask.vf,x)) #define k8fmax(x,y) (_mm_max_pd(x,y)) #define k8fmin(x,y) (_mm_min_pd(x,y)) -#define k8fnabs(x) (_mm_or_pd(k8sign_mask,x)) +#define k8fnabs(x) (_mm_or_pd(k8sign_mask.vf,x)) +static const k8const_t k8zero = { f: { 0.0, 0.0, }}; +static const k8const_t k8one = { f: { 1.0, 1.0, }}; +#define k8sgn(x_) \ + ({ \ + CCTK_REAL_VEC const x__=(x_); \ + CCTK_REAL_VEC const x=x__; \ + CCTK_REAL_VEC const iszero = _mm_cmpeq_pd(k8zero.vf, x); \ + CCTK_REAL_VEC const sign = _mm_and_pd(k8sign_mask.vf, x); \ + CCTK_REAL_VEC const signedone = _mm_or_pd(k8one.vf, sign); \ + k8ifthen(iszero, k8zero.vf, signedone); \ + }) #define k8sqrt(x) (_mm_sqrt_pd(x)) // Expensive functions @@ -317,12 +333,18 @@ static const union { #define k8tan(x) K8REPL(tan,x) #define k8tanh(x) K8REPL(tanh,x) -// Choice [sign(x)>0 ? y : z] +static const k8const_t k8lfalse = {{ +0LL, +0LL, }}; +static const k8const_t k8ltrue = {{ -1LL, -1LL, }}; +#define k8lnot(x) (_mm_xor_pd(k8ltrue,x)) +#define k8land(x,y) (_mm_and_pd(x,y)) +#define k8lor(x,y) (_mm_or_pd(x,y)) +#define k8lxor(x,y) (_mm_xor_pd(x,y)) + #ifdef __SSE4_1__ -# define k8ifmsb(x,y,z) (_mm_blendv_pd(z,y,x)) +# define k8ifthen(x,y,z) (_mm_blendv_pd(z,y,x)) #elif 0 -// This is slow -# define k8ifmsb(x_,y_,z_) \ +// This is slow (but this is what Intel/PGI produce by themselves) +# define k8ifthen(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -342,11 +364,26 @@ static const union { }) #elif 0 # ifdef __cplusplus -# define k8sgn(x) ({ using namespace std; signbit(x); }) +# define k8signbit(x) ({ using namespace std; signbit(x); }) # else -# define k4sgn(x) (signbit(x)) +# define k8signbit(x) (signbit(x)) # endif -# define k8ifmsb(x_,y_,z_) \ +# define k8ifthen(x_,y_,z_) \ + ({ \ + CCTK_REAL8_VEC const x__=(x_); \ + CCTK_REAL8_VEC const y__=(y_); \ + CCTK_REAL8_VEC const z__=(z_); \ + CCTK_REAL8_VEC const x=x__; \ + CCTK_REAL8_VEC const y=y__; \ + CCTK_REAL8_VEC const z=z__; \ + vec8_set(k8signbit(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ + k8signbit(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \ + }) +#elif 0 +// We don't need to shift -- the condition (mask) will be either all +// zeros or all ones +static const k8const_t k8ione = {{ 0x1ULL, 0x1ULL, }}; +# define k8ifthen(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -354,16 +391,16 @@ static const union { CCTK_REAL8_VEC const x=x__; \ CCTK_REAL8_VEC const y=y__; \ CCTK_REAL8_VEC const z=z__; \ - vec8_set(k8sgn(vec8_elt0(x)) ? vec8_elt0(y) : vec8_elt0(z), \ - k8sgn(vec8_elt1(x)) ? vec8_elt1(y) : vec8_elt1(z)); \ + /* there is no _mm_srai_epi64(x, 63); we therefore calculate srli(x)-1 */ \ + __m128i const x_int = *(__m128i const*)&x; \ + __m128i const imask_int = \ + _mm_sub_epi64(_mm_srli_epi64(x_int, 63), k8ione.vi); \ + CCTK_REAL8_VEC const imask = *(CCTK_REAL8_VEC const*)&imask_int; \ + /* (z & ~mask) | (y & mask) where imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ }) #else -static const union { - unsigned long long i; - double d; -} k8one_union = { 0x1ULL }; -# define k8one (k8one_union.d) -# define k8ifmsb(x_,y_,z_) \ +# define k8ifthen(x_,y_,z_) \ ({ \ CCTK_REAL8_VEC const x__=(x_); \ CCTK_REAL8_VEC const y__=(y_); \ @@ -371,11 +408,7 @@ static const union { CCTK_REAL8_VEC const x=x__; \ CCTK_REAL8_VEC const y=y__; \ CCTK_REAL8_VEC const z=z__; \ - /* there is no _mm_srai_epi64(x, 63) */ \ - CCTK_REAL8_VEC const imask = \ - (__m128d)_mm_sub_epi64(_mm_srli_epi64((__m128i)x, 63), \ - (__m128i)_mm_set1_pd(k8one)); \ - /* (z & ~mask) | (y & mask); imask = ~mask */ \ - _mm_or_pd(_mm_and_pd(imask, z), _mm_andnot_pd(imask, y)); \ + /* (z & ~mask) | (y & mask) where imask = ~mask */ \ + _mm_or_pd(_mm_and_pd(x, y), _mm_andnot_pd(x, z)); \ }) #endif diff --git a/src/vectors-8-VSX.h b/src/vectors-8-VSX.h index 8007bb2..07f19d6 100644 --- a/src/vectors-8-VSX.h +++ b/src/vectors-8-VSX.h @@ -129,5 +129,13 @@ #define k8tan(x) K8REPL(tan,x) #define k8tanh(x) K8REPL(tanh,x) -#define k8ifmsb(x,y,z) \ - (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63))) +/* #define k8ifmsb(x,y,z) \ */ +/* (vec_sel((z), (y), vec_sra(vec_convert((x), &(vector long long*)0), 63))) */ + +#define k8lfalse (vec_splats(+0LL)) +#define k8ltrue (vec_splats(-1LL)) +#define k8lnot(x) (vec_xor(x,k8ltrue)) +#define k8land(x,y,z) (vec_and(x,y)) +#define k8lor(x,y,z) (vec_or(x,y)) +#define k8lxor(x,y,z) (vec_xor(x,y)) +#define k8ifthen(x,y,z) (vec_sel(z,y,x)) diff --git a/src/vectors-8-default.h b/src/vectors-8-default.h index 94d9ed4..e9e2734 100644 --- a/src/vectors-8-default.h +++ b/src/vectors-8-default.h @@ -76,32 +76,40 @@ #define k8nmsub(x,y,z) (-(x)*(y)+(z)) // Functions -#define k8acos(x) (acos(x)) -#define k8acosh(x) (acosh(x)) -#define k8asin(x) (asin(x)) -#define k8asinh(x) (asinh(x)) -#define k8atan(x) (atan(x)) -#define k8atan2(x,y) (atan2(x,y)) -#define k8atanh(x) (atanh(x)) -#define k8cos(x) (cos(x)) -#define k8cosh(x) (cosh(x)) -#define k8exp(x) (exp(x)) -#define k8fabs(x) (fabs(x)) -#define k8fmax(x,y) (fmax(x,y)) -#define k8fmin(x,y) (fmin(x,y)) -#define k8fnabs(x) (-fabs(x)) -#define k8log(x) (log(x)) -#define k8pow(x,a) (pow(x,a)) -#define k8sin(x) (sin(x)) -#define k8sinh(x) (sinh(x)) -#define k8sqrt(x) (sqrt(x)) -#define k8tan(x) (tan(x)) -#define k8tanh(x) (tanh(x)) +#define k8acos(x) (acos(x)) +#define k8acosh(x) (acosh(x)) +#define k8asin(x) (asin(x)) +#define k8asinh(x) (asinh(x)) +#define k8atan(x) (atan(x)) +#define k8atan2(x,y) (atan2(x,y)) +#define k8atanh(x) (atanh(x)) +#define k8copysign(x,y) (copysign(x,y)) +#define k8cos(x) (cos(x)) +#define k8cosh(x) (cosh(x)) +#define k8exp(x) (exp(x)) +#define k8fabs(x) (fabs(x)) +#define k8fmax(x,y) (fmax(x,y)) +#define k8fmin(x,y) (fmin(x,y)) +#define k8fnabs(x) (-fabs(x)) +#define k8log(x) (log(x)) +#define k8pow(x,a) (pow(x,a)) +#define k8sin(x) (sin(x)) +#define k8sinh(x) (sinh(x)) +#define k8sqrt(x) (sqrt(x)) +#define k8tan(x) (tan(x)) +#define k8tanh(x) (tanh(x)) + +#define k8sgn(x_) \ + ({ \ + CCTK_REAL x__=(x_); \ + CCTK_REAL x=x__; \ + x==(CCTK_REAL)0.0 ? (CCTK_REAL)0.0 : copysign((CCTK_REAL)1.0, x); \ + }) #ifdef __cplusplus -# define k8sgn(x) ({ using namespace std; signbit(x); }) +# define k8signbit(x) ({ using namespace std; signbit(x); }) #else -# define k8sgn(x) (signbit(x)) +# define k8signbit(x) (signbit(x)) #endif -#define k8ifmsb(x,y,z) (k8sgn(x)?(y):(z)) +#define k8ifthen(x,y,z) (k8signbit(x)?(y):(z)) diff --git a/src/vectors.h b/src/vectors.h index 4443212..b1bce6f 100644 --- a/src/vectors.h +++ b/src/vectors.h @@ -22,10 +22,12 @@ # else # include "vectors-8-SSE2.h" # endif -# elif defined(_ARCH_450D) // Blue Gene/P Double Hummer -# include "vectors-8-DoubleHummer.h" +# elif defined(_ARCH_QP) // Blue Gene/Q QPX +# include "vectors-8-QPX.h" # elif defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Power VSX # include "vectors-8-VSX.h" +# elif defined(_ARCH_450D) // Blue Gene/P Double Hummer +# include "vectors-8-DoubleHummer.h" # endif #endif @@ -67,7 +69,6 @@ # define vec_store_nta_partial_hi vec4_store_nta_partial_hi # define vec_store_nta_partial_mid vec4_store_nta_partial_mid -# define kpos k4pos # define kneg k4neg # define kadd k4add @@ -80,29 +81,31 @@ # define knmadd k4nmadd # define knmsub k4nmsub -# define kacos k4acos -# define kacosh k4acosh -# define kasin k4asin -# define kasinh k4asinh -# define katan k4atan -# define katan2 k4atan2 -# define katanh k4atanh -# define kcos k4cos -# define kcosh k4cosh -# define kexp k4exp -# define kfabs k4fabs -# define kfmax k4fmax -# define kfmin k4fmin -# define kfnabs k4fnabs -# define klog k4log -# define kpow k4pow -# define ksin k4sin -# define ksinh k4sinh -# define ksqrt k4sqrt -# define ktan k4tan -# define ktanh k4tanh - -# define kifmsb k4ifmsb +# define kacos k4acos +# define kacosh k4acosh +# define kasin k4asin +# define kasinh k4asinh +# define katan k4atan +# define katan2 k4atan2 +# define katanh k4atanh +# define kcopysign k4copysign +# define kcos k4cos +# define kcosh k4cosh +# define kexp k4exp +# define kfabs k4fabs +# define kfmax k4fmax +# define kfmin k4fmin +# define kfnabs k4fnabs +# define klog k4log +# define kpow k4pow +# define ksin k4sin +# define ksinh k4sinh +# define ksgn k4sgn +# define ksqrt k4sqrt +# define ktan k4tan +# define ktanh k4tanh + +# define kifthen k4ifthen #elif defined(CCTK_REAL_PRECISION_8) @@ -141,29 +144,31 @@ # define knmadd k8nmadd # define knmsub k8nmsub -# define kacos k8acos -# define kacosh k8acosh -# define kasin k8asin -# define kasinh k8asinh -# define katan k8atan -# define katan2 k8atan2 -# define katanh k8atanh -# define kcos k8cos -# define kcosh k8cosh -# define kexp k8exp -# define kfabs k8fabs -# define kfmax k8fmax -# define kfmin k8fmin -# define kfnabs k8fnabs -# define klog k8log -# define kpow k8pow -# define ksin k8sin -# define ksinh k8sinh -# define ksqrt k8sqrt -# define ktan k8tan -# define ktanh k8tanh - -# define kifmsb k8ifmsb +# define kacos k8acos +# define kacosh k8acosh +# define kasin k8asin +# define kasinh k8asinh +# define katan k8atan +# define katan2 k8atan2 +# define katanh k8atanh +# define kcopysign k8copysign +# define kcos k8cos +# define kcosh k8cosh +# define kexp k8exp +# define kfabs k8fabs +# define kfmax k8fmax +# define kfmin k8fmin +# define kfnabs k8fnabs +# define klog k8log +# define kpow k8pow +# define ksin k8sin +# define ksinh k8sinh +# define ksgn k8sgn +# define ksqrt k8sqrt +# define ktan k8tan +# define ktanh k8tanh + +# define kifthen k8ifthen #else @@ -173,6 +178,7 @@ +#define kifmsb(a,b,c) kifthen(a,b,c) #define kifneg(a,b,c) kifmsb(a,b,c) #define kifpos(a,b,c) kifmsb(a,c,b) @@ -218,10 +224,6 @@ struct vecprops { { return x; } - static inline vector_t pos (vector_t const& x) - { - return +x; - } static inline vector_t neg (vector_t const& x) { return -x; @@ -264,10 +266,6 @@ struct vecprops { { return vec4_elt(x,d); } - static inline vector_t pos (vector_t const& x) - { - return k4pos(x); - } static inline vector_t neg (vector_t const& x) { return k4neg(x); @@ -345,11 +343,8 @@ struct vecprops { # define KRANC_DIFF_FUNCTIONS # endif -# undef Sign -# define Sign(x) -999999999 // poison - # undef ToReal -# define ToReal(x) (vec_set1((CCTK_REAL)(x))) +# define ToReal(x) (vec_set1(CCTK_REAL(x))) # undef KRANC_GFOFFSET3D # define KRANC_GFOFFSET3D(var,i,j,k) \ -- cgit v1.2.3