// Vectorise using IBM's Blue Gene/P Double Hummer (Power)

// Use the type double _Complex directly, without introducing a wrapper class
// Use macros instead of inline functions


#include <assert.h>

#ifdef __cplusplus
#  include <builtins.h>
#endif


#define vec8_architecture "Double Hummer"

// Vector type corresponding to CCTK_REAL
#define CCTK_REAL8_VEC double _Complex

// Number of vector elements in a CCTK_REAL_VEC
#define CCTK_REAL8_VEC_SIZE 2


// Create vectors, extract vector elements

#define vec8_set1(a)  (__cmplx(a,a))
#define vec8_set(a,b) (__cmplx(a,b))

#define vec8_elt0(x) (__creal(x))
#define vec8_elt1(x) (__cimag(x))
#define vec8_elt(x_,d)                          \
  ({                                            \
    CCTK_REAL8_VEC const xx=(x_);               \
    CCTK_REAL8_VEC const x=xx;                  \
    CCTK_REAL8 a;                               \
    switch (d) {                                \
    case 0: a=vec8_elt0(x); break;              \
    case 1: a=vec8_elt1(x); break;              \
    }                                           \
    a;                                          \
  })


// Load and store vectors

// Load a vector from memory (aligned and unaligned); this loads from
// a reference to a scalar
#define vec8_load(p)  (__lfpd((CCTK_REAL8 *)&(p)))
#if ! VECTORISE_ALWAYS_USE_ALIGNED_LOADS
#  define vec8_load_off1(p_)                    \
  ({                                            \
    CCTK_REAL8 const& pp=(p_);                  \
    CCTK_REAL8 const& p=pp;                     \
    vec8_set((&p)[0],(&p)[1]);                  \
  })
#else
#if 0
#  define vec8_load_off1(p_)                                    \
  ({                                                            \
    CCTK_REAL8 const& pp=(p_);                                  \
    CCTK_REAL8 const& p=pp;                                     \
    CCTK_REAL8_VEC const lo = __lfxd((CCTK_REAL8 *)(&p-1));     \
    CCTK_REAL8_VEC const hi = __lfxd((CCTK_REAL8 *)(&p+1));     \
    __fpsel(vec8_set(-1.0,+1.0),lo,hi);                         \
  })
#endif
#  define vec8_load_off1(p_)                            \
  ({                                                    \
    CCTK_REAL8 const& pp=(p_);                          \
    CCTK_REAL8 const& p=pp;                             \
    CCTK_REAL8_VEC const lo = vec8_load((&p)[-1]);      \
    CCTK_REAL8_VEC const hi = vec8_load((&p)[+1]);      \
    __fxmr(__fpsel(vec8_set(+1.0,-1.0),lo,hi));         \
  })
#endif
#define vec8_loadu(p_)                          \
  ({                                            \
    CCTK_REAL8 const& pp=(p_);                  \
    CCTK_REAL8 const& p=pp;                     \
    int const off = (ptrdiff_t)&p & 0xf;        \
    off==0 ? vec8_load(p) : vec8_load_off1(p);  \
  })

// Load a vector from memory that may or may not be aligned, as
// decided by the offset and the vector size
#if VECTORISE_ALWAYS_USE_UNALIGNED_LOADS
// Implementation: Always use unaligned load
#  define vec8_loadu_maybe(off,p)             vec8_loadu(p)
#  define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu(p)
#else
#  define vec8_loadu_maybe(off,p_)              \
  ({                                            \
    CCTK_REAL8 const& pp=(p_);                  \
    CCTK_REAL8 const& p=pp;                     \
    (off) % CCTK_REAL8_VEC_SIZE == 0 ?          \
      vec8_load(p) :                            \
      vec8_load_off1(p);                        \
  })
#  if VECTORISE_ALIGNED_ARRAYS
// Assume all array x sizes are multiples of the vector size
#    define vec8_loadu_maybe3(off1,off2,off3,p) vec8_loadu_maybe(off1,p)
#  else
#    define vec8_loadu_maybe3(off1,off2,off3,p_)        \
  ({                                                    \
    CCTK_REAL8 const& pp=(p_);                          \
    CCTK_REAL8 const& p=pp;                             \
    ((off2) % CCTK_REAL8_VEC_SIZE != 0 ||               \
     (off3) % CCTK_REAL8_VEC_SIZE != 0) ?               \
      vec8_loadu(p) :                                   \
      vec8_loadu_maybe(off1,p);                         \
  })
#  endif
#endif

// Store a vector to memory (aligned and non-temporal); this stores to
// a reference to a scalar
#define vec8_store(p,x)     (__stfpd(&(p),x))
#define vec8_storeu(p,x)    (__stfpd(&(p),x)) // this may not work
#define vec8_store_nta(p,x) (__stfpd(&(p),x)) // this doesn't avoid the cache

// Store a lower or higher partial vector (aligned and non-temporal);
// the non-temporal hint is probably ignored
#define vec8_store_nta_partial_lo(p,x,n) ((&(p))[0]=vec8_elt0(x))
#define vec8_store_nta_partial_hi(p,x,n) ((&(p))[1]=vec8_elt1(x))
#define vec8_store_nta_partial_mid(p,x,nlo,nhi) assert(0)


// Functions and operators

// Operators
#define k8neg(x) (__fpneg(x))

#define k8add(x,y) (__fpadd(x,y))
#define k8sub(x,y) (__fpsub(x,y))
#define k8mul(x,y) (__fpmul(x,y))
// Estimate for reciprocal
#define k8inv_init(x) (__fpre(x))
// One Newton iteration for reciprocal
#define k8inv_iter(x_,r_)                               \
  ({                                                    \
    CCTK_REAL8_VEC const xx=(x_);                       \
    CCTK_REAL8_VEC const x=xx;                          \
    CCTK_REAL8_VEC const rr=(r_);                       \
    CCTK_REAL8_VEC const r=rr;                          \
    /* r + r * (vec8_set1(1.0) - x*r); */               \
    k8madd(r, k8nmsub(x, r, vec8_set1(1.0)), r);        \
  })
// Reciprocal: First estimate, then apply two Newton iterations
#define k8inv(x_)                               \
  ({                                            \
    CCTK_REAL8_VEC const xx=(x_);               \
    CCTK_REAL8_VEC const x=xx;                  \
    CCTK_REAL8_VEC const r0 = k8inv_init(x);    \
    CCTK_REAL8_VEC const r1 = k8inv_iter(x,r0); \
    CCTK_REAL8_VEC const r2 = k8inv_iter(x,r1); \
    r2;                                         \
  })
#define k8div(x,y) (__fpmul(x,k8inv(y)))

// Fused multiply-add, defined as [+-]x*y[+-]z
#define k8madd(x,y,z)  (__fpmadd(z,x,y))
#define k8msub(x,y,z)  (__fpmsub(z,x,y))
#define k8nmadd(x,y,z) (__fpnmadd(z,x,y))
#define k8nmsub(x,y,z) (__fpnmsub(z,x,y))

// Cheap functions
#define k8fabs(x) (__fpabs(x))
#define k8fmax(x_,y_)                           \
  ({                                            \
    CCTK_REAL8_VEC const xx=(x_);               \
    CCTK_REAL8_VEC const x=xx;                  \
    CCTK_REAL8_VEC const yy=(y_);               \
    CCTK_REAL8_VEC const y=yy;                  \
    __fpsel(k8sub(y,x),x,y);                    \
  })
#define k8fmin(x_,y_)                           \
  ({                                            \
    CCTK_REAL8_VEC const xx=(x_);               \
    CCTK_REAL8_VEC const x=xx;                  \
    CCTK_REAL8_VEC const yy=(y_);               \
    CCTK_REAL8_VEC const y=yy;                  \
    __fpsel(k8sub(x,y),x,y);                    \
  })
#define k8fnabs(x) (__fpnabs(x))

// Expensive functions
#define K8REPL(f,x_)                            \
  ({                                            \
    CCTK_REAL8_VEC const xx=(x_);               \
    CCTK_REAL8_VEC const x=xx;                  \
    vec8_set(f(vec8_elt0(x)),                   \
             f(vec8_elt1(x)));                  \
  })
#define K8REPL2(f,x_,a_)                        \
  ({                                            \
    CCTK_REAL8_VEC const xx=(x_);               \
    CCTK_REAL8_VEC const x=xx;                  \
    CCTK_REAL8     const aa=(a_);               \
    CCTK_REAL8     const a=aa;                  \
    vec8_set(f(vec8_elt0(x),a),                 \
             f(vec8_elt1(x),a));                \
  })

#define k8exp(x)   K8REPL(exp,x)
#define k8log(x)   K8REPL(log,x)
#define k8pow(x,a) K8REPL2(pow,x,a)
#define k8sqrt(x)  K8REPL(sqrt,x)

#define k8ifpos(x,y,z) (__fpsel(x,z,y))