From 96b656fc2d33f2d5238db7e8de17b11c5893084b Mon Sep 17 00:00:00 2001 From: eschnett Date: Mon, 29 Nov 2010 16:40:49 +0000 Subject: Add initial implementation git-svn-id: https://svn.cct.lsu.edu/repos/numrel/LSUThorns/Vectors/trunk@3 105869f7-3296-0410-a4ea-f4349344b45a --- doc/documentation.tex | 144 +++++++++++++++ src/indirect/vectors-default.hh | 226 +++++++++++++++++++++++ src/indirect/vectors-intel.hh | 390 ++++++++++++++++++++++++++++++++++++++++ src/indirect/vectors-power.hh | 360 +++++++++++++++++++++++++++++++++++++ src/indirect/vectors-pseudo.hh | 183 +++++++++++++++++++ src/indirect/vectors.hh | 19 ++ 6 files changed, 1322 insertions(+) create mode 100644 doc/documentation.tex create mode 100644 src/indirect/vectors-default.hh create mode 100644 src/indirect/vectors-intel.hh create mode 100644 src/indirect/vectors-power.hh create mode 100644 src/indirect/vectors-pseudo.hh create mode 100644 src/indirect/vectors.hh diff --git a/doc/documentation.tex b/doc/documentation.tex new file mode 100644 index 0000000..55a3762 --- /dev/null +++ b/doc/documentation.tex @@ -0,0 +1,144 @@ +% *======================================================================* +% Cactus Thorn template for ThornGuide documentation +% Author: Ian Kelley +% Date: Sun Jun 02, 2002 +% $Header$ +% +% Thorn documentation in the latex file doc/documentation.tex +% will be included in ThornGuides built with the Cactus make system. +% The scripts employed by the make system automatically include +% pages about variables, parameters and scheduling parsed from the +% relevant thorn CCL files. +% +% This template contains guidelines which help to assure that your +% documentation will be correctly added to ThornGuides. More +% information is available in the Cactus UsersGuide. +% +% Guidelines: +% - Do not change anything before the line +% % START CACTUS THORNGUIDE", +% except for filling in the title, author, date, etc. fields. +% - Each of these fields should only be on ONE line. +% - Author names should be separated with a \\ or a comma. +% - You can define your own macros, but they must appear after +% the START CACTUS THORNGUIDE line, and must not redefine standard +% latex commands. +% - To avoid name clashes with other thorns, 'labels', 'citations', +% 'references', and 'image' names should conform to the following +% convention: +% ARRANGEMENT_THORN_LABEL +% For example, an image wave.eps in the arrangement CactusWave and +% thorn WaveToyC should be renamed to CactusWave_WaveToyC_wave.eps +% - Graphics should only be included using the graphicx package. +% More specifically, with the "\includegraphics" command. Do +% not specify any graphic file extensions in your .tex file. This +% will allow us to create a PDF version of the ThornGuide +% via pdflatex. +% - References should be included with the latex "\bibitem" command. +% - Use \begin{abstract}...\end{abstract} instead of \abstract{...} +% - Do not use \appendix, instead include any appendices you need as +% standard sections. +% - For the benefit of our Perl scripts, and for future extensions, +% please use simple latex. 
+%
+% *======================================================================*
+%
+% Example of including a graphic image:
+%    \begin{figure}[ht]
+%      \begin{center}
+%        \includegraphics[width=6cm]{MyArrangement_MyThorn_MyFigure}
+%      \end{center}
+%      \caption{Illustration of this and that}
+%      \label{MyArrangement_MyThorn_MyLabel}
+%    \end{figure}
+%
+% Example of using a label:
+%   \label{MyArrangement_MyThorn_MyLabel}
+%
+% Example of a citation:
+%   \cite{MyArrangement_MyThorn_Author99}
+%
+% Example of including a reference
+%   \bibitem{MyArrangement_MyThorn_Author99}
+%   {J. Author, {\em The Title of the Book, Journal, or periodical}, 1 (1999),
+%   1--16. {\tt http://www.nowhere.com/}}
+%
+% *======================================================================*
+
+% If you are using CVS use this line to give version information
+% $Header$
+
+\documentclass{article}
+
+% Use the Cactus ThornGuide style file
+% (Automatically used from Cactus distribution, if you have a
+%  thorn without the Cactus Flesh download this from the Cactus
+%  homepage at www.cactuscode.org)
+\usepackage{../../../../doc/latex/cactus}
+
+\begin{document}
+
+% The author of the documentation
+\author{Erik Schnetter \textless schnetter@cct.lsu.edu\textgreater}
+
+% The title of the document (not necessarily the name of the Thorn)
+\title{Vectors}
+
+% the date your document was last changed, if your document is in CVS,
+% please use:
+%    \date{$ $Date: 2004-01-07 14:12:39 -0600 (Wed, 07 Jan 2004) $ $}
+\date{November 24, 2010}
+
+\maketitle
+
+% Do not delete next line
+% START CACTUS THORNGUIDE
+
+% Add all definitions used in this documentation here
+%   \def\mydef etc
+
+% Add an abstract for this thorn's documentation
+\begin{abstract}
+
+\end{abstract}
+
+% The following sections are suggestive only.
+% Remove them or add your own.
+
+\section{Introduction}
+
+\section{Physical System}
+
+\section{Numerical Implementation}
+
+\section{Using This Thorn}
+
+\subsection{Obtaining This Thorn}
+
+\subsection{Basic Usage}
+
+\subsection{Special Behaviour}
+
+\subsection{Interaction With Other Thorns}
+
+\subsection{Examples}
+
+\subsection{Support and Feedback}
+
+\section{History}
+
+\subsection{Thorn Source Code}
+
+\subsection{Thorn Documentation}
+
+\subsection{Acknowledgements}
+
+
+\begin{thebibliography}{9}
+
+\end{thebibliography}
+
+% Do not delete next line
+% END CACTUS THORNGUIDE
+
+\end{document}
diff --git a/doc/documentation.tex b/doc/documentation.tex
deleted? no
diff --git a/src/indirect/vectors-default.hh b/src/indirect/vectors-default.hh
new file mode 100644
index 0000000..9a1e1d3
--- /dev/null
+++ b/src/indirect/vectors-default.hh
@@ -0,0 +1,226 @@
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+
+using namespace std;
+
+
+
+// A class template that provides a vectorised type for the underlying
+// scalar type T. This implementation does "nothing", i.e. it just
+// provides a "vector" class with a vector size of 1, forwarding all
+// operations to the scalar type.
+//
+// This implementation uses small integers in several places, e.g. in
+// the [] operator. This is efficient only if these integers are
+// compile time constants, so that the compiler can remove the
+// corresponding if and switch statements.
+template<typename T>
+struct vec_t {
+
+  // Names for the underlying scalar type, and for the vector type
+  // used to implement this class. For example, with SSE2, it would
+  // be scalar_t=double and impl_t=__m128d.
+  typedef T scalar_t;
+  typedef T impl_t;
+
+  // The payload -- the actual vector content
+  impl_t v;
+
+  // Vector size (number of elements)
+  static inline size_t size()
+  {
+    return sizeof(impl_t)/sizeof(scalar_t);
+  }
+
+  // Constructors
+  inline vec_t ()
+  {
+  }
+  inline vec_t (scalar_t const& a)
+    : v(a)
+  {
+  }
+
+  // Convert to the implementation vector type
+  inline operator impl_t () const
+  {
+    return v;
+  }
+
+  // Access individual vector elements
+  inline scalar_t operator[] (size_t const d) const
+  {
+    return v;
+  }
+
+  // Load vectors from memory. For convenience when using this class,
+  // these accept references to the scalar type instead of pointers to
+  // the vector type. These routines are static members of the class,
+  // so that they can be used as VEC::load(p); if they were
+  // stand-alone functions, one would have to write load<VEC>(p)
+  // instead.
+  //
+  // Aligned load
+  static inline vec_t load (scalar_t const& p)
+  {
+    return p;
+  }
+  // Unaligned load
+  static inline vec_t loadu (scalar_t const& p)
+  {
+    return p;
+  }
+  // Load a vector from memory that may or may not be aligned, as
+  // decided by the offset and the vector size. These functions are
+  // useful e.g. for loading neighbouring grid points while
+  // evaluating finite differencing stencils.
+  static inline vec_t loadu_maybe (int const off, scalar_t const& p)
+  {
+    return p;
+  }
+  static inline vec_t loadu_maybe3 (int const off0, int const off1,
+                                    int const off2,
+                                    scalar_t const& p)
+  {
+    return p;
+  }
+};
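+
+// Usage sketch (not compiled in): evaluating a centred first
+// derivative in the x direction with loadu_maybe3, so that the
+// aligned load path is taken whenever the x offset permits it. The
+// index helper "ind3d" and the grid array "u" are hypothetical names
+// for illustration only; CCTK_REAL is the Cactus scalar type.
+#if 0
+typedef vec_t<CCTK_REAL> VEC;
+VEC diffx (CCTK_REAL const* u, int const i, int const j, int const k,
+           int const di, CCTK_REAL const idx)
+{
+  // the two loads are offset by +/-di in the x direction only
+  VEC const up = VEC::loadu_maybe3(+di,0,0, u[ind3d(i+di,j,k)]);
+  VEC const um = VEC::loadu_maybe3(-di,0,0, u[ind3d(i-di,j,k)]);
+  return VEC(0.5*idx) * (up - um);
+}
+#endif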
+
+// Store vectors to memory. These routines are stand-alone functions,
+// so that they can be used as vec_store(p,x); if they were class
+// members, one would have to write x.store(p) instead, or possibly
+// VEC::store(p,x).
+//
+// Aligned store
+template<typename T>
+inline void vec_store (typename vec_t<T>::scalar_t& p, vec_t<T> const& x)
+{
+  p=x.v;
+}
+// Unaligned store
+template<typename T>
+inline void vec_storeu (typename vec_t<T>::scalar_t& p, vec_t<T> const& x)
+{
+  p=x.v;
+}
+// Non-temporal store, i.e. a store that bypasses the cache
+template<typename T>
+inline void vec_store_nta (typename vec_t<T>::scalar_t& p, vec_t<T> const& x)
+{
+  p=x.v;
+}
+// Store the cnt lower elements of a vector, bypassing the cache if
+// possible. Since this implementation has a vector size of 1, a
+// partial store with 0<cnt<size() can never occur, hence the assert.
+template<typename T>
+inline void vec_store_nta_partial_lo (typename vec_t<T>::scalar_t& p,
+                                      vec_t<T> const& x,
+                                      size_t const cnt)
+{
+  assert(0);
+}
+// Store the cnt higher elements of a vector, bypassing the cache if
+// possible. This stores the vector elements into memory locations as
+// if element 0 were stored at p.
+template<typename T>
+inline void vec_store_nta_partial_hi (typename vec_t<T>::scalar_t& p,
+                                      vec_t<T> const& x,
+                                      size_t const cnt)
+{
+  assert(0);
+}
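+
+// Usage sketch (not compiled in): copying an array in full vectors,
+// then handling the remainder with a partial store. This assumes the
+// arrays are padded to a full vector and suitably aligned for the
+// non-temporal store; the vectorised implementations support partial
+// stores, while the scalar fallback above only asserts.
+#if 0
+void vec_copy (CCTK_REAL* dst, CCTK_REAL const* src, size_t const n)
+{
+  typedef vec_t<CCTK_REAL> VEC;
+  size_t i;
+  for (i=0; i+VEC::size()<=n; i+=VEC::size()) {
+    vec_store_nta(dst[i], VEC::load(src[i]));
+  }
+  // store only the remaining n-i low elements of the last vector
+  if (i<n) vec_store_nta_partial_lo(dst[i], VEC::load(src[i]), n-i);
+}
+#endif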
+
+template<typename T>
+inline vec_t<T> operator+ (vec_t<T> const& x)
+{
+  return +x.v;
+}
+template<typename T>
+inline vec_t<T> operator- (vec_t<T> const& x)
+{
+  return -x.v;
+}
+
+template<typename T>
+inline vec_t<T> operator+ (vec_t<T> const& x, vec_t<T> const& y)
+{
+  return x.v + y.v;
+}
+template<typename T>
+inline vec_t<T> operator- (vec_t<T> const& x, vec_t<T> const& y)
+{
+  return x.v - y.v;
+}
+template<typename T>
+inline vec_t<T> operator* (vec_t<T> const& x, vec_t<T> const& y)
+{
+  return x.v * y.v;
+}
+template<typename T>
+inline vec_t<T> operator/ (vec_t<T> const& x, vec_t<T> const& y)
+{
+  return x.v / y.v;
+}
+
+template<typename T>
+inline vec_t<T>& operator+= (vec_t<T>& x, vec_t<T> const& y)
+{
+  x.v += y.v;
+  return x;
+}
+template<typename T>
+inline vec_t<T>& operator-= (vec_t<T>& x, vec_t<T> const& y)
+{
+  x.v -= y.v;
+  return x;
+}
+template<typename T>
+inline vec_t<T>& operator*= (vec_t<T>& x, vec_t<T> const& y)
+{
+  x.v *= y.v;
+  return x;
+}
+template<typename T>
+inline vec_t<T>& operator/= (vec_t<T>& x, vec_t<T> const& y)
+{
+  x.v /= y.v;
+  return x;
+}
+
+// Choose between two vectors based on a scalar condition (provided
+// here as well so that all implementations offer the same interface)
+template<typename T>
+inline vec_t<T> ifthen (bool const b, vec_t<T> const& x, vec_t<T> const& y)
+{
+  return b ? x : y;
+}
+
+template<typename T>
+inline vec_t<T> exp (vec_t<T> const& x)
+{
+  return exp(x.v);
+}
+template<typename T>
+inline vec_t<T> fabs (vec_t<T> const& x)
+{
+  return fabs(x.v);
+}
+template<typename T>
+inline vec_t<T> log (vec_t<T> const& x)
+{
+  return log(x.v);
+}
+template<typename T>
+inline vec_t<T> sqrt (vec_t<T> const& x)
+{
+  return sqrt(x.v);
+}
+
+template<typename T>
+inline vec_t<T> fmax (vec_t<T> const& x, vec_t<T> const& y)
+{
+  return fmax(x.v, y.v);
+}
+template<typename T>
+inline vec_t<T> fmin (vec_t<T> const& x, vec_t<T> const& y)
+{
+  return fmin(x.v, y.v);
+}
+template<typename T>
+inline vec_t<T> pow (vec_t<T> const& x, typename vec_t<T>::scalar_t const& a)
+{
+  return pow(x.v, a);
+}
+template<typename T>
+inline vec_t<T> pow (vec_t<T> const& x, int const& i)
+{
+  return pow(x.v, i);
+}
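+
+// Minimal self-test sketch (not compiled in): the scalar fallback
+// must behave exactly like the underlying scalar type.
+#if 0
+inline void vec_t_self_test ()
+{
+  vec_t<double> a(2.0);
+  vec_t<double> const b(3.0);
+  a += b;                       // every element becomes 5.0
+  assert(a[0] == 5.0);
+  assert(fmax(a,b)[0] == 5.0);
+  assert(pow(b,2)[0] == 9.0);
+}
+#endif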
diff --git a/src/indirect/vectors-intel.hh b/src/indirect/vectors-intel.hh
new file mode 100644
index 0000000..29da84f
--- /dev/null
+++ b/src/indirect/vectors-intel.hh
@@ -0,0 +1,390 @@
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+
+using namespace std;
+
+
+
+#if defined(__SSE__)            // SSE (Intel)
+
+#include <xmmintrin.h>
+
+template<>
+struct vec_t<float> {
+  typedef float scalar_t;
+  typedef __m128 impl_t;
+  impl_t v;
+
+  static inline size_t size()
+  {
+    return sizeof(impl_t)/sizeof(scalar_t);
+  }
+
+  inline vec_t ()
+  {
+  }
+  inline vec_t (scalar_t const& a)
+    : v(_mm_set1_ps(a))
+  {
+  }
+  inline vec_t (scalar_t const& a0, scalar_t const& a1,
+                scalar_t const& a2, scalar_t const& a3)
+    : v(_mm_set_ps(a3,a2,a1,a0)) // reverse order!
+  {
+  }
+
+  inline vec_t (impl_t const& w)
+    : v(w)
+  {
+  }
+  inline operator impl_t () const
+  {
+    return v;
+  }
+
+private:
+  static inline scalar_t elt0 (impl_t const& v)
+  {
+    return _mm_cvtss_f32(v);    // this is a no-op
+  }
+public:
+  inline scalar_t operator[] (size_t const d) const
+  {
+    switch (d) {
+    case 0: return elt0(v);
+    case 1: return elt0(_mm_shuffle_ps(v,v,_MM_SHUFFLE(1,1,1,1)));
+    case 2: return elt0(_mm_unpackhi_ps(v,v));
+    case 3: return elt0(_mm_shuffle_ps(v,v,_MM_SHUFFLE(3,3,3,3)));
+    }
+    assert(0);
+    return 0.0f;
+  }
+
+  static inline vec_t load (scalar_t const& a)
+  {
+    return _mm_load_ps(&a);
+  }
+  static inline vec_t loadu (scalar_t const& a)
+  {
+    return _mm_loadu_ps(&a);
+  }
+  // Load a vector from memory that may or may not be aligned, as
+  // decided by the offset and the vector size
+  static inline vec_t loadu_maybe (int const off, scalar_t const& p)
+  {
+    if (off % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  static inline vec_t loadu_maybe3 (int const off0, int const off1,
+                                    int const off2,
+                                    scalar_t const& p)
+  {
+    if (off0 % size() == 0 and off1 % size() == 0 and off2 % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  inline void store (scalar_t& p) const
+  {
+    _mm_store_ps(&p,v);
+  }
+  inline void storeu (scalar_t& p) const
+  {
+    _mm_storeu_ps(&p,v);
+  }
+  inline void store_nta (scalar_t& p) const
+  {
+    _mm_stream_ps(&p,v);
+  }
+  inline void store_nta_partial_lo (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 4: store_nta(p); break;
+    case 3: (&p)[2]=(*this)[2]; // fall through
+    case 2: (&p)[1]=(*this)[1]; // fall through
+    case 1: (&p)[0]=(*this)[0];
+    }
+  }
+  inline void store_nta_partial_hi (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 4: store_nta(p); break;
+    case 3: (&p)[1]=(*this)[1]; // fall through
+    case 2: (&p)[2]=(*this)[2]; // fall through
+    case 1: (&p)[3]=(*this)[3];
+    }
+  }
+
+  inline vec_t operator+ () const
+  {
+    return *this;
+  }
+  inline vec_t operator- () const
+  {
+    // flip the sign bit of every element
+    return _mm_xor_ps(_mm_set1_ps(-0.0f), v);
+  }
+  inline vec_t operator+ (vec_t const& x) const
+  {
+    return _mm_add_ps(v,x.v);
+  }
+  inline vec_t operator- (vec_t const& x) const
+  {
+    return _mm_sub_ps(v,x.v);
+  }
+  inline vec_t operator* (vec_t const& x) const
+  {
+    return _mm_mul_ps(v,x.v);
+  }
+  inline vec_t operator/ (vec_t const& x) const
+  {
+    return _mm_div_ps(v,x.v);
+  }
+  inline vec_t& operator+= (vec_t const& x)
+  {
+    return *this=*this+x;
+  }
+  inline vec_t& operator-= (vec_t const& x)
+  {
+    return *this=*this-x;
+  }
+  inline vec_t& operator*= (vec_t const& x)
+  {
+    return *this=*this*x;
+  }
+  inline vec_t& operator/= (vec_t const& x)
+  {
+    return *this=*this/x;
+  }
+};
+
+// SSE has no transcendental instructions; fall back to element-wise
+// evaluation
+inline vec_t<float> exp (vec_t<float> const& x)
+{
+  return vec_t<float>(exp(x[0]), exp(x[1]), exp(x[2]), exp(x[3]));
+}
+inline vec_t<float> fabs (vec_t<float> const& x)
+{
+  // clear the sign bit of every element
+  return _mm_andnot_ps(_mm_set1_ps(-0.0f), x.v);
+}
+inline vec_t<float> fmax (vec_t<float> const& x, vec_t<float> const& y)
+{
+  return _mm_max_ps(x.v, y.v);
+}
+inline vec_t<float> fmin (vec_t<float> const& x, vec_t<float> const& y)
+{
+  return _mm_min_ps(x.v, y.v);
+}
+inline vec_t<float> ifthen (bool const b, vec_t<float> const& x,
+                            vec_t<float> const& y)
+{
+  return b ? x : y;
+}
+inline vec_t<float> log (vec_t<float> const& x)
+{
+  return vec_t<float>(log(x[0]), log(x[1]), log(x[2]), log(x[3]));
+}
+inline vec_t<float> pow (vec_t<float> const& x,
+                         vec_t<float>::scalar_t const& a)
+{
+  return vec_t<float>(pow(x[0],a), pow(x[1],a), pow(x[2],a), pow(x[3],a));
+}
+inline vec_t<float> sqrt (vec_t<float> const& x)
+{
+  return _mm_sqrt_ps(x.v);
+}
+
+#endif
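+
+// Debugging sketch (not compiled in): extracting all lanes with
+// operator[]. Lane extraction is slow and intended only for boundary
+// code and debugging, never for inner loops.
+#if 0
+#include <cstdio>
+inline void vec_print (vec_t<float> const& x)
+{
+  for (size_t d=0; d<vec_t<float>::size(); ++d) {
+    printf("%g ", (double)x[d]);
+  }
+  printf("\n");
+}
+#endif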
+
+
+#if defined(__SSE2__)           // SSE2 (Intel)
+
+#include <emmintrin.h>
+
+template<>
+struct vec_t<double> {
+  typedef double scalar_t;
+  typedef __m128d impl_t;
+  impl_t v;
+
+  static inline size_t size()
+  {
+    return sizeof(impl_t)/sizeof(scalar_t);
+  }
+
+  inline vec_t ()
+  {
+  }
+  inline vec_t (scalar_t const& a)
+    : v(_mm_set1_pd(a))
+  {
+  }
+  inline vec_t (scalar_t const& a0, scalar_t const& a1)
+    : v(_mm_set_pd(a1,a0))      // reverse order!
+  {
+  }
+
+  inline vec_t (impl_t const& w)
+    : v(w)
+  {
+  }
+  inline operator impl_t () const
+  {
+    return v;
+  }
+
+private:
+  static inline scalar_t elt0 (impl_t const& v)
+  {
+    return _mm_cvtsd_f64(v);    // this is a no-op
+  }
+public:
+  inline scalar_t operator[] (size_t const d) const
+  {
+    switch (d) {
+    case 0: return elt0(v);
+    case 1: return elt0(_mm_unpackhi_pd(v,v));
+    }
+    assert(0);
+    return 0.0;
+  }
+
+  static inline vec_t load (scalar_t const& a)
+  {
+    return _mm_load_pd(&a);
+  }
+  static inline vec_t loadu (scalar_t const& a)
+  {
+    return _mm_loadu_pd(&a);
+  }
+  // Load a vector from memory that may or may not be aligned, as
+  // decided by the offset and the vector size
+  static inline vec_t loadu_maybe (int const off, scalar_t const& p)
+  {
+    if (off % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  static inline vec_t loadu_maybe3 (int const off0, int const off1,
+                                    int const off2,
+                                    scalar_t const& p)
+  {
+    if (off0 % size() == 0 and off1 % size() == 0 and off2 % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  inline void store (scalar_t& p) const
+  {
+    _mm_store_pd(&p,v);
+  }
+  inline void storeu (scalar_t& p) const
+  {
+    _mm_storeu_pd(&p,v);
+  }
+  inline void store_nta (scalar_t& p) const
+  {
+    _mm_stream_pd(&p,v);
+  }
+  inline void store_nta_partial_lo (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 2: store_nta(p); break;
+    case 1: (&p)[0]=(*this)[0];
+    }
+  }
+  inline void store_nta_partial_hi (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 2: store_nta(p); break;
+    case 1: (&p)[1]=(*this)[1];
+    }
+  }
+
+  inline vec_t operator+ () const
+  {
+    return *this;
+  }
+  inline vec_t operator- () const
+  {
+    // flip the sign bit of every element
+    return _mm_xor_pd(_mm_set1_pd(-0.0), v);
+  }
+  inline vec_t operator+ (vec_t const& x) const
+  {
+    return _mm_add_pd(v,x.v);
+  }
+  inline vec_t operator- (vec_t const& x) const
+  {
+    return _mm_sub_pd(v,x.v);
+  }
+  inline vec_t operator* (vec_t const& x) const
+  {
+    return _mm_mul_pd(v,x.v);
+  }
+  inline vec_t operator/ (vec_t const& x) const
+  {
+    return _mm_div_pd(v,x.v);
+  }
+  inline vec_t& operator+= (vec_t const& x)
+  {
+    return *this=*this+x;
+  }
+  inline vec_t& operator-= (vec_t const& x)
+  {
+    return *this=*this-x;
+  }
+  inline vec_t& operator*= (vec_t const& x)
+  {
+    return *this=*this*x;
+  }
+  inline vec_t& operator/= (vec_t const& x)
+  {
+    return *this=*this/x;
+  }
+};
+
+// SSE2 has no transcendental instructions; fall back to element-wise
+// evaluation
+inline vec_t<double> exp (vec_t<double> const& x)
+{
+  return vec_t<double>(exp(x[0]), exp(x[1]));
+}
+inline vec_t<double> fabs (vec_t<double> const& x)
+{
+  // clear the sign bit of every element
+  return _mm_andnot_pd(_mm_set1_pd(-0.0), x.v);
+}
+inline vec_t<double> fmax (vec_t<double> const& x, vec_t<double> const& y)
+{
+  return _mm_max_pd(x.v, y.v);
+}
+inline vec_t<double> fmin (vec_t<double> const& x, vec_t<double> const& y)
+{
+  return _mm_min_pd(x.v, y.v);
+}
+inline vec_t<double> ifthen (bool const b, vec_t<double> const& x,
+                             vec_t<double> const& y)
+{
+  return b ? x : y;
+}
+inline vec_t<double> log (vec_t<double> const& x)
+{
+  return vec_t<double>(log(x[0]), log(x[1]));
+}
+inline vec_t<double> pow (vec_t<double> const& x,
+                          vec_t<double>::scalar_t const& a)
+{
+  return vec_t<double>(pow(x[0],a), pow(x[1],a));
+}
+inline vec_t<double> sqrt (vec_t<double> const& x)
+{
+  return _mm_sqrt_pd(x.v);
+}
+
+#endif
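+
+// Usage sketch (not compiled in): clamping a vector into [lo,hi]
+// with fmin/fmax maps directly onto the minpd/maxpd instructions and
+// avoids per-element branches.
+#if 0
+inline vec_t<double> clamp (vec_t<double> const& x,
+                            double const lo, double const hi)
+{
+  return fmax(vec_t<double>(lo), fmin(vec_t<double>(hi), x));
+}
+#endif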
diff --git a/src/indirect/vectors-power.hh b/src/indirect/vectors-power.hh
new file mode 100644
index 0000000..37a8786
--- /dev/null
+++ b/src/indirect/vectors-power.hh
@@ -0,0 +1,360 @@
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+
+using namespace std;
+
+
+
+#if defined(__ALTIVEC__)        // Altivec (Power)
+
+#include <altivec.h>
+
+template<>
+struct vec_t<float> {
+  typedef float scalar_t;
+  typedef vector float impl_t;
+
+  // The payload -- the actual vector content
+  impl_t v;
+
+  static inline size_t size()
+  {
+    return sizeof(impl_t)/sizeof(scalar_t);
+  }
+
+  inline vec_t ()
+  {
+  }
+  inline vec_t (scalar_t const& a)
+    : v(vec_splats(a))
+  {
+  }
+  inline vec_t (scalar_t const& a0, scalar_t const& a1,
+                scalar_t const& a2, scalar_t const& a3)
+  {
+    scalar_t* const p = (scalar_t*)&v;
+    p[0]=a0; p[1]=a1; p[2]=a2; p[3]=a3;
+  }
+
+  inline vec_t (impl_t const& w)
+    : v(w)
+  {
+  }
+  inline operator impl_t () const
+  {
+    return v;
+  }
+
+  inline scalar_t operator[] (size_t const d) const
+  {
+    return ((scalar_t const*)&v)[d];
+  }
+
+  static inline vec_t load (scalar_t const& a)
+  {
+    return *(impl_t const*)&a;
+  }
+  static inline vec_t loadu (scalar_t const& a)
+  {
+    // Altivec has no single unaligned load instruction; fall back to
+    // element-wise loads
+    scalar_t const* const p = &a;
+    return vec_t(p[0], p[1], p[2], p[3]);
+  }
+  // Load a vector from memory that may or may not be aligned, as
+  // decided by the offset and the vector size
+  static inline vec_t loadu_maybe (int const off, scalar_t const& p)
+  {
+    if (off % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  static inline vec_t loadu_maybe3 (int const off0, int const off1,
+                                    int const off2,
+                                    scalar_t const& p)
+  {
+    if (off0 % size() == 0 and off1 % size() == 0 and off2 % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  inline void store (scalar_t& p) const
+  {
+    *(impl_t*)&p = v;
+  }
+  inline void storeu (scalar_t& p) const
+  {
+    store(p);
+  }
+  inline void store_nta (scalar_t& p) const
+  {
+    // TODO: Use the stvxl instruction?
+    store(p);
+  }
+  inline void store_nta_partial_lo (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 4: store_nta(p); break;
+    case 3: (&p)[2]=(*this)[2]; // fall through
+    case 2: (&p)[1]=(*this)[1]; // fall through
+    case 1: (&p)[0]=(*this)[0];
+    }
+  }
+  inline void store_nta_partial_hi (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 4: store_nta(p); break;
+    case 3: (&p)[1]=(*this)[1]; // fall through
+    case 2: (&p)[2]=(*this)[2]; // fall through
+    case 1: (&p)[3]=(*this)[3];
+    }
+  }
+
+  // These operators rely on the compiler's support for arithmetic on
+  // vector types (a GCC extension)
+  inline vec_t operator+ () const
+  {
+    return +v;
+  }
+  inline vec_t operator- () const
+  {
+    return -v;
+  }
+  inline vec_t operator+ (vec_t const& x) const
+  {
+    return v+x.v;
+  }
+  inline vec_t operator- (vec_t const& x) const
+  {
+    return v-x.v;
+  }
+  inline vec_t operator* (vec_t const& x) const
+  {
+    return v*x.v;
+  }
+  inline vec_t operator/ (vec_t const& x) const
+  {
+    return v/x.v;
+  }
+  inline vec_t& operator+= (vec_t const& x)
+  {
+    return *this=*this+x;
+  }
+  inline vec_t& operator-= (vec_t const& x)
+  {
+    return *this=*this-x;
+  }
+  inline vec_t& operator*= (vec_t const& x)
+  {
+    return *this=*this*x;
+  }
+  inline vec_t& operator/= (vec_t const& x)
+  {
+    return *this=*this/x;
+  }
+};
+
+// Altivec has no transcendental instructions; fall back to
+// element-wise evaluation
+inline vec_t<float> exp (vec_t<float> const& x)
+{
+  return vec_t<float>(exp(x[0]), exp(x[1]), exp(x[2]), exp(x[3]));
+}
+inline vec_t<float> fabs (vec_t<float> const& x)
+{
+  return vec_abs(x.v);
+}
+inline vec_t<float> fmax (vec_t<float> const& x, vec_t<float> const& y)
+{
+  return vec_max(x.v, y.v);
+}
+inline vec_t<float> fmin (vec_t<float> const& x, vec_t<float> const& y)
+{
+  return vec_min(x.v, y.v);
+}
+inline vec_t<float> ifthen (bool const b, vec_t<float> const& x,
+                            vec_t<float> const& y)
+{
+  return b ? x : y;
+}
+inline vec_t<float> log (vec_t<float> const& x)
+{
+  return vec_t<float>(log(x[0]), log(x[1]), log(x[2]), log(x[3]));
+}
+inline vec_t<float> pow (vec_t<float> const& x,
+                         vec_t<float>::scalar_t const& a)
+{
+  return vec_t<float>(pow(x[0],a), pow(x[1],a), pow(x[2],a), pow(x[3],a));
+}
+inline vec_t<float> sqrt (vec_t<float> const& x)
+{
+  return vec_t<float>(sqrt(x[0]), sqrt(x[1]), sqrt(x[2]), sqrt(x[3]));
+}
+
+#endif
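+
+// Possible optimisation (sketch, untested): instead of the
+// element-wise loadu fallback above, the classic Altivec sequence
+// for unaligned loads combines two aligned loads with a permute:
+#if 0
+static inline vector float altivec_loadu (float const* p)
+{
+  vector float const lo = vec_ld( 0, p);   // aligned load at or below p
+  vector float const hi = vec_ld(15, p);   // aligned load above p
+  return vec_perm(lo, hi, vec_lvsl(0, p)); // shift into place
+}
+#endif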
+
+
+#if defined(__ALTIVEC__) && defined(_ARCH_PWR7) // Altivec VSX (Power)
+
+#include <altivec.h>
+
+template<>
+struct vec_t<double> {
+  typedef double scalar_t;
+  typedef vector double impl_t;
+
+  // The payload -- the actual vector content
+  impl_t v;
+
+  static inline size_t size()
+  {
+    return sizeof(impl_t)/sizeof(scalar_t);
+  }
+
+  inline vec_t ()
+  {
+  }
+  inline vec_t (scalar_t const& a)
+    : v(vec_splats(a))
+  {
+  }
+  inline vec_t (scalar_t const& a0, scalar_t const& a1)
+  {
+    scalar_t* const p = (scalar_t*)&v;
+    p[0]=a0; p[1]=a1;
+  }
+
+  inline vec_t (impl_t const& w)
+    : v(w)
+  {
+  }
+  inline operator impl_t () const
+  {
+    return v;
+  }
+
+  inline scalar_t operator[] (size_t const d) const
+  {
+    return ((scalar_t const*)&v)[d];
+  }
+
+  static inline vec_t load (scalar_t const& a)
+  {
+    return *(impl_t const*)&a;
+  }
+  static inline vec_t loadu (scalar_t const& a)
+  {
+    // fall back to element-wise loads
+    scalar_t const* const p = &a;
+    return vec_t(p[0], p[1]);
+  }
+  // Load a vector from memory that may or may not be aligned, as
+  // decided by the offset and the vector size
+  static inline vec_t loadu_maybe (int const off, scalar_t const& p)
+  {
+    if (off % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  static inline vec_t loadu_maybe3 (int const off0, int const off1,
+                                    int const off2,
+                                    scalar_t const& p)
+  {
+    if (off0 % size() == 0 and off1 % size() == 0 and off2 % size() == 0) {
+      return load(p);
+    } else {
+      return loadu(p);
+    }
+  }
+  inline void store (scalar_t& p) const
+  {
+    *(impl_t*)&p = v;
+  }
+  inline void storeu (scalar_t& p) const
+  {
+    store(p);
+  }
+  inline void store_nta (scalar_t& p) const
+  {
+    // TODO: Use the stvxl instruction?
+    store(p);
+  }
+  inline void store_nta_partial_lo (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 2: store_nta(p); break;
+    case 1: (&p)[0]=(*this)[0];
+    }
+  }
+  inline void store_nta_partial_hi (scalar_t& p, size_t const cnt) const
+  {
+    switch (cnt) {
+    case 2: store_nta(p); break;
+    case 1: (&p)[1]=(*this)[1];
+    }
+  }
+
+  // These operators rely on the compiler's support for arithmetic on
+  // vector types (a GCC extension)
+  inline vec_t operator+ () const
+  {
+    return +v;
+  }
+  inline vec_t operator- () const
+  {
+    return -v;
+  }
+  inline vec_t operator+ (vec_t const& x) const
+  {
+    return v+x.v;
+  }
+  inline vec_t operator- (vec_t const& x) const
+  {
+    return v-x.v;
+  }
+  inline vec_t operator* (vec_t const& x) const
+  {
+    return v*x.v;
+  }
+  inline vec_t operator/ (vec_t const& x) const
+  {
+    return v/x.v;
+  }
+  inline vec_t& operator+= (vec_t const& x)
+  {
+    return *this=*this+x;
+  }
+  inline vec_t& operator-= (vec_t const& x)
+  {
+    return *this=*this-x;
+  }
+  inline vec_t& operator*= (vec_t const& x)
+  {
+    return *this=*this*x;
+  }
+  inline vec_t& operator/= (vec_t const& x)
+  {
+    return *this=*this/x;
+  }
+};
+
+// VSX has no transcendental instructions; fall back to element-wise
+// evaluation
+inline vec_t<double> exp (vec_t<double> const& x)
+{
+  return vec_t<double>(exp(x[0]), exp(x[1]));
+}
+inline vec_t<double> fabs (vec_t<double> const& x)
+{
+  return vec_abs(x.v);
+}
+inline vec_t<double> fmax (vec_t<double> const& x, vec_t<double> const& y)
+{
+  return vec_max(x.v, y.v);
+}
+inline vec_t<double> fmin (vec_t<double> const& x, vec_t<double> const& y)
+{
+  return vec_min(x.v, y.v);
+}
+inline vec_t<double> ifthen (bool const b, vec_t<double> const& x,
+                             vec_t<double> const& y)
+{
+  return b ? x : y;
+}
+inline vec_t<double> log (vec_t<double> const& x)
+{
+  return vec_t<double>(log(x[0]), log(x[1]));
+}
+inline vec_t<double> pow (vec_t<double> const& x,
+                          vec_t<double>::scalar_t const& a)
+{
+  return vec_t<double>(pow(x[0],a), pow(x[1],a));
+}
+inline vec_t<double> sqrt (vec_t<double> const& x)
+{
+  return vec_t<double>(sqrt(x[0]), sqrt(x[1]));
+}
+
+#endif
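+
+// Possible optimisation (sketch, untested): newer compilers expose a
+// VSX unaligned load intrinsic (vec_xl) that could replace the
+// element-wise loadu fallback above:
+#if 0
+static inline vector double vsx_loadu (double const* p)
+{
+  return vec_xl(0, p);
+}
+#endif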
diff --git a/src/indirect/vectors-pseudo.hh b/src/indirect/vectors-pseudo.hh
new file mode 100644
index 0000000..a5cca83
--- /dev/null
+++ b/src/indirect/vectors-pseudo.hh
@@ -0,0 +1,183 @@
+#include <cmath>
+#include <cstddef>
+
+using namespace std;
+
+
+
+// A pseudo-vector implementation: an array of D scalars, with all
+// operations applied element-wise. This does not vectorise, but it
+// exercises the same code paths as a true vector implementation and
+// is therefore useful for testing.
+template<typename T>
+struct vec_t {
+  static size_t const D = 2;
+  typedef T scalar_t;
+  typedef T impl_t;
+  impl_t v[D];
+
+  static inline size_t size()
+  {
+    return D;
+  }
+
+  inline vec_t ()
+  {
+  }
+  inline vec_t (scalar_t const& a)
+  {
+    for (size_t d=0; d<D; ++d) v[d]=a;
+  }
+  inline vec_t (scalar_t const& a0, scalar_t const& a1)
+  {
+    v[0]=a0; v[1]=a1;
+  }
+
+  inline scalar_t operator[] (size_t const d) const
+  {
+    return v[d];
+  }
+
+  static inline vec_t load (scalar_t const& p)
+  {
+    vec_t r;
+    for (size_t d=0; d<D; ++d) r.v[d]=(&p)[d];
+    return r;
+  }
+  static inline vec_t loadu (scalar_t const& p)
+  {
+    return load(p);
+  }
+  static inline vec_t loadu_maybe (int const off, scalar_t const& p)
+  {
+    return load(p);
+  }
+  static inline vec_t loadu_maybe3 (int const off0, int const off1,
+                                    int const off2,
+                                    scalar_t const& p)
+  {
+    return load(p);
+  }
+  inline void store (scalar_t& p) const
+  {
+    for (size_t d=0; d<D; ++d) (&p)[d]=v[d];
+  }
+  inline void storeu (scalar_t& p) const
+  {
+    store(p);
+  }
+  inline void store_nta (scalar_t& p) const
+  {
+    store(p);
+  }
+  inline void store_nta_partial_lo (scalar_t& p, size_t const cnt) const
+  {
+    for (size_t d=0; d<cnt; ++d) (&p)[d]=v[d];
+  }
+  inline void store_nta_partial_hi (scalar_t& p, size_t const cnt) const
+  {
+    for (size_t d=D-cnt; d<D; ++d) (&p)[d]=v[d];
+  }
+
+  inline vec_t operator+ () const
+  {
+    return *this;
+  }
+  inline vec_t operator- () const
+  {
+    vec_t r;
+    for (size_t d=0; d<D; ++d) r.v[d]=-v[d];
+    return r;
+  }
+  inline vec_t operator+ (vec_t const& x) const
+  {
+    vec_t r;
+    for (size_t d=0; d<D; ++d) r.v[d]=v[d]+x.v[d];
+    return r;
+  }
+  inline vec_t operator- (vec_t const& x) const
+  {
+    vec_t r;
+    for (size_t d=0; d<D; ++d) r.v[d]=v[d]-x.v[d];
+    return r;
+  }
+  inline vec_t operator* (vec_t const& x) const
+  {
+    vec_t r;
+    for (size_t d=0; d<D; ++d) r.v[d]=v[d]*x.v[d];
+    return r;
+  }
+  inline vec_t operator/ (vec_t const& x) const
+  {
+    vec_t r;
+    for (size_t d=0; d<D; ++d) r.v[d]=v[d]/x.v[d];
+    return r;
+  }
+  inline vec_t& operator+= (vec_t const& x)
+  {
+    return *this=*this+x;
+  }
+  inline vec_t& operator-= (vec_t const& x)
+  {
+    return *this=*this-x;
+  }
+  inline vec_t& operator*= (vec_t const& x)
+  {
+    return *this=*this*x;
+  }
+  inline vec_t& operator/= (vec_t const& x)
+  {
+    return *this=*this/x;
+  }
+};
+
+template<typename T>
+vec_t<T> exp (vec_t<T> const& x)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=exp(x.v[d]);
+  return r;
+}
+template<typename T>
+vec_t<T> fabs (vec_t<T> const& x)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=fabs(x.v[d]);
+  return r;
+}
+template<typename T>
+vec_t<T> fmax (vec_t<T> const& x, vec_t<T> const& y)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=fmax(x.v[d], y.v[d]);
+  return r;
+}
+template<typename T>
+vec_t<T> fmin (vec_t<T> const& x, vec_t<T> const& y)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=fmin(x.v[d], y.v[d]);
+  return r;
+}
+template<typename T>
+vec_t<T> ifthen (bool const b, vec_t<T> const& x, vec_t<T> const& y)
+{
+  return b ? x : y;
+}
+template<typename T>
+vec_t<T> log (vec_t<T> const& x)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=log(x.v[d]);
+  return r;
+}
+template<typename T>
+vec_t<T> pow (vec_t<T> const& x, typename vec_t<T>::scalar_t const& a)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=pow(x.v[d], a);
+  return r;
+}
+template<typename T>
+vec_t<T> sqrt (vec_t<T> const& x)
+{
+  vec_t<T> r;
+  for (size_t d=0; d<vec_t<T>::size(); ++d) r.v[d]=sqrt(x.v[d]);
+  return r;
+}
diff --git a/src/indirect/vectors.hh b/src/indirect/vectors.hh
new file mode 100644
--- /dev/null
+++ b/src/indirect/vectors.hh
@@ -0,0 +1,19 @@
+#ifndef VECTORS_HH
+#define VECTORS_HH
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+
+  // Default vector implementation, does not vectorise
+#include "vectors-default.hh"
+
+#if 0
+  // Intel SSE vector instructions
+#include "vectors-intel.hh"
+
+  // Power (Altivec) vector instructions
+#include "vectors-power.hh"
+#endif
+
+#endif // #ifndef VECTORS_HH
--
cgit v1.2.3