aboutsummaryrefslogtreecommitdiff
path: root/Carpet/CarpetLib/src/prolongate_3d_rf2.cc
diff options
context:
space:
mode:
authorErik Schnetter <schnetter@cct.lsu.edu>2011-11-14 15:18:16 -0500
committerBarry Wardell <barry.wardell@gmail.com>2011-12-14 19:54:56 +0000
commit26abf757839e38f24ad5c4a4bf8975b00387fca1 (patch)
tree26ec4d001d386628ff11cc6b7e239b16e4398ddc /Carpet/CarpetLib/src/prolongate_3d_rf2.cc
parent425375ffab5abc80f904871992421749a3a187b4 (diff)
CarpetLib: Rewrite vectorisation of prolongate_3d_rf2
Diffstat (limited to 'Carpet/CarpetLib/src/prolongate_3d_rf2.cc')
-rw-r--r--Carpet/CarpetLib/src/prolongate_3d_rf2.cc161
1 files changed, 86 insertions, 75 deletions
diff --git a/Carpet/CarpetLib/src/prolongate_3d_rf2.cc b/Carpet/CarpetLib/src/prolongate_3d_rf2.cc
index 3f92b46e7..f36e79ef8 100644
--- a/Carpet/CarpetLib/src/prolongate_3d_rf2.cc
+++ b/Carpet/CarpetLib/src/prolongate_3d_rf2.cc
@@ -232,83 +232,92 @@ namespace CarpetLib {
#endif
typedef vecprops<T> VP;
typedef typename VP::vector_t VT;
- assert (coeffs::ncoeffs % VP::size() == 0);
ptrdiff_t i = coeffs::imin;
- VT vres =
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i)));
-#if defined(__INTEL_COMPILER)
- // Unroll the loop manually to help the Intel compiler
- // (This manual unrolling hurts with other compilers, e.g. PGI)
- assert (coeffs::ncoeffs / VP::size() <= 12);
- switch (coeffs::ncoeffs / VP::size()) {
- // Note that all case statements fall through
- case 12:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 11:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 10:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 9:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 8:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 7:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 6:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 5:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 4:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 3:
- i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- case 2:
+ T res = typ::fromreal (0);
+ if (coeffs::ncoeffs >= VP::size()) {
+ VT vres =
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i)));
i += VP::size();
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- }
+#if defined(__INTEL_COMPILER)
+ // Unroll the loop manually to help the Intel compiler
+ // (This manual unrolling hurts with other compilers, e.g. PGI)
+ assert (coeffs::ncoeffs / VP::size() <= 12);
+ switch (coeffs::ncoeffs / VP::size()) {
+ // Note that all case statements fall through
+ case 12:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 11:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 10:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 9:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 8:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 7:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 6:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 5:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 4:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 3:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ case 2:
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ i += VP::size();
+ }
#else
- for (i += VP::size(); i < coeffs::imax; i += VP::size()) {
- vres = VP::add(vres,
- VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
- VP::loadu(interp0<T,ORDER> (p + i))));
- }
+ for (; i + VP::size() <= coeffs::imax; i += VP::size()) {
+ vres = VP::add(vres,
+ VP::mul(VP::load(typ::fromreal(coeffs::get(i))),
+ VP::loadu(interp0<T,ORDER> (p + i))));
+ }
#endif
- T res = typ::fromreal (0);
- for (int d=0; d<VP::size(); ++d) {
- res += VP::elt(vres,d);
+ for (int d=0; d<VP::size(); ++d) {
+ res += VP::elt(vres,d);
+ }
+ }
+ assert (i == coeffs::imax - coeffs::ncoeffs % VP::size());
+ for (i = coeffs::imax - coeffs::ncoeffs % VP::size();
+ i < coeffs::imax;
+ ++ i)
+ {
+ res += coeffs::get(i) * interp0<T,ORDER> (p + i*d1);
}
return res;
} else {
@@ -330,7 +339,8 @@ namespace CarpetLib {
size_t const d1,
size_t const d2)
{
- typedef typename typeprops<T>::real RT;
+ typedef typeprops<T> typ;
+ typedef typename typ::real RT;
typedef coeffs1d<RT,ORDER> coeffs;
if (dj == 0) {
return interp1<T,ORDER,di> (p, d1);
@@ -352,7 +362,8 @@ namespace CarpetLib {
size_t const d2,
size_t const d3)
{
- typedef typename typeprops<T>::real RT;
+ typedef typeprops<T> typ;
+ typedef typename typ::real RT;
typedef coeffs1d<RT,ORDER> coeffs;
if (dk == 0) {
return interp2<T,ORDER,di,dj> (p, d1, d2);