From 26abf757839e38f24ad5c4a4bf8975b00387fca1 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Mon, 14 Nov 2011 15:18:16 -0500 Subject: CarpetLib: Rewrite vectorisation of prolongate_3d_rf2 --- Carpet/CarpetLib/src/prolongate_3d_rf2.cc | 161 ++++++++++++++++-------------- 1 file changed, 86 insertions(+), 75 deletions(-) (limited to 'Carpet/CarpetLib/src/prolongate_3d_rf2.cc') diff --git a/Carpet/CarpetLib/src/prolongate_3d_rf2.cc b/Carpet/CarpetLib/src/prolongate_3d_rf2.cc index 3f92b46e7..f36e79ef8 100644 --- a/Carpet/CarpetLib/src/prolongate_3d_rf2.cc +++ b/Carpet/CarpetLib/src/prolongate_3d_rf2.cc @@ -232,83 +232,92 @@ namespace CarpetLib { #endif typedef vecprops VP; typedef typename VP::vector_t VT; - assert (coeffs::ncoeffs % VP::size() == 0); ptrdiff_t i = coeffs::imin; - VT vres = - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i))); -#if defined(__INTEL_COMPILER) - // Unroll the loop manually to help the Intel compiler - // (This manual unrolling hurts with other compilers, e.g. PGI) - assert (coeffs::ncoeffs / VP::size() <= 12); - switch (coeffs::ncoeffs / VP::size()) { - // Note that all case statements fall through - case 12: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 11: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 10: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 9: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 8: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 7: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 6: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 5: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 4: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 3: - i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - case 2: + T res = typ::fromreal (0); + if (coeffs::ncoeffs >= VP::size()) { + VT vres = + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i))); i += VP::size(); - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - } +#if defined(__INTEL_COMPILER) + // Unroll the loop manually to help the Intel compiler + // (This manual unrolling hurts with other compilers, e.g. PGI) + assert (coeffs::ncoeffs / VP::size() <= 12); + switch (coeffs::ncoeffs / VP::size()) { + // Note that all case statements fall through + case 12: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 11: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 10: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 9: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 8: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 7: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 6: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 5: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 4: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 3: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + case 2: + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + i += VP::size(); + } #else - for (i += VP::size(); i < coeffs::imax; i += VP::size()) { - vres = VP::add(vres, - VP::mul(VP::load(typ::fromreal(coeffs::get(i))), - VP::loadu(interp0 (p + i)))); - } + for (; i + VP::size() <= coeffs::imax; i += VP::size()) { + vres = VP::add(vres, + VP::mul(VP::load(typ::fromreal(coeffs::get(i))), + VP::loadu(interp0 (p + i)))); + } #endif - T res = typ::fromreal (0); - for (int d=0; d (p + i*d1); } return res; } else { @@ -330,7 +339,8 @@ namespace CarpetLib { size_t const d1, size_t const d2) { - typedef typename typeprops::real RT; + typedef typeprops typ; + typedef typename typ::real RT; typedef coeffs1d coeffs; if (dj == 0) { return interp1 (p, d1); @@ -352,7 +362,8 @@ namespace CarpetLib { size_t const d2, size_t const d3) { - typedef typename typeprops::real RT; + typedef typeprops typ; + typedef typename typ::real RT; typedef coeffs1d coeffs; if (dk == 0) { return interp2 (p, d1, d2); -- cgit v1.2.3