diff options
author | rhaas <rhaas@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8> | 2009-11-18 01:56:00 +0000 |
---|---|---|
committer | rhaas <rhaas@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8> | 2009-11-18 01:56:00 +0000 |
commit | 2f1d026b9438fc40217e596934dbf74c041ce4df (patch) | |
tree | a983edcd35af235178e7bf47580cfa378111c1bf | |
parent | 0fac8b2d14bde5de51f534ce57d03a53416cf6a4 (diff) |
add optimized support for CCTK_INT, introduce templated functions
git-svn-id: http://svn.cactuscode.org/arrangements/CactusNumerical/Slab/trunk@65 2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8
-rw-r--r-- | src/slab.cc | 589 |
1 files changed, 241 insertions, 348 deletions
diff --git a/src/slab.cc b/src/slab.cc index daa8908..a60eebc 100644 --- a/src/slab.cc +++ b/src/slab.cc @@ -74,6 +74,7 @@ using namespace std; #endif +namespace Slab { static int timer_init = -1; static int timer_copy_in = -1; @@ -696,7 +697,149 @@ print_xferinfo (FILE * const out, fprintf (out, " flip: %d\n", xferinfo->flip); } +// workhorse routine responsible for the actual copying/transposing of data +template<typename T> inline void +copy_data (const vector<xfer> &info, + const vector<bbox> &srcdetail, + const vector<int> &srcoffset, + const vector<int> &srcelems, + const vector<char> &srcdata, + void const * restrict const * restrict const srcptrs, + const int n, + const vector<int> &varis, + const int nvaris, + const int xpose_x=0, + const int xpose_y=1, + const int xpose_z=2) +{ + assert (srcptrs); + + int const srcoffi = info[0].src.local.off; + int const srcoffj = info[1].src.local.off; + int const srcoffk = info[2].src.local.off; + + int const srcleni = info[0].src.local.len; + int const srclenj = info[1].src.local.len; + int const srclenk = info[2].src.local.len; + + int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off; + int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off; + int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off; + + int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len; + int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len; + int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len; + + int const dstdetailleni = srcdetail[n*SLAB_MAXDIM+xpose_x].len; + int const dstdetaillenj = srcdetail[n*SLAB_MAXDIM+xpose_y].len; + //int const dstdetaillenk = srcdetail[n*SLAB_MAXDIM+xpose_z].len; unused + + if (n==0) assert (srcoffset[n]==0); + // TODO: This does not take nvaris into account + // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk); + + ifcheck { + const int xpose[SLAB_MAXDIM] = {xpose_x, xpose_y, xpose_z}; + for (int i = 0; i < SLAB_MAXDIM; ++i) { + for (int j = i+1; j < SLAB_MAXDIM; ++j) { + assert(xpose[i] != xpose[j]); + } + } + } + + for (int vari=0; vari<nvaris; ++vari) { + T * restrict const srcdataptr = + (T *)&srcdata.front() + srcoffset[n] + vari * srcelems[n]; + T const * restrict const srcptr = + (T const *)srcptrs[varis[vari]]; + assert(srcptr); + +# pragma omp parallel for + { + int ipos[SLAB_MAXDIM]; + for (ipos[2] = 0; ipos[2] < srcdetaillenk; ++ipos[2]) { + for (ipos[1] = 0; ipos[1] < srcdetaillenj; ++ipos[1]) { + for (ipos[0] = 0; ipos[0] < srcdetailleni; ++ipos[0]) { + int const srcindi = srcdetailoffi + ipos[0] - srcoffi; + int const srcindj = srcdetailoffj + ipos[1] - srcoffj; + int const srcindk = srcdetailoffk + ipos[2] - srcoffk; + ifcheck assert (srcindi>=0 and srcindi<srcleni); + ifcheck assert (srcindj>=0 and srcindj<srclenj); + ifcheck assert (srcindk>=0 and srcindk<srclenk); + size_t const srcind = + srcindi + srcleni * (srcindj + srclenj * srcindk); + size_t const bufind = + ipos[xpose_x] + dstdetailleni * (ipos[xpose_y] + dstdetaillenj * ipos[xpose_z]); + srcdataptr[bufind] = srcptr[srcind]; + } + } + } + } + + } // for vari +} +// workhorse routine responsible for the actual copying/flipping of data +template<typename T> inline void +copy_data_back (const vector<xfer> &info, + const vector<bbox> &dstdetail, + const vector<int> &dstoffset, + const vector<int> &dstelems, + const vector<char> &dstdata, + void const * restrict const * restrict const dstptrs, + const int n, + const vector<int> &varis, + const int nvaris, + const bool flip_x=false, + const bool flip_y=false, + const bool flip_z=false) +{ + assert (dstptrs); + + int const dstoffi = info[0].dst.local.off; + int const dstoffj = info[1].dst.local.off; + int const dstoffk = info[2].dst.local.off; + + int const dstleni = info[0].dst.local.len; + int const dstlenj = info[1].dst.local.len; + int const dstlenk = info[2].dst.local.len; + + int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; + int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; + int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + + int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; + int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; + int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + + for (int vari=0; vari<nvaris; ++vari) { + T * restrict const dstptr = (T *)dstptrs[varis[vari]]; + assert (dstptr); + T const * restrict const dstdataptr = + (T const *)&dstdata.front() + + dstoffset[n] + vari * dstelems[n]; + +# pragma omp parallel for + for (int k = 0; k < dstdetaillenk; ++k) { + for (int j = 0; j < dstdetaillenj; ++j) { + for (int i = 0; i < dstdetailleni; ++i) { + int const dstindi = dstdetailoffi + (flip_x ? dstdetailleni - 1 - i : i) - dstoffi; + int const dstindj = dstdetailoffj + (flip_y ? dstdetaillenj - 1 - j : j) - dstoffj; + int const dstindk = dstdetailoffk + (flip_z ? dstdetaillenk - 1 - k : k) - dstoffk; + ifcheck assert (dstindi>=0 and dstindi<dstleni); + ifcheck assert (dstindj>=0 and dstindj<dstlenj); + ifcheck assert (dstindk>=0 and dstindk<dstlenk); + size_t const dstind = + dstindi + dstleni * (dstindj + dstlenj * dstindk); + size_t const bufind = + i + dstdetailleni * (j + dstdetaillenj * k); + dstptr[dstind] = dstdataptr[bufind]; + } + } + } + + } // for vari +} extern "C" int @@ -1182,52 +1325,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: no transposing - int const srcoffi = info[0].src.local.off; - int const srcoffj = info[1].src.local.off; - int const srcoffk = info[2].src.local.off; - - int const srcleni = info[0].src.local.len; - int const srclenj = info[1].src.local.len; - int const srclenk = info[2].src.local.len; - - int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off; - int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off; - int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off; - - int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len; - int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len; - int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len; - - if (n==0) assert (srcoffset[n]==0); - // TODO: This does not take nvaris into account - // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk); - - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const srcdataptr = - (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n]; - CCTK_REAL const * restrict const srcptr = - (CCTK_REAL const *)srcptrs[varis[vari]]; - -# pragma omp parallel for - for (int k = 0; k < srcdetaillenk; ++k) { - for (int j = 0; j < srcdetaillenj; ++j) { - for (int i = 0; i < srcdetailleni; ++i) { - int const srcindi = srcdetailoffi + i - srcoffi; - int const srcindj = srcdetailoffj + j - srcoffj; - int const srcindk = srcdetailoffk + k - srcoffk; - ifcheck assert (srcindi>=0 and srcindi<srcleni); - ifcheck assert (srcindj>=0 and srcindj<srclenj); - ifcheck assert (srcindk>=0 and srcindk<srclenk); - size_t const srcind = - srcindi + srcleni * (srcindj + srclenj * srcindk); - size_t const bufind = - i + srcdetailleni * (j + srcdetaillenj * k); - srcdataptr[bufind] = srcptr[srcind]; - } - } - } - - } // for vari + copy_data<CCTK_REAL> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris); } else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and srcdetail[n*SLAB_MAXDIM ].str==1 and @@ -1237,52 +1336,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: transpose x and y - int const srcoffi = info[0].src.local.off; - int const srcoffj = info[1].src.local.off; - int const srcoffk = info[2].src.local.off; - - int const srcleni = info[0].src.local.len; - int const srclenj = info[1].src.local.len; - int const srclenk = info[2].src.local.len; - - int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off; - int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off; - int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off; - - int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len; - int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len; - int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len; - - if (n==0) assert (srcoffset[n]==0); - // TODO: This does not take nvaris into account - // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk); - - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const srcdataptr = - (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n]; - CCTK_REAL const * restrict const srcptr = - (CCTK_REAL const *)srcptrs[varis[vari]]; - -# pragma omp parallel for - for (int k = 0; k < srcdetaillenk; ++k) { - for (int j = 0; j < srcdetaillenj; ++j) { - for (int i = 0; i < srcdetailleni; ++i) { - int const srcindi = srcdetailoffi + i - srcoffi; - int const srcindj = srcdetailoffj + j - srcoffj; - int const srcindk = srcdetailoffk + k - srcoffk; - ifcheck assert (srcindi>=0 and srcindi<srcleni); - ifcheck assert (srcindj>=0 and srcindj<srclenj); - ifcheck assert (srcindk>=0 and srcindk<srclenk); - size_t const srcind = - srcindi + srcleni * (srcindj + srclenj * srcindk); - size_t const bufind = - j + srcdetaillenj * (i + srcdetailleni * k); - srcdataptr[bufind] = srcptr[srcind]; - } - } - } - - } // for vari + copy_data<CCTK_REAL> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, 1, 0, 2); } else if (srcdetail[n*SLAB_MAXDIM ].str==1 and srcdetail[n*SLAB_MAXDIM+1].str==1 and @@ -1291,53 +1346,40 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for CCTK_REAL and stride 1 - int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+info[0].xpose].len; - int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+info[1].xpose].len; - int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+info[2].xpose].len; + copy_data<CCTK_REAL> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, info[0].xpose, info[1].xpose, info[2].xpose); - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const srcdataptr = - (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n]; - CCTK_REAL const * restrict const srcptr = - (CCTK_REAL const *)srcptrs[varis[vari]]; - -# pragma omp parallel for - for (int k = 0; k < srcdetaillenk; ++k) { - for (int j = 0; j < srcdetaillenj; ++j) { - for (int i = 0; i < srcdetailleni; ++i) { - int ipos[SLAB_MAXDIM]; - ipos[0] = i; - ipos[1] = j; - ipos[2] = k; - int srcipos[SLAB_MAXDIM]; - int bufipos[SLAB_MAXDIM]; - for (int d=0; d<SLAB_MAXDIM; ++d) { - int const c = info[d].xpose; - srcipos[c] = srcdetail[n*SLAB_MAXDIM+c].off + ipos[d]; - assert (srcipos[c] >= info[c].src.local.off and - srcipos[c] < info[c].src.local.off + info[c].src.local.len); - assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off and - srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1)); - bufipos[d] = ipos[d]; - assert (bufipos[d] >= 0 and - bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len); - } - size_t srcind = 0; - size_t bufind = 0; - for (int d=SLAB_MAXDIM-1; d>=0; --d) { - int const c = info[d].xpose; - srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off; - bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d]; - } - assert (srcind < srclentot); - assert (bufind < (size_t)srccount[n]); - srcdataptr[bufind] = srcptr[srcind]; - } - } - } - - } // for vari + } else if (info[0].xpose==0 and info[1].xpose==1 and info[2].xpose==2 and + srcdetail[n*SLAB_MAXDIM ].str==1 and + srcdetail[n*SLAB_MAXDIM+1].str==1 and + srcdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: no transposing + copy_data<CCTK_INT> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris); + + } else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and + srcdetail[n*SLAB_MAXDIM ].str==1 and + srcdetail[n*SLAB_MAXDIM+1].str==1 and + srcdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: transpose x and y + + copy_data<CCTK_INT> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, 1, 0, 2); + + } else if (srcdetail[n*SLAB_MAXDIM ].str==1 and + srcdetail[n*SLAB_MAXDIM+1].str==1 and + srcdetail[n*SLAB_MAXDIM+2].str==1) + { + // Optimised version for CCTK_INT and stride 1 + + copy_data<CCTK_INT> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, info[0].xpose, info[1].xpose, info[2].xpose); + } else { // Generic, unoptimised version @@ -1478,48 +1520,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: no flipping - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; - - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; - - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; - - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; - - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]]; - CCTK_REAL const * restrict const dstdataptr = - (CCTK_REAL const *)&dstdata.front() + - dstoffset[n] + vari * dstelems[n]; - -# pragma omp parallel for - for (int k = 0; k < dstdetaillenk; ++k) { - for (int j = 0; j < dstdetaillenj; ++j) { - for (int i = 0; i < dstdetailleni; ++i) { - int const dstindi = dstdetailoffi + i - dstoffi; - int const dstindj = dstdetailoffj + j - dstoffj; - int const dstindk = dstdetailoffk + k - dstoffk; - ifcheck assert (dstindi>=0 and dstindi<dstleni); - ifcheck assert (dstindj>=0 and dstindj<dstlenj); - ifcheck assert (dstindk>=0 and dstindk<dstlenk); - size_t const dstind = - dstindi + dstleni * (dstindj + dstlenj * dstindk); - size_t const bufind = - i + dstdetailleni * (j + dstdetaillenj * k); - dstptr[dstind] = dstdataptr[bufind]; - } - } - } - - } // for vari + copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris); } else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and dstdetail[n*SLAB_MAXDIM ].str==1 and @@ -1529,48 +1531,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: flip in x direction - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; - - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; - - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; - - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; - - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]]; - CCTK_REAL const * restrict const dstdataptr = - (CCTK_REAL const *)&dstdata.front() + - dstoffset[n] + vari * dstelems[n]; - -# pragma omp parallel for - for (int k = 0; k < dstdetaillenk; ++k) { - for (int j = 0; j < dstdetaillenj; ++j) { - for (int i = 0; i < dstdetailleni; ++i) { - int const dstindi = dstdetailoffi + i - dstoffi; - int const dstindj = dstdetailoffj + j - dstoffj; - int const dstindk = dstdetailoffk + k - dstoffk; - ifcheck assert (dstindi>=0 and dstindi<dstleni); - ifcheck assert (dstindj>=0 and dstindj<dstlenj); - ifcheck assert (dstindk>=0 and dstindk<dstlenk); - size_t const dstind = - dstindi + dstleni * (dstindj + dstlenj * dstindk); - size_t const bufind = - (dstdetailleni - 1 - i) + dstdetailleni * (j + dstdetaillenj * k); - dstptr[dstind] = dstdataptr[bufind]; - } - } - } - - } // for vari + copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true); } else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and dstdetail[n*SLAB_MAXDIM ].str==1 and @@ -1580,154 +1542,83 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: flip in y direction - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; + copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, false, true); - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; + } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_REAL) + { + // Optimised version for a special case: flip in x and y direction - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true, true); - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + } else if (dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_REAL) + { + // Optimised version for CCTK_REAL and stride 1 - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]]; - CCTK_REAL const * restrict const dstdataptr = - (CCTK_REAL const *)&dstdata.front() + - dstoffset[n] + vari * dstelems[n]; + copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, info[0].flip==1, info[1].flip==1, info[2].flip==1); -# pragma omp parallel for - for (int k = 0; k < dstdetaillenk; ++k) { - for (int j = 0; j < dstdetaillenj; ++j) { - for (int i = 0; i < dstdetailleni; ++i) { - int const dstindi = dstdetailoffi + i - dstoffi; - int const dstindj = dstdetailoffj + j - dstoffj; - int const dstindk = dstdetailoffk + k - dstoffk; - ifcheck assert (dstindi>=0 and dstindi<dstleni); - ifcheck assert (dstindj>=0 and dstindj<dstlenj); - ifcheck assert (dstindk>=0 and dstindk<dstlenk); - size_t const dstind = - dstindi + dstleni * (dstindj + dstlenj * dstindk); - size_t const bufind = - i + dstdetailleni * ((dstdetaillenj - 1 - j) + dstdetaillenj * k); - dstptr[dstind] = dstdataptr[bufind]; - } - } - } - - } // for vari + } else if (info[0].flip==0 and info[1].flip==0 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: no flipping - } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and + copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris); + + } else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and dstdetail[n*SLAB_MAXDIM ].str==1 and dstdetail[n*SLAB_MAXDIM+1].str==1 and dstdetail[n*SLAB_MAXDIM+2].str==1 and - vartype == CCTK_VARIABLE_REAL) + vartype == CCTK_VARIABLE_INT) { - // Optimised version for a special case: flip in x and y direction + // Optimised version for a special case: flip in x direction - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; + copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true); - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; + } else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: flip in y direction - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, false, true); - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: flip in x and y direction - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]]; - CCTK_REAL const * restrict const dstdataptr = - (CCTK_REAL const *)&dstdata.front() + - dstoffset[n] + vari * dstelems[n]; - -# pragma omp parallel for - for (int k = 0; k < dstdetaillenk; ++k) { - for (int j = 0; j < dstdetaillenj; ++j) { - for (int i = 0; i < dstdetailleni; ++i) { - int const dstindi = dstdetailoffi + i - dstoffi; - int const dstindj = dstdetailoffj + j - dstoffj; - int const dstindk = dstdetailoffk + k - dstoffk; - ifcheck assert (dstindi>=0 and dstindi<dstleni); - ifcheck assert (dstindj>=0 and dstindj<dstlenj); - ifcheck assert (dstindk>=0 and dstindk<dstlenk); - size_t const dstind = - dstindi + dstleni * (dstindj + dstlenj * dstindk); - size_t const bufind = - (dstdetailleni - 1 - i) + dstdetailleni * ((dstdetaillenj - 1 - j) + dstdetaillenj * k); - dstptr[dstind] = dstdataptr[bufind]; - } - } - } - - } // for vari + copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true, true); } else if (dstdetail[n*SLAB_MAXDIM ].str==1 and dstdetail[n*SLAB_MAXDIM+1].str==1 and dstdetail[n*SLAB_MAXDIM+2].str==1 and - vartype == CCTK_VARIABLE_REAL) + vartype == CCTK_VARIABLE_INT) { - // Optimised version for CCTK_REAL and stride 1 - - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + // Optimised version for CCTK_INT and stride 1 - for (int vari=0; vari<nvaris; ++vari) { - CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]]; - CCTK_REAL const * restrict const dstdataptr = - (CCTK_REAL const *)&dstdata.front() + - dstoffset[n] + vari * dstelems[n]; - -# pragma omp parallel for - for (int k = 0; k < dstdetaillenk; ++k) { - for (int j = 0; j < dstdetaillenj; ++j) { - for (int i = 0; i < dstdetailleni; ++i) { - int ipos[SLAB_MAXDIM]; - ipos[0] = i; - ipos[1] = j; - ipos[2] = k; - int bufipos[SLAB_MAXDIM]; - int dstipos[SLAB_MAXDIM]; - for (int d=0; d<SLAB_MAXDIM; ++d) { - if (not info[d].flip) { - bufipos[d] = ipos[d]; - } else { - bufipos[d] = dstdetail[n*SLAB_MAXDIM+d].len - 1 - ipos[d]; - } - ifcheck assert (bufipos[d] >= 0 and bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len); - dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d]; - ifcheck assert (dstipos[d] >= info[d].dst.local.off and - dstipos[d] < info[d].dst.local.off + info[d].dst.local.len); - ifcheck assert (dstipos[d] >= info[d].dst.slab.off and - dstipos[d] <= info[d].dst.slab.off + info[d].dst.slab.len - 1); - } - size_t bufind = 0; - size_t dstind = 0; - for (int d=SLAB_MAXDIM-1; d>=0; --d) { - bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d]; - dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off; - } - ifcheck assert (bufind < (size_t)dstcount[n]); - ifcheck assert (dstind < dstlentot); - dstptr[dstind] = dstdataptr[bufind]; - } - } - } - - } // for vari + copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, info[0].flip==1, info[1].flip==1, info[2].flip==1); } else { // Generic, unoptimised version @@ -1884,3 +1775,5 @@ Slab_Transfer (cGH const * restrict const cctkGH, return Slab_MultiTransfer (cctkGH, dim, xferinfo, options, nvars, srctypes, srcptrs, dsttypes, dstptrs); } + +} // namespace Slab |