From 2f1d026b9438fc40217e596934dbf74c041ce4df Mon Sep 17 00:00:00 2001 From: rhaas Date: Wed, 18 Nov 2009 01:56:00 +0000 Subject: add optimized support for CCTK_INT, introduce templated functions git-svn-id: http://svn.cactuscode.org/arrangements/CactusNumerical/Slab/trunk@65 2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8 --- src/slab.cc | 589 +++++++++++++++++++++++++----------------------------------- 1 file changed, 241 insertions(+), 348 deletions(-) diff --git a/src/slab.cc b/src/slab.cc index daa8908..a60eebc 100644 --- a/src/slab.cc +++ b/src/slab.cc @@ -74,6 +74,7 @@ using namespace std; #endif +namespace Slab { static int timer_init = -1; static int timer_copy_in = -1; @@ -696,7 +697,149 @@ print_xferinfo (FILE * const out, fprintf (out, " flip: %d\n", xferinfo->flip); } +// workhorse routine responsible for the actual copying/transposing of data +template inline void +copy_data (const vector &info, + const vector &srcdetail, + const vector &srcoffset, + const vector &srcelems, + const vector &srcdata, + void const * restrict const * restrict const srcptrs, + const int n, + const vector &varis, + const int nvaris, + const int xpose_x=0, + const int xpose_y=1, + const int xpose_z=2) +{ + assert (srcptrs); + + int const srcoffi = info[0].src.local.off; + int const srcoffj = info[1].src.local.off; + int const srcoffk = info[2].src.local.off; + + int const srcleni = info[0].src.local.len; + int const srclenj = info[1].src.local.len; + int const srclenk = info[2].src.local.len; + + int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off; + int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off; + int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off; + + int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len; + int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len; + int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len; + + int const dstdetailleni = srcdetail[n*SLAB_MAXDIM+xpose_x].len; + int const dstdetaillenj = srcdetail[n*SLAB_MAXDIM+xpose_y].len; + //int const dstdetaillenk = srcdetail[n*SLAB_MAXDIM+xpose_z].len; unused + + if (n==0) assert (srcoffset[n]==0); + // TODO: This does not take nvaris into account + // if (n=0 and srcindi=0 and srcindj=0 and srcindk inline void +copy_data_back (const vector &info, + const vector &dstdetail, + const vector &dstoffset, + const vector &dstelems, + const vector &dstdata, + void const * restrict const * restrict const dstptrs, + const int n, + const vector &varis, + const int nvaris, + const bool flip_x=false, + const bool flip_y=false, + const bool flip_z=false) +{ + assert (dstptrs); + + int const dstoffi = info[0].dst.local.off; + int const dstoffj = info[1].dst.local.off; + int const dstoffk = info[2].dst.local.off; + + int const dstleni = info[0].dst.local.len; + int const dstlenj = info[1].dst.local.len; + int const dstlenk = info[2].dst.local.len; + + int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; + int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; + int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + + int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; + int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; + int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + + for (int vari=0; vari=0 and dstindi=0 and dstindj=0 and dstindk=0 and srcindi=0 and srcindj=0 and srcindk (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris); } else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and srcdetail[n*SLAB_MAXDIM ].str==1 and @@ -1237,52 +1336,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: transpose x and y - int const srcoffi = info[0].src.local.off; - int const srcoffj = info[1].src.local.off; - int const srcoffk = info[2].src.local.off; - - int const srcleni = info[0].src.local.len; - int const srclenj = info[1].src.local.len; - int const srclenk = info[2].src.local.len; - - int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off; - int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off; - int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off; - - int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len; - int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len; - int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len; - - if (n==0) assert (srcoffset[n]==0); - // TODO: This does not take nvaris into account - // if (n=0 and srcindi=0 and srcindj=0 and srcindk (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, 1, 0, 2); } else if (srcdetail[n*SLAB_MAXDIM ].str==1 and srcdetail[n*SLAB_MAXDIM+1].str==1 and @@ -1291,53 +1346,40 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for CCTK_REAL and stride 1 - int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+info[0].xpose].len; - int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+info[1].xpose].len; - int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+info[2].xpose].len; + copy_data (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, info[0].xpose, info[1].xpose, info[2].xpose); - for (int vari=0; vari= info[c].src.local.off and - srcipos[c] < info[c].src.local.off + info[c].src.local.len); - assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off and - srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1)); - bufipos[d] = ipos[d]; - assert (bufipos[d] >= 0 and - bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len); - } - size_t srcind = 0; - size_t bufind = 0; - for (int d=SLAB_MAXDIM-1; d>=0; --d) { - int const c = info[d].xpose; - srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off; - bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d]; - } - assert (srcind < srclentot); - assert (bufind < (size_t)srccount[n]); - srcdataptr[bufind] = srcptr[srcind]; - } - } - } - - } // for vari + } else if (info[0].xpose==0 and info[1].xpose==1 and info[2].xpose==2 and + srcdetail[n*SLAB_MAXDIM ].str==1 and + srcdetail[n*SLAB_MAXDIM+1].str==1 and + srcdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: no transposing + copy_data (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris); + + } else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and + srcdetail[n*SLAB_MAXDIM ].str==1 and + srcdetail[n*SLAB_MAXDIM+1].str==1 and + srcdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: transpose x and y + + copy_data (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, 1, 0, 2); + + } else if (srcdetail[n*SLAB_MAXDIM ].str==1 and + srcdetail[n*SLAB_MAXDIM+1].str==1 and + srcdetail[n*SLAB_MAXDIM+2].str==1) + { + // Optimised version for CCTK_INT and stride 1 + + copy_data (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs, + n, varis, nvaris, info[0].xpose, info[1].xpose, info[2].xpose); + } else { // Generic, unoptimised version @@ -1478,48 +1520,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: no flipping - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; - - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; - - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; - - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; - - for (int vari=0; vari=0 and dstindi=0 and dstindj=0 and dstindk (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris); } else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and dstdetail[n*SLAB_MAXDIM ].str==1 and @@ -1529,48 +1531,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: flip in x direction - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; - - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; - - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; - - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; - - for (int vari=0; vari=0 and dstindi=0 and dstindj=0 and dstindk (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true); } else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and dstdetail[n*SLAB_MAXDIM ].str==1 and @@ -1580,154 +1542,83 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH, { // Optimised version for a special case: flip in y direction - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; + copy_data_back (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, false, true); - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; + } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_REAL) + { + // Optimised version for a special case: flip in x and y direction - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + copy_data_back (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true, true); - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + } else if (dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_REAL) + { + // Optimised version for CCTK_REAL and stride 1 - for (int vari=0; vari (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, info[0].flip==1, info[1].flip==1, info[2].flip==1); -# pragma omp parallel for - for (int k = 0; k < dstdetaillenk; ++k) { - for (int j = 0; j < dstdetaillenj; ++j) { - for (int i = 0; i < dstdetailleni; ++i) { - int const dstindi = dstdetailoffi + i - dstoffi; - int const dstindj = dstdetailoffj + j - dstoffj; - int const dstindk = dstdetailoffk + k - dstoffk; - ifcheck assert (dstindi>=0 and dstindi=0 and dstindj=0 and dstindk (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris); + + } else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and dstdetail[n*SLAB_MAXDIM ].str==1 and dstdetail[n*SLAB_MAXDIM+1].str==1 and dstdetail[n*SLAB_MAXDIM+2].str==1 and - vartype == CCTK_VARIABLE_REAL) + vartype == CCTK_VARIABLE_INT) { - // Optimised version for a special case: flip in x and y direction + // Optimised version for a special case: flip in x direction - int const dstoffi = info[0].dst.local.off; - int const dstoffj = info[1].dst.local.off; - int const dstoffk = info[2].dst.local.off; + copy_data_back (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true); - int const dstleni = info[0].dst.local.len; - int const dstlenj = info[1].dst.local.len; - int const dstlenk = info[2].dst.local.len; + } else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: flip in y direction - int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off; - int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off; - int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off; + copy_data_back (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, false, true); - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and + dstdetail[n*SLAB_MAXDIM ].str==1 and + dstdetail[n*SLAB_MAXDIM+1].str==1 and + dstdetail[n*SLAB_MAXDIM+2].str==1 and + vartype == CCTK_VARIABLE_INT) + { + // Optimised version for a special case: flip in x and y direction - for (int vari=0; vari=0 and dstindi=0 and dstindj=0 and dstindk (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, true, true); } else if (dstdetail[n*SLAB_MAXDIM ].str==1 and dstdetail[n*SLAB_MAXDIM+1].str==1 and dstdetail[n*SLAB_MAXDIM+2].str==1 and - vartype == CCTK_VARIABLE_REAL) + vartype == CCTK_VARIABLE_INT) { - // Optimised version for CCTK_REAL and stride 1 - - int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len; - int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len; - int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len; + // Optimised version for CCTK_INT and stride 1 - for (int vari=0; vari= 0 and bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len); - dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d]; - ifcheck assert (dstipos[d] >= info[d].dst.local.off and - dstipos[d] < info[d].dst.local.off + info[d].dst.local.len); - ifcheck assert (dstipos[d] >= info[d].dst.slab.off and - dstipos[d] <= info[d].dst.slab.off + info[d].dst.slab.len - 1); - } - size_t bufind = 0; - size_t dstind = 0; - for (int d=SLAB_MAXDIM-1; d>=0; --d) { - bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d]; - dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off; - } - ifcheck assert (bufind < (size_t)dstcount[n]); - ifcheck assert (dstind < dstlentot); - dstptr[dstind] = dstdataptr[bufind]; - } - } - } - - } // for vari + copy_data_back (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs, + n, varis, nvaris, info[0].flip==1, info[1].flip==1, info[2].flip==1); } else { // Generic, unoptimised version @@ -1884,3 +1775,5 @@ Slab_Transfer (cGH const * restrict const cctkGH, return Slab_MultiTransfer (cctkGH, dim, xferinfo, options, nvars, srctypes, srcptrs, dsttypes, dstptrs); } + +} // namespace Slab -- cgit v1.2.3