aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrhaas <rhaas@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8>2009-11-18 01:56:00 +0000
committerrhaas <rhaas@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8>2009-11-18 01:56:00 +0000
commit2f1d026b9438fc40217e596934dbf74c041ce4df (patch)
treea983edcd35af235178e7bf47580cfa378111c1bf
parent0fac8b2d14bde5de51f534ce57d03a53416cf6a4 (diff)
add optimized support for CCTK_INT, introduce templated functions
git-svn-id: http://svn.cactuscode.org/arrangements/CactusNumerical/Slab/trunk@65 2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8
-rw-r--r--src/slab.cc589
1 files changed, 241 insertions, 348 deletions
diff --git a/src/slab.cc b/src/slab.cc
index daa8908..a60eebc 100644
--- a/src/slab.cc
+++ b/src/slab.cc
@@ -74,6 +74,7 @@ using namespace std;
#endif
+namespace Slab {
static int timer_init = -1;
static int timer_copy_in = -1;
@@ -696,7 +697,149 @@ print_xferinfo (FILE * const out,
fprintf (out, " flip: %d\n", xferinfo->flip);
}
+// workhorse routine responsible for the actual copying/transposing of data
+template<typename T> inline void
+copy_data (const vector<xfer> &info,
+ const vector<bbox> &srcdetail,
+ const vector<int> &srcoffset,
+ const vector<int> &srcelems,
+ const vector<char> &srcdata,
+ void const * restrict const * restrict const srcptrs,
+ const int n,
+ const vector<int> &varis,
+ const int nvaris,
+ const int xpose_x=0,
+ const int xpose_y=1,
+ const int xpose_z=2)
+{
+ assert (srcptrs);
+
+ int const srcoffi = info[0].src.local.off;
+ int const srcoffj = info[1].src.local.off;
+ int const srcoffk = info[2].src.local.off;
+
+ int const srcleni = info[0].src.local.len;
+ int const srclenj = info[1].src.local.len;
+ int const srclenk = info[2].src.local.len;
+
+ int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off;
+ int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off;
+ int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off;
+
+ int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len;
+ int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len;
+ int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len;
+
+ int const dstdetailleni = srcdetail[n*SLAB_MAXDIM+xpose_x].len;
+ int const dstdetaillenj = srcdetail[n*SLAB_MAXDIM+xpose_y].len;
+ //int const dstdetaillenk = srcdetail[n*SLAB_MAXDIM+xpose_z].len; unused
+
+ if (n==0) assert (srcoffset[n]==0);
+ // TODO: This does not take nvaris into account
+ // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk);
+
+ ifcheck {
+ const int xpose[SLAB_MAXDIM] = {xpose_x, xpose_y, xpose_z};
+ for (int i = 0; i < SLAB_MAXDIM; ++i) {
+ for (int j = i+1; j < SLAB_MAXDIM; ++j) {
+ assert(xpose[i] != xpose[j]);
+ }
+ }
+ }
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ T * restrict const srcdataptr =
+ (T *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
+ T const * restrict const srcptr =
+ (T const *)srcptrs[varis[vari]];
+ assert(srcptr);
+
+# pragma omp parallel for
+ {
+ int ipos[SLAB_MAXDIM];
+ for (ipos[2] = 0; ipos[2] < srcdetaillenk; ++ipos[2]) {
+ for (ipos[1] = 0; ipos[1] < srcdetaillenj; ++ipos[1]) {
+ for (ipos[0] = 0; ipos[0] < srcdetailleni; ++ipos[0]) {
+ int const srcindi = srcdetailoffi + ipos[0] - srcoffi;
+ int const srcindj = srcdetailoffj + ipos[1] - srcoffj;
+ int const srcindk = srcdetailoffk + ipos[2] - srcoffk;
+ ifcheck assert (srcindi>=0 and srcindi<srcleni);
+ ifcheck assert (srcindj>=0 and srcindj<srclenj);
+ ifcheck assert (srcindk>=0 and srcindk<srclenk);
+ size_t const srcind =
+ srcindi + srcleni * (srcindj + srclenj * srcindk);
+ size_t const bufind =
+ ipos[xpose_x] + dstdetailleni * (ipos[xpose_y] + dstdetaillenj * ipos[xpose_z]);
+ srcdataptr[bufind] = srcptr[srcind];
+ }
+ }
+ }
+ }
+
+ } // for vari
+}
+// workhorse routine responsible for the actual copying/flipping of data
+template<typename T> inline void
+copy_data_back (const vector<xfer> &info,
+ const vector<bbox> &dstdetail,
+ const vector<int> &dstoffset,
+ const vector<int> &dstelems,
+ const vector<char> &dstdata,
+ void const * restrict const * restrict const dstptrs,
+ const int n,
+ const vector<int> &varis,
+ const int nvaris,
+ const bool flip_x=false,
+ const bool flip_y=false,
+ const bool flip_z=false)
+{
+ assert (dstptrs);
+
+ int const dstoffi = info[0].dst.local.off;
+ int const dstoffj = info[1].dst.local.off;
+ int const dstoffk = info[2].dst.local.off;
+
+ int const dstleni = info[0].dst.local.len;
+ int const dstlenj = info[1].dst.local.len;
+ int const dstlenk = info[2].dst.local.len;
+
+ int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
+ int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
+ int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ T * restrict const dstptr = (T *)dstptrs[varis[vari]];
+ assert (dstptr);
+ T const * restrict const dstdataptr =
+ (T const *)&dstdata.front() +
+ dstoffset[n] + vari * dstelems[n];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int const dstindi = dstdetailoffi + (flip_x ? dstdetailleni - 1 - i : i) - dstoffi;
+ int const dstindj = dstdetailoffj + (flip_y ? dstdetaillenj - 1 - j : j) - dstoffj;
+ int const dstindk = dstdetailoffk + (flip_z ? dstdetaillenk - 1 - k : k) - dstoffk;
+ ifcheck assert (dstindi>=0 and dstindi<dstleni);
+ ifcheck assert (dstindj>=0 and dstindj<dstlenj);
+ ifcheck assert (dstindk>=0 and dstindk<dstlenk);
+ size_t const dstind =
+ dstindi + dstleni * (dstindj + dstlenj * dstindk);
+ size_t const bufind =
+ i + dstdetailleni * (j + dstdetaillenj * k);
+ dstptr[dstind] = dstdataptr[bufind];
+ }
+ }
+ }
+
+ } // for vari
+}
extern "C"
int
@@ -1182,52 +1325,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH,
{
// Optimised version for a special case: no transposing
- int const srcoffi = info[0].src.local.off;
- int const srcoffj = info[1].src.local.off;
- int const srcoffk = info[2].src.local.off;
-
- int const srcleni = info[0].src.local.len;
- int const srclenj = info[1].src.local.len;
- int const srclenk = info[2].src.local.len;
-
- int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off;
- int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off;
- int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off;
-
- int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len;
- int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len;
- int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len;
-
- if (n==0) assert (srcoffset[n]==0);
- // TODO: This does not take nvaris into account
- // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk);
-
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const srcdataptr =
- (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
- CCTK_REAL const * restrict const srcptr =
- (CCTK_REAL const *)srcptrs[varis[vari]];
-
-# pragma omp parallel for
- for (int k = 0; k < srcdetaillenk; ++k) {
- for (int j = 0; j < srcdetaillenj; ++j) {
- for (int i = 0; i < srcdetailleni; ++i) {
- int const srcindi = srcdetailoffi + i - srcoffi;
- int const srcindj = srcdetailoffj + j - srcoffj;
- int const srcindk = srcdetailoffk + k - srcoffk;
- ifcheck assert (srcindi>=0 and srcindi<srcleni);
- ifcheck assert (srcindj>=0 and srcindj<srclenj);
- ifcheck assert (srcindk>=0 and srcindk<srclenk);
- size_t const srcind =
- srcindi + srcleni * (srcindj + srclenj * srcindk);
- size_t const bufind =
- i + srcdetailleni * (j + srcdetaillenj * k);
- srcdataptr[bufind] = srcptr[srcind];
- }
- }
- }
-
- } // for vari
+ copy_data<CCTK_REAL> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs,
+ n, varis, nvaris);
} else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and
srcdetail[n*SLAB_MAXDIM ].str==1 and
@@ -1237,52 +1336,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH,
{
// Optimised version for a special case: transpose x and y
- int const srcoffi = info[0].src.local.off;
- int const srcoffj = info[1].src.local.off;
- int const srcoffk = info[2].src.local.off;
-
- int const srcleni = info[0].src.local.len;
- int const srclenj = info[1].src.local.len;
- int const srclenk = info[2].src.local.len;
-
- int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off;
- int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off;
- int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off;
-
- int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len;
- int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len;
- int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len;
-
- if (n==0) assert (srcoffset[n]==0);
- // TODO: This does not take nvaris into account
- // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk);
-
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const srcdataptr =
- (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
- CCTK_REAL const * restrict const srcptr =
- (CCTK_REAL const *)srcptrs[varis[vari]];
-
-# pragma omp parallel for
- for (int k = 0; k < srcdetaillenk; ++k) {
- for (int j = 0; j < srcdetaillenj; ++j) {
- for (int i = 0; i < srcdetailleni; ++i) {
- int const srcindi = srcdetailoffi + i - srcoffi;
- int const srcindj = srcdetailoffj + j - srcoffj;
- int const srcindk = srcdetailoffk + k - srcoffk;
- ifcheck assert (srcindi>=0 and srcindi<srcleni);
- ifcheck assert (srcindj>=0 and srcindj<srclenj);
- ifcheck assert (srcindk>=0 and srcindk<srclenk);
- size_t const srcind =
- srcindi + srcleni * (srcindj + srclenj * srcindk);
- size_t const bufind =
- j + srcdetaillenj * (i + srcdetailleni * k);
- srcdataptr[bufind] = srcptr[srcind];
- }
- }
- }
-
- } // for vari
+ copy_data<CCTK_REAL> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs,
+ n, varis, nvaris, 1, 0, 2);
} else if (srcdetail[n*SLAB_MAXDIM ].str==1 and
srcdetail[n*SLAB_MAXDIM+1].str==1 and
@@ -1291,53 +1346,40 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH,
{
// Optimised version for CCTK_REAL and stride 1
- int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+info[0].xpose].len;
- int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+info[1].xpose].len;
- int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+info[2].xpose].len;
+ copy_data<CCTK_REAL> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs,
+ n, varis, nvaris, info[0].xpose, info[1].xpose, info[2].xpose);
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const srcdataptr =
- (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
- CCTK_REAL const * restrict const srcptr =
- (CCTK_REAL const *)srcptrs[varis[vari]];
-
-# pragma omp parallel for
- for (int k = 0; k < srcdetaillenk; ++k) {
- for (int j = 0; j < srcdetaillenj; ++j) {
- for (int i = 0; i < srcdetailleni; ++i) {
- int ipos[SLAB_MAXDIM];
- ipos[0] = i;
- ipos[1] = j;
- ipos[2] = k;
- int srcipos[SLAB_MAXDIM];
- int bufipos[SLAB_MAXDIM];
- for (int d=0; d<SLAB_MAXDIM; ++d) {
- int const c = info[d].xpose;
- srcipos[c] = srcdetail[n*SLAB_MAXDIM+c].off + ipos[d];
- assert (srcipos[c] >= info[c].src.local.off and
- srcipos[c] < info[c].src.local.off + info[c].src.local.len);
- assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off and
- srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1));
- bufipos[d] = ipos[d];
- assert (bufipos[d] >= 0 and
- bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len);
- }
- size_t srcind = 0;
- size_t bufind = 0;
- for (int d=SLAB_MAXDIM-1; d>=0; --d) {
- int const c = info[d].xpose;
- srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off;
- bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d];
- }
- assert (srcind < srclentot);
- assert (bufind < (size_t)srccount[n]);
- srcdataptr[bufind] = srcptr[srcind];
- }
- }
- }
-
- } // for vari
+ } else if (info[0].xpose==0 and info[1].xpose==1 and info[2].xpose==2 and
+ srcdetail[n*SLAB_MAXDIM ].str==1 and
+ srcdetail[n*SLAB_MAXDIM+1].str==1 and
+ srcdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_INT)
+ {
+ // Optimised version for a special case: no transposing
+ copy_data<CCTK_INT> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs,
+ n, varis, nvaris);
+
+ } else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and
+ srcdetail[n*SLAB_MAXDIM ].str==1 and
+ srcdetail[n*SLAB_MAXDIM+1].str==1 and
+ srcdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_INT)
+ {
+ // Optimised version for a special case: transpose x and y
+
+ copy_data<CCTK_INT> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs,
+ n, varis, nvaris, 1, 0, 2);
+
+ } else if (srcdetail[n*SLAB_MAXDIM ].str==1 and
+ srcdetail[n*SLAB_MAXDIM+1].str==1 and
+ srcdetail[n*SLAB_MAXDIM+2].str==1)
+ {
+ // Optimised version for CCTK_INT and stride 1
+
+ copy_data<CCTK_INT> (info, srcdetail, srcoffset, srcelems, srcdata, srcptrs,
+ n, varis, nvaris, info[0].xpose, info[1].xpose, info[2].xpose);
+
} else {
// Generic, unoptimised version
@@ -1478,48 +1520,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH,
{
// Optimised version for a special case: no flipping
- int const dstoffi = info[0].dst.local.off;
- int const dstoffj = info[1].dst.local.off;
- int const dstoffk = info[2].dst.local.off;
-
- int const dstleni = info[0].dst.local.len;
- int const dstlenj = info[1].dst.local.len;
- int const dstlenk = info[2].dst.local.len;
-
- int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
- int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
- int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
-
- int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
- int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
- int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
-
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
- CCTK_REAL const * restrict const dstdataptr =
- (CCTK_REAL const *)&dstdata.front() +
- dstoffset[n] + vari * dstelems[n];
-
-# pragma omp parallel for
- for (int k = 0; k < dstdetaillenk; ++k) {
- for (int j = 0; j < dstdetaillenj; ++j) {
- for (int i = 0; i < dstdetailleni; ++i) {
- int const dstindi = dstdetailoffi + i - dstoffi;
- int const dstindj = dstdetailoffj + j - dstoffj;
- int const dstindk = dstdetailoffk + k - dstoffk;
- ifcheck assert (dstindi>=0 and dstindi<dstleni);
- ifcheck assert (dstindj>=0 and dstindj<dstlenj);
- ifcheck assert (dstindk>=0 and dstindk<dstlenk);
- size_t const dstind =
- dstindi + dstleni * (dstindj + dstlenj * dstindk);
- size_t const bufind =
- i + dstdetailleni * (j + dstdetaillenj * k);
- dstptr[dstind] = dstdataptr[bufind];
- }
- }
- }
-
- } // for vari
+ copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris);
} else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and
dstdetail[n*SLAB_MAXDIM ].str==1 and
@@ -1529,48 +1531,8 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH,
{
// Optimised version for a special case: flip in x direction
- int const dstoffi = info[0].dst.local.off;
- int const dstoffj = info[1].dst.local.off;
- int const dstoffk = info[2].dst.local.off;
-
- int const dstleni = info[0].dst.local.len;
- int const dstlenj = info[1].dst.local.len;
- int const dstlenk = info[2].dst.local.len;
-
- int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
- int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
- int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
-
- int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
- int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
- int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
-
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
- CCTK_REAL const * restrict const dstdataptr =
- (CCTK_REAL const *)&dstdata.front() +
- dstoffset[n] + vari * dstelems[n];
-
-# pragma omp parallel for
- for (int k = 0; k < dstdetaillenk; ++k) {
- for (int j = 0; j < dstdetaillenj; ++j) {
- for (int i = 0; i < dstdetailleni; ++i) {
- int const dstindi = dstdetailoffi + i - dstoffi;
- int const dstindj = dstdetailoffj + j - dstoffj;
- int const dstindk = dstdetailoffk + k - dstoffk;
- ifcheck assert (dstindi>=0 and dstindi<dstleni);
- ifcheck assert (dstindj>=0 and dstindj<dstlenj);
- ifcheck assert (dstindk>=0 and dstindk<dstlenk);
- size_t const dstind =
- dstindi + dstleni * (dstindj + dstlenj * dstindk);
- size_t const bufind =
- (dstdetailleni - 1 - i) + dstdetailleni * (j + dstdetaillenj * k);
- dstptr[dstind] = dstdataptr[bufind];
- }
- }
- }
-
- } // for vari
+ copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, true);
} else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and
dstdetail[n*SLAB_MAXDIM ].str==1 and
@@ -1580,154 +1542,83 @@ Slab_MultiTransfer (cGH const * restrict const cctkGH,
{
// Optimised version for a special case: flip in y direction
- int const dstoffi = info[0].dst.local.off;
- int const dstoffj = info[1].dst.local.off;
- int const dstoffk = info[2].dst.local.off;
+ copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, false, true);
- int const dstleni = info[0].dst.local.len;
- int const dstlenj = info[1].dst.local.len;
- int const dstlenk = info[2].dst.local.len;
+ } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: flip in x and y direction
- int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
- int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
- int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+ copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, true, true);
- int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
- int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
- int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+ } else if (dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for CCTK_REAL and stride 1
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
- CCTK_REAL const * restrict const dstdataptr =
- (CCTK_REAL const *)&dstdata.front() +
- dstoffset[n] + vari * dstelems[n];
+ copy_data_back<CCTK_REAL> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, info[0].flip==1, info[1].flip==1, info[2].flip==1);
-# pragma omp parallel for
- for (int k = 0; k < dstdetaillenk; ++k) {
- for (int j = 0; j < dstdetaillenj; ++j) {
- for (int i = 0; i < dstdetailleni; ++i) {
- int const dstindi = dstdetailoffi + i - dstoffi;
- int const dstindj = dstdetailoffj + j - dstoffj;
- int const dstindk = dstdetailoffk + k - dstoffk;
- ifcheck assert (dstindi>=0 and dstindi<dstleni);
- ifcheck assert (dstindj>=0 and dstindj<dstlenj);
- ifcheck assert (dstindk>=0 and dstindk<dstlenk);
- size_t const dstind =
- dstindi + dstleni * (dstindj + dstlenj * dstindk);
- size_t const bufind =
- i + dstdetailleni * ((dstdetaillenj - 1 - j) + dstdetaillenj * k);
- dstptr[dstind] = dstdataptr[bufind];
- }
- }
- }
-
- } // for vari
+ } else if (info[0].flip==0 and info[1].flip==0 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_INT)
+ {
+ // Optimised version for a special case: no flipping
- } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and
+ copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris);
+
+ } else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and
dstdetail[n*SLAB_MAXDIM ].str==1 and
dstdetail[n*SLAB_MAXDIM+1].str==1 and
dstdetail[n*SLAB_MAXDIM+2].str==1 and
- vartype == CCTK_VARIABLE_REAL)
+ vartype == CCTK_VARIABLE_INT)
{
- // Optimised version for a special case: flip in x and y direction
+ // Optimised version for a special case: flip in x direction
- int const dstoffi = info[0].dst.local.off;
- int const dstoffj = info[1].dst.local.off;
- int const dstoffk = info[2].dst.local.off;
+ copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, true);
- int const dstleni = info[0].dst.local.len;
- int const dstlenj = info[1].dst.local.len;
- int const dstlenk = info[2].dst.local.len;
+ } else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_INT)
+ {
+ // Optimised version for a special case: flip in y direction
- int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
- int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
- int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+ copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, false, true);
- int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
- int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
- int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+ } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_INT)
+ {
+ // Optimised version for a special case: flip in x and y direction
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
- CCTK_REAL const * restrict const dstdataptr =
- (CCTK_REAL const *)&dstdata.front() +
- dstoffset[n] + vari * dstelems[n];
-
-# pragma omp parallel for
- for (int k = 0; k < dstdetaillenk; ++k) {
- for (int j = 0; j < dstdetaillenj; ++j) {
- for (int i = 0; i < dstdetailleni; ++i) {
- int const dstindi = dstdetailoffi + i - dstoffi;
- int const dstindj = dstdetailoffj + j - dstoffj;
- int const dstindk = dstdetailoffk + k - dstoffk;
- ifcheck assert (dstindi>=0 and dstindi<dstleni);
- ifcheck assert (dstindj>=0 and dstindj<dstlenj);
- ifcheck assert (dstindk>=0 and dstindk<dstlenk);
- size_t const dstind =
- dstindi + dstleni * (dstindj + dstlenj * dstindk);
- size_t const bufind =
- (dstdetailleni - 1 - i) + dstdetailleni * ((dstdetaillenj - 1 - j) + dstdetaillenj * k);
- dstptr[dstind] = dstdataptr[bufind];
- }
- }
- }
-
- } // for vari
+ copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, true, true);
} else if (dstdetail[n*SLAB_MAXDIM ].str==1 and
dstdetail[n*SLAB_MAXDIM+1].str==1 and
dstdetail[n*SLAB_MAXDIM+2].str==1 and
- vartype == CCTK_VARIABLE_REAL)
+ vartype == CCTK_VARIABLE_INT)
{
- // Optimised version for CCTK_REAL and stride 1
-
- int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
- int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
- int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+ // Optimised version for CCTK_INT and stride 1
- for (int vari=0; vari<nvaris; ++vari) {
- CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
- CCTK_REAL const * restrict const dstdataptr =
- (CCTK_REAL const *)&dstdata.front() +
- dstoffset[n] + vari * dstelems[n];
-
-# pragma omp parallel for
- for (int k = 0; k < dstdetaillenk; ++k) {
- for (int j = 0; j < dstdetaillenj; ++j) {
- for (int i = 0; i < dstdetailleni; ++i) {
- int ipos[SLAB_MAXDIM];
- ipos[0] = i;
- ipos[1] = j;
- ipos[2] = k;
- int bufipos[SLAB_MAXDIM];
- int dstipos[SLAB_MAXDIM];
- for (int d=0; d<SLAB_MAXDIM; ++d) {
- if (not info[d].flip) {
- bufipos[d] = ipos[d];
- } else {
- bufipos[d] = dstdetail[n*SLAB_MAXDIM+d].len - 1 - ipos[d];
- }
- ifcheck assert (bufipos[d] >= 0 and bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len);
- dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d];
- ifcheck assert (dstipos[d] >= info[d].dst.local.off and
- dstipos[d] < info[d].dst.local.off + info[d].dst.local.len);
- ifcheck assert (dstipos[d] >= info[d].dst.slab.off and
- dstipos[d] <= info[d].dst.slab.off + info[d].dst.slab.len - 1);
- }
- size_t bufind = 0;
- size_t dstind = 0;
- for (int d=SLAB_MAXDIM-1; d>=0; --d) {
- bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d];
- dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off;
- }
- ifcheck assert (bufind < (size_t)dstcount[n]);
- ifcheck assert (dstind < dstlentot);
- dstptr[dstind] = dstdataptr[bufind];
- }
- }
- }
-
- } // for vari
+ copy_data_back<CCTK_INT> (info, dstdetail, dstoffset, dstelems, dstdata, dstptrs,
+ n, varis, nvaris, info[0].flip==1, info[1].flip==1, info[2].flip==1);
} else {
// Generic, unoptimised version
@@ -1884,3 +1775,5 @@ Slab_Transfer (cGH const * restrict const cctkGH,
return Slab_MultiTransfer (cctkGH, dim, xferinfo, options,
nvars, srctypes, srcptrs, dsttypes, dstptrs);
}
+
+} // namespace Slab