aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorschnetter <schnetter@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8>2009-10-25 23:34:31 +0000
committerschnetter <schnetter@2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8>2009-10-25 23:34:31 +0000
commitc09c77a5a8ad5c3bea9ff1b0c8aff9987d1a565b (patch)
treeb713ce2deb481e3cf9edc45629e7c05c03902ee0
parent743135c3e6349769c8f815f190192b520dbd98da (diff)
Update thorn TAT/Slab:
- handle different variable types in Slab_MultiTransfer Also: - rewrite in C++ - clean up code git-svn-id: http://svn.cactuscode.org/arrangements/CactusNumerical/Slab/trunk@61 2e825fa2-fb71-486d-8b7f-a5ff3f0f6cb8
-rw-r--r--src/slab.cc1886
1 files changed, 1886 insertions, 0 deletions
diff --git a/src/slab.cc b/src/slab.cc
new file mode 100644
index 0000000..0ac4dc5
--- /dev/null
+++ b/src/slab.cc
@@ -0,0 +1,1886 @@
+// TODO:
+// Provide facilities for dim > 3
+// Set up the slab exchange information in advance
+// Slab in several stages
+// Test slabbing without MPI
+// Allow not setting the ghost zones
+// Allow not using / not setting the boundaries
+// Allow different numbers of ghost zones at the lower and upper boundary
+
+
+
+// Print debug information?
+#undef DEBUG
+
+// Perform expensive self-checks?
+#define CHECK
+
+// Omit all self-checks? (Overrides CHECK)
+#undef NDEBUG
+
+// Byte value for poison checks: use 255 for nan, or e.g. 113 for a
+// large value
+#define POISON_VALUE 254
+
+
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "cctk.h"
+#include "cctk_Arguments.h"
+#include "cctk_Parameters.h"
+#include "cctk_DefineThorn.h"
+#include "util_ErrorCodes.h"
+#include "util_Table.h"
+
+#ifdef CCTK_MPI
+# include <mpi.h>
+# define HAVE_MPI 1
+#else
+# define HAVE_MPI 0
+#endif
+
+#include "slab.h"
+
+#ifdef CCTK_CXX_RESTRICT
+# undef restrict
+# define restrict CCTK_CXX_RESTRICT
+#endif
+
+using namespace std;
+
+
+
+#ifdef DEBUG
+# define ifdebug
+#else
+# define ifdebug while (0)
+#endif
+
+#ifdef CHECK
+# define ifcheck
+#else
+# define ifcheck while (0)
+#endif
+
+#ifdef NDEBUG
+# define check(x) ((x) ? 0 : CCTK_WARN (0, "internal error"))
+#else
+# define check(x) assert (x)
+#endif
+
+
+
+static int timer_init = -1;
+static int timer_copy_in = -1;
+static int timer_xfer = -1;
+static int timer_copy_back = -1;
+
+extern "C"
+void
+Slab_InitTimers (CCTK_ARGUMENTS)
+{
+ DECLARE_CCTK_ARGUMENTS;
+ timer_init = CCTK_TimerCreate ("Slab/init");
+ timer_copy_in = CCTK_TimerCreate ("Slab/copy in");
+ timer_xfer = CCTK_TimerCreate ("Slab/xfer");
+ timer_copy_back = CCTK_TimerCreate ("Slab/copy back");
+}
+
+extern "C"
+int
+Slab_PrintTimers ()
+{
+ CCTK_TimerPrintDataI (timer_init , -1);
+ CCTK_TimerPrintDataI (timer_copy_in , -1);
+ CCTK_TimerPrintDataI (timer_xfer , -1);
+ CCTK_TimerPrintDataI (timer_copy_back, -1);
+ return 0;
+}
+
+
+
+// Find out which driver to use
+#ifdef CCTK_MPI
+# if defined CARPET_CARPET
+# include "Carpet/Carpet/src/carpet_public.h"
+# endif
+# if defined CACTUSPUGH_PUGH
+# include "CactusPUGH/PUGH/src/include/pugh.h"
+# endif
+#endif
+
+
+
+#ifdef CCTK_MPI
+// Determine MPI type sizes
+
+# define CACTUS_MPI_BYTE MPI_CHAR
+
+# define CACTUS_MPI_INT1 MPI_CHAR
+
+# if SIZEOF_SHORT_INT == 2
+# define CACTUS_MPI_INT2 MPI_SHORT
+# elif SIZEOF_INT == 2
+# define CACTUS_MPI_INT2 MPI_INT
+# elif SIZEOF_LONG_INT == 2
+# define CACTUS_MPI_INT2 MPI_LONG
+# elif SIZEOF_LONG_LONG == 2
+# define CACTUS_MPI_INT2 MPI_LONG_LONG_INT
+# endif
+
+# if SIZEOF_SHORT_INT == 4
+# define CACTUS_MPI_INT4 MPI_SHORT
+# elif SIZEOF_INT == 4
+# define CACTUS_MPI_INT4 MPI_INT
+# elif SIZEOF_LONG_INT == 4
+# define CACTUS_MPI_INT4 MPI_LONG
+# elif SIZEOF_LONG_LONG == 4
+# define CACTUS_MPI_INT4 MPI_LONG_LONG_INT
+# endif
+
+# if SIZEOF_SHORT_INT == 8
+# define CACTUS_MPI_INT8 MPI_SHORT
+# elif SIZEOF_INT == 8
+# define CACTUS_MPI_INT8 MPI_INT
+# elif SIZEOF_LONG_INT == 8
+# define CACTUS_MPI_INT8 MPI_LONG
+# elif SIZEOF_LONG_LONG == 8
+# define CACTUS_MPI_INT8 MPI_LONG_LONG_INT
+# endif
+
+# if SIZEOF_FLOAT == 4
+# define CACTUS_MPI_REAL4 MPI_FLOAT
+# elif SIZEOF_DOUBLE == 4
+# define CACTUS_MPI_REAL4 MPI_DOUBLE
+# elif SIZEOF_LONG_DOUBLE == 4
+# define CACTUS_MPI_REAL4 MPI_LONG_DOUBLE
+# endif
+
+# if SIZEOF_FLOAT == 8
+# define CACTUS_MPI_REAL8 MPI_FLOAT
+# elif SIZEOF_DOUBLE == 8
+# define CACTUS_MPI_REAL8 MPI_DOUBLE
+# elif SIZEOF_LONG_DOUBLE == 8
+# define CACTUS_MPI_REAL8 MPI_LONG_DOUBLE
+# endif
+
+# if SIZEOF_FLOAT == 16
+# define CACTUS_MPI_REAL16 MPI_FLOAT
+# elif SIZEOF_DOUBLE == 16
+# define CACTUS_MPI_REAL16 MPI_DOUBLE
+# elif SIZEOF_LONG_DOUBLE == 16
+# define CACTUS_MPI_REAL16 MPI_LONG_DOUBLE
+# endif
+
+static MPI_Datatype CACTUS_MPI_COMPLEX8;
+static MPI_Datatype CACTUS_MPI_COMPLEX16;
+static MPI_Datatype CACTUS_MPI_COMPLEX32;
+
+#endif
+
+
+
+// Replace MPI functions if MPI is disabled
+#ifndef CCTK_MPI
+
+typedef int MPI_Comm;
+
+typedef enum {
+ CACTUS_MPI_BYTE = CCTK_VARIABLE_BYTE,
+ CACTUS_MPI_INT = CCTK_VARIABLE_INT,
+ CACTUS_MPI_INT1 = CCTK_VARIABLE_INT1,
+ CACTUS_MPI_INT2 = CCTK_VARIABLE_INT2,
+ CACTUS_MPI_INT4 = CCTK_VARIABLE_INT4,
+ CACTUS_MPI_INT8 = CCTK_VARIABLE_INT8,
+ CACTUS_MPI_REAL = CCTK_VARIABLE_REAL,
+ CACTUS_MPI_REAL4 = CCTK_VARIABLE_REAL4,
+ CACTUS_MPI_REAL8 = CCTK_VARIABLE_REAL8,
+ CACTUS_MPI_REAL16 = CCTK_VARIABLE_REAL16,
+ CACTUS_MPI_COMPLEX = CCTK_VARIABLE_COMPLEX,
+ CACTUS_MPI_COMPLEX8 = CCTK_VARIABLE_COMPLEX8,
+ CACTUS_MPI_COMPLEX16 = CCTK_VARIABLE_COMPLEX16,
+ CACTUS_MPI_COMPLEX32 = CCTK_VARIABLE_COMPLEX32
+} MPI_Datatype;
+
+static MPI_Datatype MPI_INT;
+
+typedef enum { MPI_MIN, MPI_MAX } MPI_Op;
+
+static int
+MPI_Barrier (MPI_Comm comm)
+{
+ return 0;
+}
+
+static int
+MPI_Comm_size (MPI_Comm comm, int * size)
+{
+ *size = 1;
+ return 0;
+}
+
+static int
+MPI_Comm_rank (MPI_Comm comm, int * rank)
+{
+ *rank = 0;
+ return 0;
+}
+
+static int
+MPI_Allgather (void * sendbuf, int sendcnt, int sendtype,
+ void * recvbuf, int recvcnt, int recvtype,
+ MPI_Comm comm)
+{
+ int recvsize;
+ assert (sendbuf);
+ assert (recvbuf);
+ assert (sendcnt == recvcnt);
+ assert (recvcnt >= 0);
+ assert (sendtype == recvtype);
+ recvsize = CCTK_VarTypeSize (recvtype);
+ assert (recvsize > 0);
+ memcpy (recvbuf, sendbuf, recvcnt * recvsize);
+ return 0;
+}
+
+static int
+MPI_Alltoall (void * sendbuf, int sendcnt, int sendtype,
+ void * recvbuf, int recvcnt, int recvtype,
+ MPI_Comm comm)
+{
+ int recvsize;
+ assert (sendbuf);
+ assert (recvbuf);
+ assert (sendcnt == recvcnt);
+ assert (recvcnt >= 0);
+ assert (sendtype == recvtype);
+ recvsize = CCTK_VarTypeSize (recvtype);
+ assert (recvsize > 0);
+ memcpy (recvbuf, sendbuf, recvcnt * recvsize);
+ return 0;
+}
+
+static int
+MPI_Alltoallv (void * sendbuf, int * sendcnt, int * sendoff, int sendtype,
+ void * recvbuf, int * recvcnt, int * recvoff, int recvtype,
+ MPI_Comm comm)
+{
+ int recvsize;
+ assert (sendbuf);
+ assert (recvbuf);
+ assert (sendcnt);
+ assert (recvcnt);
+ assert (*sendcnt == *recvcnt);
+ assert (*recvcnt >= 0);
+ assert (sendoff);
+ assert (recvoff);
+ assert (*sendoff == 0);
+ assert (*recvoff == 0);
+ assert (sendtype == recvtype);
+ recvsize = CCTK_VarTypeSize (recvtype);
+ assert (recvsize > 0);
+ memcpy (recvbuf, sendbuf, *recvcnt * recvsize);
+ return 0;
+}
+
+static int
+MPI_Allreduce (void * sendbuf, void * recvbuf, int count,
+ MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+ int recvsize;
+ assert (sendbuf);
+ assert (recvbuf);
+ assert (count >= 0);
+ recvsize = CCTK_VarTypeSize (datatype);
+ assert (recvsize > 0);
+ memcpy (recvbuf, sendbuf, count * recvsize);
+ return 0;
+}
+
+typedef int MPI_Request; // dummy
+typedef int MPI_Status; // dummy
+#define MPI_REQUEST_NULL 0 // dummy
+#define MPI_STATUSES_IGNORE 0 // dummy
+
+static int
+MPI_Irecv (void *buf, int count, MPI_Datatype datatype,
+ int source, int tag, MPI_Comm comm, MPI_Request *request)
+{
+ abort();
+}
+
+static int
+MPI_Isend (void *buf, int count, MPI_Datatype datatype, int dest,
+ int tag, MPI_Comm comm, MPI_Request *request)
+{
+ abort();
+}
+
+static int
+MPI_Waitall (int count, MPI_Request *array_of_requests,
+ MPI_Status *array_of_statuses)
+{
+ abort();
+}
+
+#endif
+
+
+
+// Get the MPI COMM_WOLRD communicator from the driver
+static MPI_Comm
+get_mpi_comm (cGH const * restrict const cctkGH)
+{
+#ifdef CCTK_MPI
+ if (CCTK_IsFunctionAliased ("GetMPICommWorld")) {
+ return * (MPI_Comm const *) GetMPICommWorld (cctkGH);
+ }
+# if defined CACTUSPUGH_PUGH
+ {
+ static int PUGH_active = -1;
+ if (PUGH_active == -1) PUGH_active = CCTK_IsThornActive ("PUGH");
+ assert (PUGH_active >= 0);
+ if (PUGH_active) return PUGH_pGH(cctkGH)->PUGH_COMM_WORLD;
+ }
+# endif
+ return MPI_COMM_WORLD;
+#else
+ return 0;
+#endif
+}
+
+
+
+// Initialise the MPI datatypes for complex variables
+extern "C"
+int
+Slab_InitMPIDatatypes ()
+{
+#ifdef CCTK_MPI
+# ifdef HAVE_CCTK_REAL4
+ MPI_Type_contiguous (2, CACTUS_MPI_REAL4, &CACTUS_MPI_COMPLEX8);
+ MPI_Type_commit (&CACTUS_MPI_COMPLEX8);
+# endif
+# ifdef HAVE_CCTK_REAL8
+ MPI_Type_contiguous (2, CACTUS_MPI_REAL8, &CACTUS_MPI_COMPLEX16);
+ MPI_Type_commit (&CACTUS_MPI_COMPLEX16);
+# endif
+# ifdef HAVE_CCTK_REAL16
+ MPI_Type_contiguous (2, CACTUS_MPI_REAL16, &CACTUS_MPI_COMPLEX32);
+ MPI_Type_commit (&CACTUS_MPI_COMPLEX32);
+# endif
+#endif
+
+#ifndef CCTK_MPI
+ switch (sizeof(int)) {
+#ifdef HAVE_CCTK_INT1
+ case sizeof(CCTK_INT1): MPI_INT = CCTK_VARIABLE_INT1; break;
+#endif
+#ifdef HAVE_CCTK_INT2
+ case sizeof(CCTK_INT2): MPI_INT = CCTK_VARIABLE_INT2; break;
+#endif
+#ifdef HAVE_CCTK_INT4
+ case sizeof(CCTK_INT4): MPI_INT = CCTK_VARIABLE_INT4; break;
+#endif
+#ifdef HAVE_CCTK_INT8
+ case sizeof(CCTK_INT8): MPI_INT = CCTK_VARIABLE_INT8; break;
+#endif
+ default: assert(0);
+ }
+#endif
+
+ return 0;
+}
+
+
+
+// Normalise a Cactus datatype
+static int
+normal_type (int cactustype)
+{
+ switch (cactustype) {
+ case CCTK_VARIABLE_INT:
+#ifdef CCTK_INTEGER_PRECISION_1
+ return CCTK_VARIABLE_INT1;
+#endif
+#ifdef CCTK_INTEGER_PRECISION_2
+ return CCTK_VARIABLE_INT2;
+#endif
+#ifdef CCTK_INTEGER_PRECISION_4
+ return CCTK_VARIABLE_INT4;
+#endif
+#ifdef CCTK_INTEGER_PRECISION_8
+ return CCTK_VARIABLE_INT8;
+#endif
+ assert (0);
+ case CCTK_VARIABLE_REAL:
+#ifdef CCTK_REAL_PRECISION_4
+ return CCTK_VARIABLE_REAL4;
+#endif
+#ifdef CCTK_REAL_PRECISION_8
+ return CCTK_VARIABLE_REAL8;
+#endif
+#ifdef CCTK_REAL_PRECISION_16
+ return CCTK_VARIABLE_REAL16;
+#endif
+ assert (0);
+ case CCTK_VARIABLE_COMPLEX:
+#ifdef CCTK_REAL_PRECISION_4
+ return CCTK_VARIABLE_COMPLEX8;
+#endif
+#ifdef CCTK_REAL_PRECISION_8
+ return CCTK_VARIABLE_COMPLEX16;
+#endif
+#ifdef CCTK_REAL_PRECISION_16
+ return CCTK_VARIABLE_COMPLEX32;
+#endif
+ assert (0);
+ }
+ return cactustype;
+}
+
+
+
+// Find the MPI datatype corresponding to a Cactus datatype
+static MPI_Datatype
+mpi_type (int const cactustype)
+{
+ int const normaltype = normal_type (cactustype);
+ switch (normaltype) {
+ case CCTK_VARIABLE_BYTE: return CACTUS_MPI_BYTE;
+#ifdef HAVE_CCTK_INT1
+ case CCTK_VARIABLE_INT1: return CACTUS_MPI_INT1;
+#endif
+#ifdef HAVE_CCTK_INT2
+ case CCTK_VARIABLE_INT2: return CACTUS_MPI_INT2;
+#endif
+#ifdef HAVE_CCTK_INT4
+ case CCTK_VARIABLE_INT4: return CACTUS_MPI_INT4;
+#endif
+#ifdef HAVE_CCTK_INT8
+ case CCTK_VARIABLE_INT8: return CACTUS_MPI_INT8;
+#endif
+#ifdef HAVE_CCTK_REAL4
+ case CCTK_VARIABLE_REAL4: return CACTUS_MPI_REAL4;
+ case CCTK_VARIABLE_COMPLEX8: return CACTUS_MPI_COMPLEX8;
+#endif
+#ifdef HAVE_CCTK_REAL8
+ case CCTK_VARIABLE_REAL8: return CACTUS_MPI_REAL8;
+ case CCTK_VARIABLE_COMPLEX16: return CACTUS_MPI_COMPLEX16;
+#endif
+#ifdef HAVE_CCTK_REAL16
+ case CCTK_VARIABLE_REAL16: return CACTUS_MPI_REAL16;
+ case CCTK_VARIABLE_COMPLEX32: return CACTUS_MPI_COMPLEX32;
+#endif
+ }
+ assert (0);
+ CCTK_WARN (0, "internal error");
+ return CACTUS_MPI_BYTE; // not reached
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+struct bbox {
+ int off; // offset
+ int len; // length
+ int str; // stride
+};
+
+struct arrays {
+ bbox global; // global region (all processors)
+ bbox local; // this processor's region
+ bbox active; // ???
+ bbox slab; // the slab that should be transferred
+};
+
+struct xfer {
+ arrays src; // source region
+ arrays dst; // destination region
+ int xpose; // exchange axes
+ int flip; // change directions of axes
+};
+
+
+
+static inline int
+roundup (int const x, int const y)
+{
+ assert (x >= 0);
+ assert (y > 0);
+ return (x + y - 1) / y * y;
+}
+
+
+
+static void
+bbox_print (bbox const * restrict const bbox)
+{
+ assert (bbox);
+ printf
+ ("[%d:%d:%d]",
+ bbox->off, bbox->off + (bbox->len - 1) * bbox->str, bbox->str);
+}
+
+static void
+bbox_check (bbox const * restrict const bbox)
+{
+ assert (bbox);
+ assert (bbox->len >= 0);
+ assert (bbox->str > 0);
+}
+
+static void
+global2bbox (slabinfo const * restrict const slab,
+ bbox * restrict const bbox)
+{
+ assert (slab);
+ assert (bbox);
+ assert (slab->gsh >= 0);
+ bbox->off = 0;
+ bbox->len = slab->gsh;
+ bbox->str = 1;
+ bbox_check (bbox);
+}
+
+static void
+local2bbox (slabinfo const * restrict const slab,
+ bbox * restrict const bbox)
+{
+ assert (slab);
+ assert (bbox);
+ assert (slab->lbnd >= 0);
+ assert (slab->lsh >= 0);
+ assert (slab->lbnd + slab->lsh <= slab->gsh);
+ bbox->off = slab->lbnd;
+ bbox->len = slab->lsh;
+ bbox->str = 1;
+ bbox_check (bbox);
+}
+
+static void
+active2bbox (slabinfo const * restrict const slab,
+ bbox * restrict const bbox,
+ int const useghosts)
+{
+ assert (slab);
+ assert (bbox);
+ assert (useghosts == 0 or useghosts == 1);
+ assert (slab->lbnd >= 0);
+ assert (slab->lsh >= 0);
+ assert (slab->lbnd + slab->lsh <= slab->gsh);
+ assert (slab->lbbox == 0 or slab->lbbox == 1);
+ assert (slab->ubbox == 0 or slab->ubbox == 1);
+ assert (slab->nghostzones >= 0);
+ int const nlghostzones = slab->lbbox or useghosts ? 0 : slab->nghostzones;
+ int const nughostzones = slab->ubbox or useghosts ? 0 : slab->nghostzones;
+ bbox->off = slab->lbnd + nlghostzones;
+ bbox->len = slab->lsh - nlghostzones - nughostzones;
+ bbox->str = 1;
+ bbox_check (bbox);
+}
+
+static void
+slab2bbox (slabinfo const * restrict const slab,
+ bbox * restrict const bbox)
+{
+ assert (slab);
+ assert (bbox);
+ bbox->off = slab->off;
+ bbox->len = slab->len;
+ bbox->str = slab->str;
+ bbox_check (bbox);
+}
+
+static int
+bbox_iscontained (bbox const * restrict const inner,
+ bbox const * restrict const outer)
+{
+ bbox_check (inner);
+ bbox_check (outer);
+ int const inner_last = inner->off + (inner->len - 1) * inner->str;
+ int const outer_last = outer->off + (outer->len - 1) * outer->str;
+ return inner->off >= outer->off and inner_last <= outer_last;
+}
+
+static void
+bbox_clip (bbox * restrict const inner,
+ bbox const * restrict const outer)
+{
+ bbox_check (inner);
+ bbox_check (outer);
+ int inner_last = inner->off + (inner->len - 1) * inner->str;
+ int const outer_last = outer->off + (outer->len - 1) * outer->str;
+ if (inner->off < outer->off) {
+ inner->off += roundup (outer->off - inner->off, inner->str);
+ }
+ if (inner_last > outer_last) {
+ inner_last -= roundup (inner_last - outer_last, inner->str);
+ }
+ assert ((inner_last - inner->off) % inner->str == 0);
+ if (inner_last >= inner->off) {
+ inner->len = (inner_last - inner->off + inner->str) / inner->str;
+ } else {
+ inner->len = 0;
+ }
+ bbox_check (inner);
+}
+
+// ydst = xdst + Flip (ysrc - xsrc)]
+// This function is its own inverse.
+static void
+bbox_xform (bbox * restrict const ydst,
+ bbox const * restrict const ysrc,
+ bbox const * restrict const xdst,
+ bbox const * restrict const xsrc,
+ int const flip)
+{
+ assert (ydst);
+ bbox_check (ysrc);
+ bbox_check (xdst);
+ bbox_check (xsrc);
+ assert (ysrc->str == xsrc->str);
+ /* int const xsrc_last = xsrc->off + (xsrc->len - 1) * xsrc->str; */
+ int const xdst_last = xdst->off + (xdst->len - 1) * xdst->str;
+ int const ysrc_last = ysrc->off + (ysrc->len - 1) * ysrc->str;
+ ydst->str = xdst->str;
+ assert ((ysrc->off - xsrc->off) % ysrc->str == 0);
+ ydst->off = xdst->off + (ysrc->off - xsrc->off) / ysrc->str * ydst->str;
+ int ydst_last =
+ xdst->off + (ysrc_last - xsrc->off) / ysrc->str * ydst->str;
+ if (flip) {
+ int const off = ydst->off;
+ int const last = ydst_last;
+ ydst->off = xdst->off + xdst_last - last;
+ ydst_last = xdst_last - (off - xdst->off);
+ }
+ assert ((ysrc_last - xsrc->off) % ysrc->str == 0);
+ assert (ydst_last - ydst->off + ydst->str >= 0);
+ ydst->len = (ydst_last - ydst->off + ydst->str) / ydst->str;
+ bbox_check (ydst);
+}
+
+
+
+extern "C"
+void
+print_slabinfo (FILE * const out,
+ slabinfo const * restrict const slabinfo)
+{
+ fprintf (out, " gsh: %d\n", slabinfo->gsh);
+ fprintf (out, " lbnd: %d, lsh: %d\n", slabinfo->lbnd, slabinfo->lsh);
+ fprintf (out, " lbbox: %d, ubbox: %d, nghostzones: %d\n",
+ slabinfo->lbbox, slabinfo->ubbox, slabinfo->nghostzones);
+ fprintf (out, " off: %d, str: %d, len: %d\n",
+ slabinfo->off, slabinfo->str, slabinfo->len);
+}
+
+extern "C"
+void
+print_xferinfo (FILE * const out,
+ xferinfo const * restrict const xferinfo)
+{
+ fprintf (out, " src:\n");
+ print_slabinfo (out, & xferinfo->src);
+ fprintf (out, " dst:\n");
+ print_slabinfo (out, & xferinfo->dst);
+ fprintf (out, " xpose: %d\n", xferinfo->xpose);
+ fprintf (out, " flip: %d\n", xferinfo->flip);
+}
+
+
+
+extern "C"
+int
+Slab_MultiTransfer (cGH const * restrict const cctkGH,
+ int const dim,
+ xferinfo const * restrict const xferinfo,
+ int const options,
+ int const nvars,
+ int const * restrict const srctypes,
+ void const * restrict const * restrict const srcptrs,
+ int const * restrict const dsttypes,
+ void * restrict const * restrict const dstptrs)
+{
+ DECLARE_CCTK_PARAMETERS;
+
+ // Check arguments
+ check (cctkGH);
+ check (dim >= 0);
+ check (xferinfo);
+ check (nvars >= 0);
+ check (nvars==0 or srctypes);
+ for (int var=0; var<nvars; ++var) check (srctypes[var] >= 0);
+ check (nvars==0 or srcptrs);
+ // for (int var=0; var<nvars; ++var) check (srcptrs[var]);
+ check (nvars==0 or dsttypes);
+ for (int var=0; var<nvars; ++var) check (dsttypes[var] >= 0);
+ check (nvars==0 or dstptrs);
+ // for (int var=0; var<nvars; ++var) check (dstptrs[var]);
+
+ if (nvars==0) return 0;
+
+ CCTK_TimerStartI (timer_init);
+
+ bool useghosts;
+ {
+ CCTK_INT tmp;
+ int const iret = Util_TableGetInt (options, &tmp, "useghosts");
+ if (iret == 1) {
+ // There was an entry, use it
+ useghosts = tmp;
+ } else if (iret == UTIL_ERROR_BAD_HANDLE or
+ iret == UTIL_ERROR_TABLE_NO_SUCH_KEY)
+ {
+ // There was no entry, use a default
+ useghosts = false;
+ } else {
+ // Something went wrong, abort
+ check (0);
+ }
+ }
+
+ check (dim <= SLAB_MAXDIM);
+ vector<xfer> info (SLAB_MAXDIM);
+ for (int d=0; d<dim; ++d) {
+ global2bbox (&xferinfo[d].src, &info[d].src.global);
+ local2bbox (&xferinfo[d].src, &info[d].src.local);
+ active2bbox (&xferinfo[d].src, &info[d].src.active, useghosts);
+ slab2bbox (&xferinfo[d].src, &info[d].src.slab);
+ check (bbox_iscontained (&info[d].src.active, &info[d].src.local));
+ check (bbox_iscontained (&info[d].src.local, &info[d].src.global));
+
+ global2bbox (&xferinfo[d].dst, &info[d].dst.global);
+ local2bbox (&xferinfo[d].dst, &info[d].dst.local);
+ active2bbox (&xferinfo[d].dst, &info[d].dst.active, 1); // fill ghosts
+ slab2bbox (&xferinfo[d].dst, &info[d].dst.slab);
+ check (bbox_iscontained (&info[d].dst.active, &info[d].dst.local));
+ check (bbox_iscontained (&info[d].dst.local, &info[d].dst.global));
+
+ info[d].xpose = xferinfo[d].xpose;
+ check (info[d].xpose >= 0 and info[d].xpose < dim);
+ info[d].flip = xferinfo[d].flip;
+ check (info[d].flip == 0 or info[d].flip == 1);
+ }
+ for (int d=dim; d<SLAB_MAXDIM; ++d) {
+ static bbox const fake_bbox = { 0, 1, 1 };
+ static arrays const fake_arrays =
+ { { 0, 1, 1 }, { 0, 1, 1 }, { 0, 1, 1 }, { 0, 1, 1 } };
+
+ bbox_check (&fake_bbox);
+
+ info[d].src = fake_arrays;
+ check (bbox_iscontained (&info[d].src.active, &info[d].src.local));
+ check (bbox_iscontained (&info[d].src.local, &info[d].src.global));
+
+ info[d].dst = fake_arrays;
+ check (bbox_iscontained (&info[d].dst.active, &info[d].dst.local));
+ check (bbox_iscontained (&info[d].dst.local, &info[d].dst.global));
+
+ info[d].xpose = d;
+ check (info[d].xpose >= 0 and info[d].xpose < SLAB_MAXDIM);
+ info[d].flip = 0;
+ check (info[d].flip == 0 or info[d].flip == 1);
+ }
+
+ ifcheck {
+ ifdebug printf ("srcinfo:\n");
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ printf (" src.global d=%d ", d);
+ bbox_print (&info[d].src.global);
+ printf ("\n");
+ printf (" src.local d=%d ", d);
+ bbox_print (&info[d].src.local);
+ printf ("\n");
+ printf (" src.active d=%d ", d);
+ bbox_print (&info[d].src.active);
+ printf ("\n");
+ printf (" src.slab d=%d ", d);
+ bbox_print (&info[d].src.slab);
+ printf ("\n");
+ }
+ ifdebug printf ("dstinfo:\n");
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ printf (" dst.global d=%d ", d);
+ bbox_print (&info[d].dst.global);
+ printf ("\n");
+ printf (" dst.local d=%d ", d);
+ bbox_print (&info[d].dst.local);
+ printf ("\n");
+ printf (" dst.active d=%d ", d);
+ bbox_print (&info[d].dst.active);
+ printf ("\n");
+ printf (" dst.slab d=%d ", d);
+ bbox_print (&info[d].dst.slab);
+ printf ("\n");
+ }
+ ifdebug printf ("info:\n");
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ printf (" xpose d=%d %d\n", d, info[d].xpose);
+ printf (" flip d=%d %d\n", d, info[d].flip);
+ }
+ }
+
+ {
+ bool iflag[SLAB_MAXDIM];
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ iflag[d] = false;
+ }
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ assert (not iflag[info[d].xpose]);
+ iflag[info[d].xpose] = true;
+ }
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ assert (iflag[d]);
+ }
+ // Allow non-contributing processors to be non-knowledgeable
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ if (info[info[d].xpose].src.slab.len and info[d].dst.slab.len > 0) {
+ assert (info[info[d].xpose].src.slab.len == info[d].dst.slab.len);
+ }
+ }
+ }
+
+ size_t srclentot = 1;
+ size_t dstlentot = 1;
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ srclentot *= info[d].src.local.len;
+ dstlentot *= info[d].dst.local.len;
+ }
+
+ // Check arguments (continued)
+ for (int var=0; var<nvars; ++var) if (srclentot > 0) assert (srcptrs[var]);
+ for (int var=0; var<nvars; ++var) if (dstlentot > 0) assert (dstptrs[var]);
+
+
+
+ MPI_Comm comm;
+ {
+ CCTK_POINTER_TO_CONST tmp1;
+ int const iret1 = Util_TableGetPointerToConst (options, &tmp1, "comm");
+ if (iret1 == 1) {
+ // There was an entry, use it
+ comm = * (MPI_Comm const *) tmp1;
+ } else if (iret1 == UTIL_ERROR_TABLE_WRONG_DATA_TYPE) {
+ // Entry has wront type, fall back
+ CCTK_POINTER tmp2;
+ int const iret2 = Util_TableGetPointer (options, &tmp2, "comm");
+ if (iret2 == 1) {
+ // There was an entry, use it
+ comm = * (MPI_Comm const *) tmp2;
+ } else {
+ // Something went wrong, abort
+ check (0);
+ }
+ } else if (iret1 == UTIL_ERROR_BAD_HANDLE or
+ iret1 == UTIL_ERROR_TABLE_NO_SUCH_KEY)
+ {
+ // There was no entry, use a default
+ comm = get_mpi_comm (cctkGH);
+ } else {
+ // Something went wrong, abort
+ check (0);
+ }
+ }
+
+ ifcheck {
+ ifdebug fflush (stdout);
+ MPI_Barrier (comm);
+ }
+
+ int size, rank;
+ MPI_Comm_size (comm, &size);
+ MPI_Comm_rank (comm, &rank);
+
+ ifcheck {
+ static int count = 424242;
+ int mycount = count;
+ ifdebug fflush (stdout);
+ MPI_Bcast (&mycount, 1, MPI_INT, 0, comm);
+ assert (mycount == count);
+ ++ count;
+ }
+
+
+
+ vector<xfer> allinfo (size * SLAB_MAXDIM);
+ {
+ int const info_nints = sizeof(xfer) / sizeof(int);
+ ifdebug fflush (stdout);
+ MPI_Allgather
+ (&info.front(), SLAB_MAXDIM * info_nints, MPI_INT,
+ &allinfo.front(), SLAB_MAXDIM * info_nints, MPI_INT, comm);
+ }
+
+ for (int n = 0; n < size; ++n) {
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ // Allow non-contributing processors to be non-knowledgeable
+ if (allinfo[n*SLAB_MAXDIM+d].src.slab.len > 0 and
+ info[d].src.slab.len > 0)
+ {
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].src.global.off == info[d].src.global.off);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].src.global.len == info[d].src.global.len);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].src.global.str == info[d].src.global.str);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].src.local.str == info[d].src.local.str);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].src.active.str == info[d].src.active.str);
+ // 2003-03-01 eschnett: I don't know why the following should
+ // be necessary
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].src.slab.str == info[d].src.slab.str);
+ }
+ if (allinfo[n*SLAB_MAXDIM+d].dst.slab.len > 0 and
+ info[d].dst.slab.len > 0)
+ {
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].dst.global.off == info[d].dst.global.off);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].dst.global.len == info[d].dst.global.len);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].dst.global.str == info[d].dst.global.str);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].dst.local.str == info[d].dst.local.str);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].dst.active.str == info[d].dst.active.str);
+ assert
+ (allinfo[n*SLAB_MAXDIM+d].dst.slab.str == info[d].dst.slab.str);
+ }
+ assert (allinfo[n*SLAB_MAXDIM+d].xpose == info[d].xpose);
+ assert (allinfo[n*SLAB_MAXDIM+d].flip == info[d].flip);
+ }
+ }
+
+
+
+ vector<bbox> srcdetail (size * SLAB_MAXDIM);
+ for (int n = 0; n < size; ++n) {
+ ifdebug printf ("srcdetail n=%d:\n", n);
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ srcdetail[n*SLAB_MAXDIM+d] = allinfo[n*SLAB_MAXDIM+d].src.slab;
+ ifdebug printf (" src.slab d=%d ", d);
+ ifdebug bbox_print (&srcdetail[n*SLAB_MAXDIM+d]);
+ ifdebug printf ("\n");
+ bbox_clip (&srcdetail[n*SLAB_MAXDIM+d], &info[d].src.active);
+ ifdebug printf (" clipped with src.active d=%d ", d);
+ ifdebug bbox_print (&srcdetail[n*SLAB_MAXDIM+d]);
+ ifdebug printf ("\n");
+ }
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ bbox whereto;
+ bbox wherefrom;
+ whereto = allinfo[n*SLAB_MAXDIM+d].dst.slab;
+ ifdebug printf (" dst.slab d=%d ", info[d].xpose);
+ ifdebug bbox_print (&whereto);
+ ifdebug printf ("\n");
+ bbox_clip (&whereto, &allinfo[n*SLAB_MAXDIM+d].dst.active);
+ ifdebug printf (" whereto d=%d ", info[d].xpose);
+ ifdebug bbox_print (&whereto);
+ ifdebug printf ("\n");
+ bbox_xform
+ (&wherefrom, &whereto,
+ &allinfo[n*SLAB_MAXDIM+info[d].xpose].src.slab,
+ &allinfo[n*SLAB_MAXDIM+d].dst.slab,
+ info[d].flip);
+ ifdebug printf (" wherefrom d=%d ", info[d].xpose);
+ ifdebug bbox_print (&wherefrom);
+ ifdebug printf ("\n");
+ bbox_clip (&srcdetail[n*SLAB_MAXDIM+info[d].xpose], &wherefrom);
+ ifdebug printf (" clipped with wherefrom d=%d ", info[d].xpose);
+ ifdebug bbox_print (&srcdetail[n*SLAB_MAXDIM+info[d].xpose]);
+ ifdebug printf ("\n");
+ }
+ }
+
+ vector<int> srcelems (size);
+ vector<int> srccount (size);
+ vector<int> srcoffset (size + 1);
+
+
+
+ vector<bbox> dstdetail (size * SLAB_MAXDIM);
+ for (int n = 0; n < size; ++n) {
+ ifdebug printf ("dstdetail n=%d:\n", n);
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ // dstdetail[n*SLAB_MAXDIM+d] = allinfo[n*SLAB_MAXDIM+d].dst.slab;
+ dstdetail[n*SLAB_MAXDIM+d] = info[d].dst.slab;
+ ifdebug printf (" dst.slab d=%d ", d);
+ ifdebug bbox_print (&dstdetail[n*SLAB_MAXDIM+d]);
+ ifdebug printf ("\n");
+ bbox_clip (&dstdetail[n*SLAB_MAXDIM+d], &info[d].dst.active);
+ ifdebug printf (" clipped with dst.active d=%d ", d);
+ ifdebug bbox_print (&dstdetail[n*SLAB_MAXDIM+d]);
+ ifdebug printf ("\n");
+ }
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ bbox wherefrom;
+ bbox whereto;
+ // wherefrom = allinfo[n*SLAB_MAXDIM+info[d].xpose].src.slab;
+ wherefrom = info[info[d].xpose].src.slab;
+ ifdebug printf (" src.slab d=%d ", d);
+ // ifdebug bbox_print (&dstdetail[n*SLAB_MAXDIM+d]);
+ ifdebug bbox_print (&wherefrom);
+ ifdebug printf ("\n");
+ bbox_clip (&wherefrom, &allinfo[n*SLAB_MAXDIM+info[d].xpose].src.active);
+ ifdebug printf (" wherefrom d=%d ", d);
+ // ifdebug bbox_print (&dstdetail[n*SLAB_MAXDIM+d]);
+ ifdebug bbox_print (&wherefrom);
+ ifdebug printf ("\n");
+ bbox_xform
+ (&whereto, &wherefrom,
+ &allinfo[n*SLAB_MAXDIM+d].dst.slab,
+ // &allinfo[n*SLAB_MAXDIM+info[d].xpose].src.slab,
+ &info[info[d].xpose].src.slab,
+ info[d].flip);
+ ifdebug printf (" whereto d=%d ", d);
+ // ifdebug bbox_print (&dstdetail[n*SLAB_MAXDIM+d]);
+ ifdebug bbox_print (&whereto);
+ ifdebug printf ("\n");
+ bbox_clip (&dstdetail[n*SLAB_MAXDIM+d], &whereto);
+ ifdebug printf (" clipped with whereto d=%d ", d);
+ ifdebug bbox_print (&dstdetail[n*SLAB_MAXDIM+d]);
+ ifdebug printf ("\n");
+ }
+ }
+
+ vector<int> dstelems (size);
+ vector<int> dstcount (size);
+ vector<int> dstoffset (size + 1);
+
+ CCTK_TimerStopI (timer_init);
+
+
+
+ int nvartypes = 0;
+ vector<int> vartypes (nvars);
+ vector<int> vartypecount (nvars);
+ for (int var=0; var<nvars; ++var) {
+ int const srctype = srctypes[var];
+ int const dsttype = dsttypes[var];
+ check (srctype == dsttype);
+ int vartypei;
+ for (vartypei=0; vartypei<nvartypes; ++vartypei) {
+ if (srctype == vartypes[vartypei]) break;
+ }
+ if (vartypei>=nvartypes) {
+ vartypes[nvartypes] = srctype;
+ vartypecount[nvartypes] = 0;
+ ++nvartypes;
+ }
+ assert (vartypei<nvartypes);
+ ++ vartypecount[vartypei];
+ }
+
+ for (int vartypei=0; vartypei<nvartypes; ++vartypei) {
+ int const vartype = vartypes[vartypei];
+ int nvaris = 0;
+ vector<int> varis (nvars);
+ for (int var=0; var<nvars; ++var) {
+ if (srctypes[var] == vartype) {
+ varis[nvaris] = var;
+ ++nvaris;
+ }
+ }
+ assert (nvaris == vartypecount[vartypei]);
+ assert (nvaris > 0);
+
+ int const vartypesize = CCTK_VarTypeSize (vartype);
+ check (vartypesize > 0);
+ MPI_Datatype const vardatatype = mpi_type (vartype);
+
+
+
+ srcoffset[0] = 0;
+ for (int n = 0; n < size; ++n) {
+ srcelems[n] = 1;
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ srcelems[n] *= srcdetail[n*SLAB_MAXDIM+d].len;
+ }
+ srccount[n] = nvaris * srcelems[n];
+ ifdebug printf
+ ("srccnt n=%d offset=%d count=%d\n", n, srcoffset[n], srccount[n]);
+ srcoffset[n+1] = srcoffset[n] + srccount[n];
+ }
+ vector<char> srcdata (srcoffset[size] * vartypesize);
+ ifcheck {
+ if (vartype == CCTK_VARIABLE_REAL) {
+ CCTK_REAL * restrict const srcdataptr = (CCTK_REAL *)&srcdata.front();
+ CCTK_REAL marker;
+ memset (&marker, POISON_VALUE, sizeof marker);
+ for (int i = 0; i < srcoffset[size]; ++i) {
+ memcpy (&srcdataptr[i], &marker, sizeof marker);
+ }
+ }
+ }
+
+
+
+ dstoffset[0] = 0;
+ for (int n = 0; n < size; ++n) {
+ dstelems[n] = 1;
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ dstelems[n] *= dstdetail[n*SLAB_MAXDIM+d].len;
+ }
+ dstcount[n] = nvaris * dstelems[n];
+ ifdebug printf
+ ("dstcnt n=%d offset=%d count=%d\n", n, dstoffset[n], dstcount[n]);
+ dstoffset[n+1] = dstoffset[n] + dstcount[n];
+ }
+ vector<char> dstdata (dstoffset[size] * vartypesize);
+ ifcheck {
+ if (vartype == CCTK_VARIABLE_REAL) {
+ CCTK_REAL * restrict const dstdataptr = (CCTK_REAL *)&dstdata.front();
+ CCTK_REAL marker;
+ memset (&marker, POISON_VALUE, sizeof marker);
+ for (int i = 0; i < dstoffset[size]; ++i) {
+ memcpy (&dstdataptr[i], &marker, sizeof marker);
+ }
+ }
+ }
+
+ check (srccount[rank] == dstcount[rank]);
+
+
+
+ ifcheck {
+ vector<int> src2count (size);
+ vector<int> dst2count (size);
+ ifdebug fflush (stdout);
+ MPI_Alltoall
+ (&srccount.front(), 1, MPI_INT, &src2count.front(), 1, MPI_INT, comm);
+ MPI_Alltoall
+ (&dstcount.front(), 1, MPI_INT, &dst2count.front(), 1, MPI_INT, comm);
+ for (int n = 0; n < size; ++n) {
+ check (src2count[n] == dstcount[n]);
+ check (dst2count[n] == srccount[n]);
+ }
+ }
+
+
+
+ CCTK_TimerStartI (timer_copy_in);
+
+ for (int n = 0; n < size; ++n) {
+ check (SLAB_MAXDIM == 3);
+
+ if (info[0].xpose==0 and info[1].xpose==1 and info[2].xpose==2 and
+ srcdetail[n*SLAB_MAXDIM ].str==1 and
+ srcdetail[n*SLAB_MAXDIM+1].str==1 and
+ srcdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: no transposing
+
+ int const srcoffi = info[0].src.local.off;
+ int const srcoffj = info[1].src.local.off;
+ int const srcoffk = info[2].src.local.off;
+
+ int const srcleni = info[0].src.local.len;
+ int const srclenj = info[1].src.local.len;
+ int const srclenk = info[2].src.local.len;
+
+ int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off;
+ int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off;
+ int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off;
+
+ int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len;
+ int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len;
+ int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len;
+
+ if (n==0) assert (srcoffset[n]==0);
+ // TODO: This does not take nvaris into account
+ // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk);
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const srcdataptr =
+ (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
+ CCTK_REAL const * restrict const srcptr =
+ (CCTK_REAL const *)srcptrs[varis[vari]];
+
+# pragma omp parallel for
+ for (int k = 0; k < srcdetaillenk; ++k) {
+ for (int j = 0; j < srcdetaillenj; ++j) {
+ for (int i = 0; i < srcdetailleni; ++i) {
+ int const srcindi = srcdetailoffi + i - srcoffi;
+ int const srcindj = srcdetailoffj + j - srcoffj;
+ int const srcindk = srcdetailoffk + k - srcoffk;
+ ifcheck assert (srcindi>=0 and srcindi<srcleni);
+ ifcheck assert (srcindj>=0 and srcindj<srclenj);
+ ifcheck assert (srcindk>=0 and srcindk<srclenk);
+ size_t const srcind =
+ srcindi + srcleni * (srcindj + srclenj * srcindk);
+ size_t const bufind =
+ i + srcdetailleni * (j + srcdetaillenj * k);
+ srcdataptr[bufind] = srcptr[srcind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else if (info[0].xpose==1 and info[1].xpose==0 and info[2].xpose==2 and
+ srcdetail[n*SLAB_MAXDIM ].str==1 and
+ srcdetail[n*SLAB_MAXDIM+1].str==1 and
+ srcdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: transpose x and y
+
+ int const srcoffi = info[0].src.local.off;
+ int const srcoffj = info[1].src.local.off;
+ int const srcoffk = info[2].src.local.off;
+
+ int const srcleni = info[0].src.local.len;
+ int const srclenj = info[1].src.local.len;
+ int const srclenk = info[2].src.local.len;
+
+ int const srcdetailoffi = srcdetail[n*SLAB_MAXDIM+0].off;
+ int const srcdetailoffj = srcdetail[n*SLAB_MAXDIM+1].off;
+ int const srcdetailoffk = srcdetail[n*SLAB_MAXDIM+2].off;
+
+ int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+0].len;
+ int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+1].len;
+ int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+2].len;
+
+ if (n==0) assert (srcoffset[n]==0);
+ // TODO: This does not take nvaris into account
+ // if (n<size-1) assert (srcoffset[n+1]==srcoffset[n]+srcdetailleni*srcdetaillenj*srcdetaillenk);
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const srcdataptr =
+ (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
+ CCTK_REAL const * restrict const srcptr =
+ (CCTK_REAL const *)srcptrs[varis[vari]];
+
+# pragma omp parallel for
+ for (int k = 0; k < srcdetaillenk; ++k) {
+ for (int j = 0; j < srcdetaillenj; ++j) {
+ for (int i = 0; i < srcdetailleni; ++i) {
+ int const srcindi = srcdetailoffi + i - srcoffi;
+ int const srcindj = srcdetailoffj + j - srcoffj;
+ int const srcindk = srcdetailoffk + k - srcoffk;
+ ifcheck assert (srcindi>=0 and srcindi<srcleni);
+ ifcheck assert (srcindj>=0 and srcindj<srclenj);
+ ifcheck assert (srcindk>=0 and srcindk<srclenk);
+ size_t const srcind =
+ srcindi + srcleni * (srcindj + srclenj * srcindk);
+ size_t const bufind =
+ j + srcdetaillenj * (i + srcdetailleni * k);
+ srcdataptr[bufind] = srcptr[srcind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else if (srcdetail[n*SLAB_MAXDIM ].str==1 and
+ srcdetail[n*SLAB_MAXDIM+1].str==1 and
+ srcdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for CCTK_REAL and stride 1
+
+ int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+info[0].xpose].len;
+ int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+info[1].xpose].len;
+ int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+info[2].xpose].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const srcdataptr =
+ (CCTK_REAL *)&srcdata.front() + srcoffset[n] + vari * srcelems[n];
+ CCTK_REAL const * restrict const srcptr =
+ (CCTK_REAL const *)srcptrs[varis[vari]];
+
+# pragma omp parallel for
+ for (int k = 0; k < srcdetaillenk; ++k) {
+ for (int j = 0; j < srcdetaillenj; ++j) {
+ for (int i = 0; i < srcdetailleni; ++i) {
+ int ipos[SLAB_MAXDIM];
+ ipos[0] = i;
+ ipos[1] = j;
+ ipos[2] = k;
+ int srcipos[SLAB_MAXDIM];
+ int bufipos[SLAB_MAXDIM];
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ int const c = info[d].xpose;
+ srcipos[c] = srcdetail[n*SLAB_MAXDIM+c].off + ipos[d];
+ assert (srcipos[c] >= info[c].src.local.off and
+ srcipos[c] < info[c].src.local.off + info[c].src.local.len);
+ assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off and
+ srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1));
+ bufipos[d] = ipos[d];
+ assert (bufipos[d] >= 0 and
+ bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len);
+ }
+ size_t srcind = 0;
+ size_t bufind = 0;
+ for (int d=SLAB_MAXDIM-1; d>=0; --d) {
+ int const c = info[d].xpose;
+ srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off;
+ bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d];
+ }
+ assert (srcind < srclentot);
+ assert (bufind < (size_t)srccount[n]);
+ srcdataptr[bufind] = srcptr[srcind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else {
+ // Generic, unoptimised version
+
+ int const srcdetailleni = srcdetail[n*SLAB_MAXDIM+info[0].xpose].len;
+ int const srcdetaillenj = srcdetail[n*SLAB_MAXDIM+info[1].xpose].len;
+ int const srcdetaillenk = srcdetail[n*SLAB_MAXDIM+info[2].xpose].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ char * restrict const srcdataptr =
+ &srcdata[vartypesize * (srcoffset[n] + vari * srcelems[n])];
+ char const * restrict const srcptr =
+ (char const *)srcptrs[varis[vari]];
+
+# pragma omp parallel for
+ for (int k = 0; k < srcdetaillenk; ++k) {
+ for (int j = 0; j < srcdetaillenj; ++j) {
+ for (int i = 0; i < srcdetailleni; ++i) {
+ int ipos[SLAB_MAXDIM];
+ ipos[0] = i;
+ ipos[1] = j;
+ ipos[2] = k;
+ int srcipos[SLAB_MAXDIM];
+ int bufipos[SLAB_MAXDIM];
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ int const c = info[d].xpose;
+ srcipos[c] = srcdetail[n*SLAB_MAXDIM+c].off + ipos[d] * srcdetail[n*SLAB_MAXDIM+c].str;
+ assert (srcipos[c] >= info[c].src.local.off and
+ srcipos[c] < info[c].src.local.off + info[c].src.local.len);
+ assert (srcipos[c] >= allinfo[n*SLAB_MAXDIM+c].src.slab.off and
+ srcipos[c] <= allinfo[n*SLAB_MAXDIM+c].src.slab.off + (allinfo[n*SLAB_MAXDIM+c].src.slab.len - 1) * allinfo[n*SLAB_MAXDIM+c].src.slab.str);
+ assert ((srcipos[c] - allinfo[n*SLAB_MAXDIM+c].src.slab.off) % allinfo[n*SLAB_MAXDIM+c].src.slab.str == 0);
+ bufipos[d] = ipos[d];
+ assert (bufipos[d] >= 0 and bufipos[d] < srcdetail[n*SLAB_MAXDIM+c].len);
+ }
+ size_t srcind = 0;
+ size_t bufind = 0;
+ for (int d=SLAB_MAXDIM-1; d>=0; --d) {
+ int const c = info[d].xpose;
+ srcind = srcind * info[d].src.local.len + srcipos[d] - info[d].src.local.off;
+ bufind = bufind * srcdetail[n*SLAB_MAXDIM+c].len + bufipos[d];
+ }
+ assert (srcind < srclentot);
+ assert (bufind < (size_t)srccount[n]);
+ memcpy (srcdataptr + vartypesize * bufind,
+ srcptr + vartypesize * srcind,
+ vartypesize);
+ }
+ }
+ }
+
+ } // for vari
+
+ }
+ } // for n
+
+ ifcheck {
+ if (vartype == CCTK_VARIABLE_REAL) {
+ CCTK_REAL const * restrict const srcdataptr =
+ (CCTK_REAL const *)&srcdata.front();
+ CCTK_REAL marker;
+ memset (&marker, POISON_VALUE, sizeof marker);
+ for (int i = 0; i < srcoffset[size]; ++i) {
+ assert (memcmp(&srcdataptr[i], &marker, sizeof marker) != 0);
+ }
+ }
+ }
+ CCTK_TimerStopI (timer_copy_in);
+
+
+
+ CCTK_TimerStartI (timer_xfer);
+ ifdebug fflush (stdout);
+ if (not HAVE_MPI or use_alltoallv) {
+ MPI_Alltoallv
+ (&srcdata.front(), &srccount.front(), &srcoffset.front(), vardatatype,
+ &dstdata.front(), &dstcount.front(), &dstoffset.front(), vardatatype,
+ comm);
+ } else {
+ vector<MPI_Request> requests (2 * size);
+ // Start receive
+ for (int n = 0; n < size; ++n) {
+ if (n != rank and dstcount[n] > 0) {
+ MPI_Irecv
+ (&dstdata[vartypesize * dstoffset[n]], dstcount[n], vardatatype,
+ n, 0, comm, &requests[n]);
+ } else {
+ requests[n] = MPI_REQUEST_NULL;
+ }
+ }
+ // Start send
+ for (int n = 0; n < size; ++n) {
+ if (n != rank and srccount[n] > 0) {
+ MPI_Isend
+ (&srcdata[vartypesize * srcoffset[n]], srccount[n], vardatatype,
+ n, 0, comm, &requests[size + n]);
+ } else {
+ requests[size + n] = MPI_REQUEST_NULL;
+ }
+ }
+ // Self communication
+ {
+ int const n = rank;
+ assert (dstcount[n] == srccount[n]);
+ memcpy (&dstdata[vartypesize * dstoffset[n]],
+ &srcdata[vartypesize * srcoffset[n]],
+ dstcount[n] * vartypesize);
+ }
+ // Wait
+ MPI_Waitall (2 * size, &requests.front(), MPI_STATUSES_IGNORE);
+ }
+
+ ifcheck {
+ if (vartype == CCTK_VARIABLE_REAL) {
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL const * restrict const dstdataptr =
+ (CCTK_REAL const *)&dstdata.front();
+ CCTK_REAL marker;
+ memset (&marker, POISON_VALUE, sizeof marker);
+ for (int i = 0; i < dstoffset[size]; ++i) {
+ assert (memcmp(&dstdataptr[i], &marker, sizeof marker) != 0);
+ }
+ }
+ }
+ }
+ CCTK_TimerStopI (timer_xfer);
+
+
+
+ CCTK_TimerStartI (timer_copy_back);
+ for (int n = 0; n < size; ++n) {
+ check (SLAB_MAXDIM == 3);
+
+ if (info[0].flip==0 and info[1].flip==0 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: no flipping
+
+ int const dstoffi = info[0].dst.local.off;
+ int const dstoffj = info[1].dst.local.off;
+ int const dstoffk = info[2].dst.local.off;
+
+ int const dstleni = info[0].dst.local.len;
+ int const dstlenj = info[1].dst.local.len;
+ int const dstlenk = info[2].dst.local.len;
+
+ int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
+ int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
+ int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
+ CCTK_REAL const * restrict const dstdataptr =
+ (CCTK_REAL const *)&dstdata.front() +
+ dstoffset[n] + vari * dstelems[n];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int const dstindi = dstdetailoffi + i - dstoffi;
+ int const dstindj = dstdetailoffj + j - dstoffj;
+ int const dstindk = dstdetailoffk + k - dstoffk;
+ ifcheck assert (dstindi>=0 and dstindi<dstleni);
+ ifcheck assert (dstindj>=0 and dstindj<dstlenj);
+ ifcheck assert (dstindk>=0 and dstindk<dstlenk);
+ size_t const dstind =
+ dstindi + dstleni * (dstindj + dstlenj * dstindk);
+ size_t const bufind =
+ i + dstdetailleni * (j + dstdetaillenj * k);
+ dstptr[dstind] = dstdataptr[bufind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else if (info[0].flip==1 and info[1].flip==0 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: flip in x direction
+
+ int const dstoffi = info[0].dst.local.off;
+ int const dstoffj = info[1].dst.local.off;
+ int const dstoffk = info[2].dst.local.off;
+
+ int const dstleni = info[0].dst.local.len;
+ int const dstlenj = info[1].dst.local.len;
+ int const dstlenk = info[2].dst.local.len;
+
+ int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
+ int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
+ int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
+ CCTK_REAL const * restrict const dstdataptr =
+ (CCTK_REAL const *)&dstdata.front() +
+ dstoffset[n] + vari * dstelems[n];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int const dstindi = dstdetailoffi + i - dstoffi;
+ int const dstindj = dstdetailoffj + j - dstoffj;
+ int const dstindk = dstdetailoffk + k - dstoffk;
+ ifcheck assert (dstindi>=0 and dstindi<dstleni);
+ ifcheck assert (dstindj>=0 and dstindj<dstlenj);
+ ifcheck assert (dstindk>=0 and dstindk<dstlenk);
+ size_t const dstind =
+ dstindi + dstleni * (dstindj + dstlenj * dstindk);
+ size_t const bufind =
+ (dstdetailleni - 1 - i) + dstdetailleni * (j + dstdetaillenj * k);
+ dstptr[dstind] = dstdataptr[bufind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else if (info[0].flip==0 and info[1].flip==1 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: flip in y direction
+
+ int const dstoffi = info[0].dst.local.off;
+ int const dstoffj = info[1].dst.local.off;
+ int const dstoffk = info[2].dst.local.off;
+
+ int const dstleni = info[0].dst.local.len;
+ int const dstlenj = info[1].dst.local.len;
+ int const dstlenk = info[2].dst.local.len;
+
+ int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
+ int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
+ int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
+ CCTK_REAL const * restrict const dstdataptr =
+ (CCTK_REAL const *)&dstdata.front() +
+ dstoffset[n] + vari * dstelems[n];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int const dstindi = dstdetailoffi + i - dstoffi;
+ int const dstindj = dstdetailoffj + j - dstoffj;
+ int const dstindk = dstdetailoffk + k - dstoffk;
+ ifcheck assert (dstindi>=0 and dstindi<dstleni);
+ ifcheck assert (dstindj>=0 and dstindj<dstlenj);
+ ifcheck assert (dstindk>=0 and dstindk<dstlenk);
+ size_t const dstind =
+ dstindi + dstleni * (dstindj + dstlenj * dstindk);
+ size_t const bufind =
+ i + dstdetailleni * ((dstdetaillenj - 1 - j) + dstdetaillenj * k);
+ dstptr[dstind] = dstdataptr[bufind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else if (info[0].flip==1 and info[1].flip==1 and info[2].flip==0 and
+ dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for a special case: flip in x and y direction
+
+ int const dstoffi = info[0].dst.local.off;
+ int const dstoffj = info[1].dst.local.off;
+ int const dstoffk = info[2].dst.local.off;
+
+ int const dstleni = info[0].dst.local.len;
+ int const dstlenj = info[1].dst.local.len;
+ int const dstlenk = info[2].dst.local.len;
+
+ int const dstdetailoffi = dstdetail[n*SLAB_MAXDIM+0].off;
+ int const dstdetailoffj = dstdetail[n*SLAB_MAXDIM+1].off;
+ int const dstdetailoffk = dstdetail[n*SLAB_MAXDIM+2].off;
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
+ CCTK_REAL const * restrict const dstdataptr =
+ (CCTK_REAL const *)&dstdata.front() +
+ dstoffset[n] + vari * dstelems[n];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int const dstindi = dstdetailoffi + i - dstoffi;
+ int const dstindj = dstdetailoffj + j - dstoffj;
+ int const dstindk = dstdetailoffk + k - dstoffk;
+ ifcheck assert (dstindi>=0 and dstindi<dstleni);
+ ifcheck assert (dstindj>=0 and dstindj<dstlenj);
+ ifcheck assert (dstindk>=0 and dstindk<dstlenk);
+ size_t const dstind =
+ dstindi + dstleni * (dstindj + dstlenj * dstindk);
+ size_t const bufind =
+ (dstdetailleni - 1 - i) + dstdetailleni * ((dstdetaillenj - 1 - j) + dstdetaillenj * k);
+ dstptr[dstind] = dstdataptr[bufind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else if (dstdetail[n*SLAB_MAXDIM ].str==1 and
+ dstdetail[n*SLAB_MAXDIM+1].str==1 and
+ dstdetail[n*SLAB_MAXDIM+2].str==1 and
+ vartype == CCTK_VARIABLE_REAL)
+ {
+ // Optimised version for CCTK_REAL and stride 1
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ CCTK_REAL * restrict const dstptr = (CCTK_REAL *)dstptrs[varis[vari]];
+ CCTK_REAL const * restrict const dstdataptr =
+ (CCTK_REAL const *)&dstdata.front() +
+ dstoffset[n] + vari * dstelems[n];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int ipos[SLAB_MAXDIM];
+ ipos[0] = i;
+ ipos[1] = j;
+ ipos[2] = k;
+ int bufipos[SLAB_MAXDIM];
+ int dstipos[SLAB_MAXDIM];
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ if (not info[d].flip) {
+ bufipos[d] = ipos[d];
+ } else {
+ bufipos[d] = dstdetail[n*SLAB_MAXDIM+d].len - 1 - ipos[d];
+ }
+ ifcheck assert (bufipos[d] >= 0 and bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len);
+ dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d];
+ ifcheck assert (dstipos[d] >= info[d].dst.local.off and
+ dstipos[d] < info[d].dst.local.off + info[d].dst.local.len);
+ ifcheck assert (dstipos[d] >= info[d].dst.slab.off and
+ dstipos[d] <= info[d].dst.slab.off + info[d].dst.slab.len - 1);
+ }
+ size_t bufind = 0;
+ size_t dstind = 0;
+ for (int d=SLAB_MAXDIM-1; d>=0; --d) {
+ bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d];
+ dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off;
+ }
+ ifcheck assert (bufind < (size_t)dstcount[n]);
+ ifcheck assert (dstind < dstlentot);
+ dstptr[dstind] = dstdataptr[bufind];
+ }
+ }
+ }
+
+ } // for vari
+
+ } else {
+ // Generic, unoptimised version
+
+ int const dstdetailleni = dstdetail[n*SLAB_MAXDIM+0].len;
+ int const dstdetaillenj = dstdetail[n*SLAB_MAXDIM+1].len;
+ int const dstdetaillenk = dstdetail[n*SLAB_MAXDIM+2].len;
+
+ for (int vari=0; vari<nvaris; ++vari) {
+ char * restrict const dstptr = (char *)dstptrs[varis[vari]];
+ char const * restrict const dstdataptr =
+ &dstdata[vartypesize * (dstoffset[n] + vari * dstelems[n])];
+
+# pragma omp parallel for
+ for (int k = 0; k < dstdetaillenk; ++k) {
+ for (int j = 0; j < dstdetaillenj; ++j) {
+ for (int i = 0; i < dstdetailleni; ++i) {
+ int ipos[SLAB_MAXDIM];
+ ipos[0] = i;
+ ipos[1] = j;
+ ipos[2] = k;
+ int bufipos[SLAB_MAXDIM];
+ int dstipos[SLAB_MAXDIM];
+ for (int d=0; d<SLAB_MAXDIM; ++d) {
+ if (not info[d].flip) {
+ bufipos[d] = ipos[d];
+ } else {
+ bufipos[d] = dstdetail[n*SLAB_MAXDIM+d].len - 1 - ipos[d];
+ }
+ ifcheck assert (bufipos[d] >= 0 and bufipos[d] < dstdetail[n*SLAB_MAXDIM+d].len);
+ dstipos[d] = dstdetail[n*SLAB_MAXDIM+d].off + ipos[d] * info[d].dst.slab.str;
+ ifcheck assert (dstipos[d] >= info[d].dst.local.off and
+ dstipos[d] < info[d].dst.local.off + info[d].dst.local.len);
+ ifcheck assert (dstipos[d] >= info[d].dst.slab.off and
+ dstipos[d] <= info[d].dst.slab.off + (info[d].dst.slab.len - 1) * info[d].dst.slab.str);
+ ifcheck assert ((dstipos[d] - info[d].dst.slab.off) % info[d].dst.slab.str == 0);
+ }
+ size_t bufind = 0;
+ size_t dstind = 0;
+ for (int d=SLAB_MAXDIM-1; d>=0; --d) {
+ bufind = bufind * dstdetail[n*SLAB_MAXDIM+d].len + bufipos[d];
+ dstind = dstind * info[d].dst.local.len + dstipos[d] - info[d].dst.local.off;
+ }
+ ifcheck assert (bufind < (size_t)dstcount[n]);
+ ifcheck assert (dstind < dstlentot);
+ memcpy (dstptr + vartypesize * dstind,
+ dstdataptr + vartypesize * bufind,
+ vartypesize);
+ }
+ }
+ }
+
+ } // for vari
+
+ }
+
+ } // for n
+ CCTK_TimerStopI (timer_copy_back);
+
+ } // for vartypei
+
+
+
+ ifcheck {
+ ifdebug fflush (stdout);
+ MPI_Barrier (comm);
+ }
+
+ return 0;
+}
+
+
+
+extern "C"
+void CCTK_FCALL
+CCTK_FNAME(Slab_Transfer) (int * restrict const ierr,
+ cGH const * const * restrict const cctkGH,
+ int const * restrict const dim,
+ int const * restrict const src_gsh,
+ int const * restrict const src_lbnd,
+ int const * restrict const src_lsh,
+ int const * restrict const src_lbbox,
+ int const * restrict const src_ubbox,
+ int const * restrict const src_nghostzones,
+ int const * restrict const src_off,
+ int const * restrict const src_str,
+ int const * restrict const src_len,
+ int const * restrict const dst_gsh,
+ int const * restrict const dst_lbnd,
+ int const * restrict const dst_lsh,
+ int const * restrict const dst_lbbox,
+ int const * restrict const dst_ubbox,
+ int const * restrict const dst_nghostzones,
+ int const * restrict const dst_off,
+ int const * restrict const dst_str,
+ int const * restrict const dst_len,
+ int const * restrict const xpose,
+ int const * restrict const flip,
+ int const * restrict const options,
+ int const * restrict const srctype,
+ void const * restrict const srcptr,
+ int const * restrict const dsttype,
+ void * restrict const dstptr)
+{
+ vector<xferinfo> xferinfo (*dim);
+
+ for (int d=0; d<*dim; ++d) {
+ xferinfo[d].src.gsh = src_gsh[d];
+ xferinfo[d].src.lbnd = src_lbnd[d];
+ xferinfo[d].src.lsh = src_lsh[d];
+ xferinfo[d].src.lbbox = src_lbbox[d];
+ xferinfo[d].src.ubbox = src_ubbox[d];
+ xferinfo[d].src.nghostzones = src_nghostzones[d];
+ xferinfo[d].src.off = src_off[d];
+ xferinfo[d].src.str = src_str[d];
+ xferinfo[d].src.len = src_len[d];
+
+ xferinfo[d].dst.gsh = dst_gsh[d];
+ xferinfo[d].dst.lbnd = dst_lbnd[d];
+ xferinfo[d].dst.lsh = dst_lsh[d];
+ xferinfo[d].dst.lbbox = dst_lbbox[d];
+ xferinfo[d].dst.ubbox = dst_ubbox[d];
+ xferinfo[d].dst.nghostzones = dst_nghostzones[d];
+ xferinfo[d].dst.off = dst_off[d];
+ xferinfo[d].dst.str = dst_str[d];
+ xferinfo[d].dst.len = dst_len[d];
+
+ xferinfo[d].xpose = xpose[d];
+ xferinfo[d].flip = flip[d];
+ }
+
+ *ierr = Slab_Transfer (*cctkGH, *dim, &xferinfo.front(), *options,
+ *srctype, srcptr, *dsttype, dstptr);
+}
+
+
+
+extern "C"
+int
+Slab_Transfer (cGH const * restrict const cctkGH,
+ int const dim,
+ xferinfo const * restrict const xferinfo,
+ int const options,
+ int const srctype,
+ void const * restrict const srcptr,
+ int const dsttype,
+ void * restrict const dstptr)
+{
+ int const nvars = 1;
+ int const srctypes[] = { srctype };
+ void const * restrict const srcptrs[] = { srcptr };
+ int const dsttypes[] = { dsttype };
+ void * restrict const dstptrs[] = { dstptr };
+ return Slab_MultiTransfer (cctkGH, dim, xferinfo, options,
+ nvars, srctypes, srcptrs, dsttypes, dstptrs);
+}