#include <cassert>
#include <cmath>
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <vector>

#include <mpi.h>

#include <cctk.h>
#include <cctk_Parameters.h>

#include <dist.hh>

#include <carpet.hh>

#include "fasterp.hh"



namespace CarpetInterp2 {
  
  int const ipoison = -1234567890;
  CCTK_REAL const poison = -1.0e+12;
  
  template <typename T> T get_poison () { abort(); }
  template<> int get_poison () { return ipoison; }
  template<> CCTK_REAL get_poison () { return poison; }
  
  template <typename T>
  void fill (vector<T> & v, T const & val)
  {
    // fill (v.begin(), v.end(), val);
#pragma omp parallel for
    for (int n=0; n<int(v.size()); ++n) {
      v.AT(n) = val;
    }
  }
  
  template <typename T>
  void fill_with_poison (vector<T> & v)
  {
#ifndef NDEBUG
    fill (v, get_poison<T>());
    // fill (v.begin(), v.end(), get_poison<T>());
#endif
  }
  
  template <typename T>
  void check_for_poison (vector<T> const & v)
  {
#pragma omp parallel for
    for (int n=0; n<int(v.size()); ++n) {
      assert (v.AT(n) != get_poison<T>());
    }
  }
  
  
  
  // Create an MPI datatype for location information
  MPI_Datatype
  fasterp_iloc_t::mpi_datatype ()
  {
    static bool initialised = false;
    static MPI_Datatype newtype;
    if (not initialised) {
      static fasterp_iloc_t s;
#define ENTRY(type, name)                                               \
      {                                                                 \
        sizeof s.name / sizeof(type),   /* count elements */            \
        (char*)&s.name - (char*)&s,     /* offsetof doesn't work (why?) */ \
        dist::datatype<type>()          /* find MPI datatype */         \
      }
      dist::mpi_struct_descr_t const descr[] = {
        ENTRY(int, mrc),
#ifdef CARPET_DEBUG
        ENTRY(int, pn),
        ENTRY(int, ipos),
        ENTRY(int, ind),
#endif
        ENTRY(int, ind3d),
        ENTRY(CCTK_REAL, offset),
        {1, sizeof(s), MPI_UB}
      };
#undef ENTRY
      dist::create_mpi_datatype (sizeof(descr) / sizeof(descr[0]), descr,
                                 newtype);
      initialised = true;
    }
    return newtype;
  }
  
  void
  fasterp_iloc_t::
  output (ostream& os)
    const
  {
    os << "fasterp_iloc_t{"
       << "mrc=" << mrc << ","
#ifdef CARPET_DEBUG
       << "pn=" << pn << ","
       << "ipos=" << ipos << ","
       << "ind=" << ind << ","
#endif
       << "ind3d=" << ind3d << ","
       << "offset=" << offset << "}";
  }
  
  
  
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  
  
  
  int mrc_t::maps = -1;
  int mrc_t::reflevels = -1;
  int mrc_t::components = -1;
  
  void
  mrc_t::
  determine_mrc_info ()
  {
    maps = Carpet::maps;
    reflevels = Carpet::reflevels;
    components = 0;
    for (int m=0; m<maps; ++m) {
      for (int rl=0; rl<reflevels; ++rl) {
        int const ncomps = Carpet::vhh.AT(m)->components (rl);
        components = max (components, ncomps);
      }
    }
  }
  
  mrc_t::
  mrc_t (int ind)
  {
    assert (ind >= 0);
    int const ind0 = ind;
    c = ind % components;
    ind /= components;
    rl = ind % reflevels;
    ind /= reflevels;
    m = ind % maps;
    ind /= maps;
    assert (ind == 0);
    assert (get_ind() == ind0);
  }
  
  void
  mrc_t::
  output (ostream& os)
    const
  {
    os << "mrc{m=" << m << ",rl=" << rl << ",c=" << c << "}";
  }
  
  void
  pn_t::
  output (ostream& os)
    const
  {
    os << "pn{p=" << p << ",n=" << n << "}";
  }
  
  
  
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  
  
  
  // Calculate the coefficients for one interpolation stencil
  // TODO: Could templatify this function on the order to improve
  // efficiency
  void
  fasterp_src_loc_t::
  calc_stencil (fasterp_iloc_t const & iloc,
                ivect const & lsh,
                int const order)
  {
    assert (order <= max_order);
    CCTK_REAL const eps = 1.0e-12;
#ifdef CARPET_DEBUG
    mrc = iloc.mrc;
    pn = iloc.pn;
    ipos = iloc.ipos;
    
    // Save lsh
    saved_lsh = lsh;
#endif
    
#ifndef NDEBUG
    // Poison all coefficients
    for (int d=0; d<dim; ++d) {
      for (int n=0; n<=max_order; ++n) {
        coeffs[d][n] = poison;
      }
    }
#endif
    
    // Choose the stencil anchor (the first stencil point) so that the
    // stencil is as centred as possible about the interpolation point
    ivect iorigin;
    for (int d=0; d<dim; ++d) {
      if (order % 2 == 0) {
        iorigin[d] = - order/2;
      } else {
        // Shift the anchor for odd orders (even numbers of stencil
        // points), depending on which side of the grid point the
        // interpolation point lies
        iorigin[d] = - (order-1)/2 - (iloc.offset[d] < 0.0 ? 1 : 0);
      }
    }
    rvect const offset = iloc.offset - rvect(iorigin);
    // Ensure that the stencil is centred
    assert (all (offset >= 0.5*(order-1) and offset < 0.5*(order+1)));
    
    for (int d=0; d<dim; ++d) {
      // Lagrange interpolation:
      //    C_n = PRODUCT_{m!=n} [(x - x_m) / (x_n - x_m)]
      CCTK_REAL const x = offset[d];
      CCTK_REAL const rx = floor(x + 0.5);
      if (fabs(x - rx) < eps) {
        // The interpolation point coincides with a grid point; no
        // interpolation is necessary (this is a special case)
        iorigin[d] += int(rx);
        exact[d] = true;
      } else {
        for (int n=0; n<=order; ++n) {
          CCTK_REAL const xn = n;
          CCTK_REAL c = 1.0;
          for (int m=0; m<=order; ++m) {
            if (m != n) {
              CCTK_REAL const xm = m;
              c *= (x - xm) / (xn - xm);
            }
          }
          coeffs[d][n] = c;
        }
        exact[d] = false;
        // The sum of the coefficients must be one
        CCTK_REAL s = 0.0;
        for (int n=0; n<=order; ++n) {
          s += coeffs[d][n];
        }
        assert (fabs(s - 1.0) <= eps);
      }
    }
    
    // Set the 3D location of the stencil anchor
    ind3d = iloc.ind3d + index(lsh, iorigin);
#ifdef CARPET_DEBUG
    ind = iloc.ind + iorigin;
    assert (all (ind>=0 and ind+either(exact,0,order)<lsh));
    assert (ind3d == index(lsh, ind));
#endif
  }
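  
  // Illustrative sketch (not part of the original thorn, disabled):
  // the Lagrange weights that calc_stencil computes above, written as
  // a stand-alone helper.  For order p and a normalised offset x
  // measured from the stencil anchor, the weight of stencil point n is
  //    C_n = PRODUCT_{m!=n} (x - x_m) / (x_n - x_m),  with x_m = m.
  // For p=1 and x=0.3 this gives the familiar linear weights 0.7 and
  // 0.3; the weights always sum to one, which calc_stencil asserts.
#if 0
  static void example_lagrange_weights (CCTK_REAL const x,
                                        int const p,
                                        CCTK_REAL * const c)
  {
    for (int n=0; n<=p; ++n) {
      c[n] = 1.0;
      for (int m=0; m<=p; ++m) {
        if (m != n) {
          c[n] *= (x - xm(m)) / CCTK_REAL(n - m);
        }
      }
    }
  }
#endif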
  void
  fasterp_src_loc_t::
  output (ostream& os)
    const
  {
    os << "fasterp_src_loc_t{"
       << "coeffs=[";
    for (int d=0; d<dim; ++d) {
      if (d>0) os << ",";
      os << "[";
      for (int n=0; n<=max_order; ++n) {
        if (n>0) os << ",";
        os << coeffs[d][n];
      }
      os << "]";
    }
    os << "],";
    os << "exact=" << exact << ",";
#ifdef CARPET_DEBUG
    os << "pn=" << pn << ",";
    os << "mrc=" << mrc << ",";
    os << "ipos=" << ipos << ",";
    os << "ind=" << ind << ",";
#endif
    os << "ind3d=" << ind3d;
#ifdef CARPET_DEBUG
    os << "," << "saved_lsh=" << saved_lsh;
#endif
    os << "}";
  }
  
  
  
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  
  
  
  // Set up an interpolation starting from global coordinates
  fasterp_setup_t::
  fasterp_setup_t (cGH const * restrict const cctkGH,
                   fasterp_glocs_t const & locations,
                   int const order_)
    : order (order_)
  {
    // Some global properties
    int const npoints = locations.size();
    
    // Calculate patch numbers and local coordinates
    fasterp_llocs_t local_locations (npoints);
    
    if (CCTK_IsFunctionAliased ("MultiPatch_GlobalToLocal")) {
      // This is a multi-patch simulation: convert global to local
      // coordinates
      CCTK_REAL const * coords[dim];
      CCTK_REAL       * local_coords[dim];
      for (int d=0; d<dim; ++d) {
        coords[d]       = & locations.coords[d].front();
        local_coords[d] = & local_locations.coords[d].front();
      }
      
      int const ierr = MultiPatch_GlobalToLocal
        (cctkGH, dim, npoints, coords,
         & local_locations.maps.front(), local_coords, NULL, NULL);
      assert (not ierr);
      
    } else {
      // This is a single-patch simulation
      
      // TODO: Optimise this: don't copy coordinates, and don't store
      // map numbers if there is only one map
      for (int d=0; d<dim; ++d) {
        for (int n=0; n<npoints; ++n) {
          local_locations.maps.AT(n) = 0;
          local_locations.coords[d].AT(n) = locations.coords[d].AT(n);
        }
      }
      
    } // if not multi-patch
    
    setup (cctkGH, local_locations);
  }
  
  
  
  // Set up an interpolation starting from local coordinates
  fasterp_setup_t::
  fasterp_setup_t (cGH const * restrict const cctkGH,
                   fasterp_llocs_t const & locations,
                   int const order_)
    : order (order_)
  {
    setup (cctkGH, locations);
  }
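  
  // Illustrative sketch (not part of the original thorn, disabled):
  // how client code might drive this interpolator, assuming the
  // declarations in fasterp.hh.  The interpolated variable, the point
  // coordinates, and the interpolation order are hypothetical.
#if 0
  void example_client (cGH const * const cctkGH)
  {
    int const npoints = 1;
    fasterp_glocs_t locations (npoints);
    locations.coords[0].AT(0) = 1.0;   // global x, y, z of the point
    locations.coords[1].AT(0) = 2.0;
    locations.coords[2].AT(0) = 3.0;
    
    // The setup (the expensive part) can be reused for repeated
    // interpolations at the same locations
    fasterp_setup_t const setup (cctkGH, locations, 2 /* order */);
    
    vector<int> varinds (1);
    varinds.AT(0) = CCTK_VarIndex ("grid::x"); // hypothetical choice
    vector<CCTK_REAL> result (npoints);
    vector<CCTK_REAL *> values (1);
    values.AT(0) = & result.front();
    setup.interpolate (cctkGH, varinds, values);
  }
#endif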
  
  
  // Helper for the constructors: set up the interpolation
  void
  fasterp_setup_t::
  setup (cGH const * restrict const cctkGH,
         fasterp_llocs_t const & locations)
  {
    DECLARE_CCTK_PARAMETERS;
    
    if (order > max_order) {
      CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING,
                  "Interpolation order cannot be larger than max_order=%d; "
                  "order=%d was requested. "
                  "(You can increase the compile time constant max_order "
                  "in thorn CarpetInterp2.)",
                  max_order, order);
    }
    
    // Some global properties
    int const npoints = locations.size();
    int const nprocs = CCTK_nProcs (cctkGH);
    // int const myproc = CCTK_MyProc (cctkGH);
    
    mrc_t::determine_mrc_info();
    int const maxmrc = mrc_t::get_max_ind();
    
    MPI_Comm & comm_world = * (MPI_Comm *) GetMPICommWorld (cctkGH);
    
    // Obtain the coordinate ranges for all patches
    vector<rvect> lower (Carpet::maps);
    vector<rvect> upper (Carpet::maps);
    vector<rvect> delta (Carpet::maps);  // spacing on finest possible grid
    vector<rvect> idelta (Carpet::maps); // inverse spacing
    for (int m=0; m<Carpet::maps; ++m) {
      jvect gsh;
      int const ierr =
        Carpet::GetCoordRange (cctkGH, m, Carpet::mglevel, dim,
                               & gsh[0],
                               & lower.AT(m)[0], & upper.AT(m)[0],
                               & delta.AT(m)[0]);
      assert (not ierr);
      delta.AT(m) /= rvect (Carpet::maxspacereflevelfact);
      idelta.AT(m) = CCTK_REAL(1.0) / delta.AT(m);
    }
    
    // Calculate the refinement level, component, and index of each
    // interpolation point
    vector<fasterp_iloc_t> ilocs (npoints);
    vector<int> proc (npoints);
    fill_with_poison (proc);
    vector<int> nlocs (nprocs, 0);
    
    int min_rl, max_rl;
    if (Carpet::is_level_mode()) {
      min_rl = Carpet::reflevel;
      max_rl = Carpet::reflevel + 1;
    } else if (Carpet::is_global_mode()) {
      min_rl = 0;
      max_rl = Carpet::reflevels;
    }
    
#pragma omp parallel for
    for (int n=0; n<npoints; ++n) {
      int const m = locations.maps.AT(n);
      rvect const pos (locations.coords[0].AT(n),
                       locations.coords[1].AT(n),
                       locations.coords[2].AT(n));
      
      gh const * const hh = Carpet::vhh.AT(m);
      ibbox const & baseext = hh->baseextent(Carpet::mglevel, 0);
      // rvect const rpos =
      //   (pos - lower.AT(m)) / (upper.AT(m) - lower.AT(m)) *
      //   rvect (baseext.upper() - baseext.lower());
      rvect const rpos = (pos - lower.AT(m)) * idelta.AT(m);
      
      // Find refinement level and component
      int rl, c;
      ivect ipos;
      hh->locate_position (rpos,
                           Carpet::mglevel,
                           min_rl, max_rl,
                           rl, c, ipos);
      if (not (rl>=0 and c>=0)) {
#pragma omp critical
        {
          ostringstream msg;
          msg << "Interpolation point " << n << " on map " << m << " "
              << "at " << pos << " is outside of the grid hierarchy";
          CCTK_WARN (CCTK_WARN_ABORT, msg.str().c_str());
        }
      }
      assert (rl>=0 and c>=0);
      
      ibbox const & ext =
        Carpet::vdd.AT(m)->boxes.AT(Carpet::mglevel).AT(rl).AT(c).exterior;
      
      rvect dpos = rpos - rvect(ipos);
      
      // Convert from Carpet indexing to grid point indexing
      assert (all (ipos % ext.stride() == ivect(0)));
      ipos /= ext.stride();
      dpos /= rvect(ext.stride());
      assert (all (abs(dpos) <= rvect(0.5)));
      
      ivect const ind = ipos - ext.lower() / ext.stride();
      ivect const lsh = ext.shape() / ext.stride();
      int const ind3d = index(lsh, ind);
      
#if 0
      ENTER_SINGLEMAP_MODE (cctkGH, m, CCTK_GF) {
        ENTER_LOCAL_MODE (cctkGH, c, CCTK_GF) {
          CCTK_REAL const * restrict const xptr =
            (CCTK_REAL const *) CCTK_VarDataPtr (cctkGH, 0, "grid::x");
          CCTK_REAL const * restrict const yptr =
            (CCTK_REAL const *) CCTK_VarDataPtr (cctkGH, 0, "grid::y");
          CCTK_REAL const * restrict const zptr =
            (CCTK_REAL const *) CCTK_VarDataPtr (cctkGH, 0, "grid::z");
          assert (xptr);
          assert (yptr);
          assert (zptr);
          cout << "CI2 map=" << m << " pos=" << pos
               << " ind=" << ind
               << " x=" << xptr[ind3d]
               << " y=" << yptr[ind3d]
               << " z=" << zptr[ind3d] << endl;
        } LEAVE_LOCAL_MODE;
      } LEAVE_SINGLEMAP_MODE;
#endif
      
      // TODO: assert that there are enough ghost zones
      
      // Store result
      fasterp_iloc_t & iloc = ilocs.AT(n);
      iloc.mrc    = mrc_t(m, rl, c);
#ifdef CARPET_DEBUG
      iloc.pn.p   = dist::rank();
      iloc.pn.n   = n;
      iloc.ipos   = ipos * ext.stride();
      iloc.ind    = ind;
#endif
      iloc.ind3d  = ind3d;
      iloc.offset = dpos;
      
      // Find source processor
      int const p = Carpet::vhh.AT(m)->processor(rl,c);
      proc.AT(n) = p;
#pragma omp atomic
      ++ nlocs.AT(p);
      
      // Output
      if (veryverbose) {
#pragma omp critical
        {
          cout << "Point #" << n << " at " << pos << ": iloc " << iloc << endl;
        }
      }
    }
    
    // Find mapping from processors to "processor indices": It may be
    // too expensive to store data for all processors, so we store
    // only data about those processors with which we actually
    // communicate.
    if (verbose) CCTK_INFO ("Determine set of communicating processors");
    {
      int n_nz_nlocs = 0;
      for (int p=0; p<nprocs; ++p) {
        if (nlocs.AT(p) > 0) {
          ++ n_nz_nlocs;
        }
      }
      recv_descr.procs.resize (n_nz_nlocs);
      recv_descr.procinds.resize (nprocs, -1);
      int pp = 0;
      int offset = 0;
      for (int p=0; p<nprocs; ++p) {
        if (nlocs.AT(p) > 0) {
          recv_descr.procs.AT(pp).p       = p;
          recv_descr.procs.AT(pp).offset  = offset;
          recv_descr.procs.AT(pp).npoints = nlocs.AT(p);
          recv_descr.procinds.AT(p) = pp;
          ++ pp;
          offset += nlocs.AT(p);
        }
      }
      assert (pp == n_nz_nlocs);
      assert (offset == npoints);
      recv_descr.npoints = npoints;
    }
    
    // Create a mapping "index" from location index, as specified by
    // the user, to the index order in which the data are received
    // from the other processors.
    if (verbose) {
      CCTK_INFO ("Determine inter-processor gather index permutation");
    }
    {
      vector<int> index (recv_descr.procs.size());
      fill_with_poison (index);
      for (int pp=0; pp<int(recv_descr.procs.size()); ++pp) {
        index.AT(pp) = recv_descr.procs.AT(pp).offset;
      }
      recv_descr.index.resize (npoints);
      fill_with_poison (recv_descr.index);
      for (int n=0; n<npoints; ++n) {
        int const p = proc.AT(n);
        int const pp = recv_descr.procinds.AT(p);
        assert (pp >= 0);
        recv_descr.index.AT(n) = index.AT(pp);
        ++index.AT(pp);
      }
      for (int pp=0; pp<int(recv_descr.procs.size()); ++pp) {
        int const recv_npoints =
          index.AT(pp) - recv_descr.procs.AT(pp).offset;
        assert (recv_npoints == recv_descr.procs.AT(pp).npoints);
      }
#ifndef NDEBUG
      // Ensure that the index is a permutation
      vector<bool> received (npoints);
      fill (received, false);
#pragma omp parallel for
      for (int n=0; n<npoints; ++n) {
        received.AT(recv_descr.index.AT(n)) = true;
      }
#pragma omp parallel for
      for (int n=0; n<npoints; ++n) {
        assert (received.AT(n));
      }
#endif
    }
    
    // Count the points which have to be sent to other processors, and
    // exchange this information with MPI
    if (verbose) {
      CCTK_INFO ("Count and exchange number of communicated grid points");
    }
    vector<int> recv_npoints (nprocs), recv_offsets (nprocs);
    fill (recv_npoints, 0);
    fill_with_poison (recv_offsets);
    {
      int offset = 0;
      for (int pp=0; pp<int(recv_descr.procs.size()); ++pp) {
        recv_proc_t const & recv_proc = recv_descr.procs.AT(pp);
        recv_npoints.AT(recv_proc.p) = recv_proc.npoints;
        recv_offsets.AT(recv_proc.p) = offset;
        offset += recv_proc.npoints;
      }
      assert (offset == npoints);
    }
    vector<int> send_npoints (nprocs), send_offsets (nprocs);
    fill_with_poison (send_npoints);
    fill_with_poison (send_offsets);
    MPI_Alltoall (&recv_npoints.front(), 1, MPI_INT,
                  &send_npoints.front(), 1, MPI_INT,
                  comm_world);
    int npoints_send = 0;
    for (int p=0; p<nprocs; ++p) {
      send_offsets.AT(p) = npoints_send;
      npoints_send += send_npoints.AT(p);
    }
    
    // Scatter the location information into a send-array, and
    // exchange it with MPI
    vector<fasterp_iloc_t> scattered_ilocs (npoints);
#pragma omp parallel for
    for (int n=0; n<npoints; ++n) {
      scattered_ilocs.AT(recv_descr.index.AT(n)) = ilocs.AT(n);
    }
    vector<fasterp_iloc_t> gathered_ilocs (npoints_send);
    MPI_Alltoallv
      (&scattered_ilocs.front(), &recv_npoints.front(), &recv_offsets.front(),
       fasterp_iloc_t::mpi_datatype(),
       &gathered_ilocs.front(), &send_npoints.front(), &send_offsets.front(),
       fasterp_iloc_t::mpi_datatype(),
       comm_world);
    
#ifdef CARPET_DEBUG
    // Ensure that the ilocs were sent to the right processor
    for (int p=0; p<nprocs; ++p) {
      for (int n=0; n<send_npoints.AT(p); ++n) {
        fasterp_iloc_t const & iloc =
          gathered_ilocs.AT(send_offsets.AT(p) + n);
        int const m  = iloc.mrc.m;
        int const rl = iloc.mrc.rl;
        int const c  = iloc.mrc.c;
        assert (Carpet::vhh.AT(m)->is_local(rl,c));
      }
    }
#endif
    
    // Fill in send descriptors
    send_descr.npoints = npoints_send;
    
    {
      int n_nz_send_npoints = 0;
      for (int p=0; p<nprocs; ++p) {
        if (send_npoints.AT(p) > 0) ++n_nz_send_npoints;
      }
      send_descr.procs.resize (n_nz_send_npoints);
      int pp = 0;
      for (int p=0; p<nprocs; ++p) {
        if (send_npoints.AT(p) > 0) {
          send_proc_t & send_proc = send_descr.procs.AT(pp);
          send_proc.p       = p;
          send_proc.offset  = send_offsets.AT(p);
          send_proc.npoints = send_npoints.AT(p);
          ++pp;
        }
      }
      assert (pp == n_nz_send_npoints);
    }
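    
    // Illustrative sketch (not part of the original thorn, disabled):
    // the counts-to-offsets pattern used throughout this setup.  Each
    // process learns how many points it must handle per peer (e.g. via
    // the MPI_Alltoall above); an exclusive prefix sum over the counts
    // then yields the displacement arrays that MPI_Alltoallv expects.
    // With hypothetical counts {2,0,3} the offsets become {0,2,2} and
    // the total is 5.
#if 0
    {
      vector<int> counts (nprocs);   // e.g. filled by MPI_Alltoall above
      vector<int> offsets (nprocs);
      int total = 0;
      for (int p=0; p<nprocs; ++p) {
        offsets.AT(p) = total;       // exclusive prefix sum
        total += counts.AT(p);
      }
    }
#endif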
    
    // Calculate stencil coefficients
    if (verbose) CCTK_INFO ("Calculate stencil coefficients");
    
    for (size_t pp=0; pp<send_descr.procs.size(); ++pp) {
      send_proc_t & send_proc = send_descr.procs.AT(pp);
      
      // Map the (map, reflevel, component) triples onto a dense list
      // of components for this destination processor
      vector<int> mrc2comp (maxmrc, -1);
      vector<int> comp2mrc (maxmrc);
      fill_with_poison (comp2mrc);
      int ncomps = 0;
      
      vector<int> npoints_comp (maxmrc);
      fill (npoints_comp, 0);
      
      // TODO: parallelise with OpenMP
      for (int n=0; n<send_proc.npoints; ++n) {
        fasterp_iloc_t const & iloc = gathered_ilocs.AT(send_proc.offset + n);
        int const mrc = iloc.mrc.get_ind();
        if (mrc2comp.AT(mrc) == -1) {
          mrc2comp.AT(mrc) = ncomps;
          comp2mrc.AT(ncomps) = mrc;
          ++ ncomps;
        }
        ++ npoints_comp.AT(mrc);
      }
      send_proc.comps.resize (ncomps);
      
      int offset = 0;
      for (int comp=0; comp<ncomps; ++comp) {
        send_comp_t & send_comp = send_proc.comps.AT(comp);
        int const mrc = comp2mrc.AT(comp);
        send_comp.mrc = mrc_t(mrc);
        send_comp.locs.reserve (npoints_comp.AT(mrc));
        
        int const m  = send_comp.mrc.m;
        int const rl = send_comp.mrc.rl;
        int const c  = send_comp.mrc.c;
        assert (Carpet::vhh.AT(m)->is_local(rl,c));
        ibbox const & ext =
          Carpet::vdd.AT(m)->boxes.AT(Carpet::mglevel).AT(rl).AT(c).exterior;
        send_comp.lsh = ext.shape() / ext.stride();
        
        send_comp.offset = offset;
        send_comp.npoints = npoints_comp.AT(mrc);
        offset += send_comp.npoints;
      }
      assert (offset == send_proc.npoints);
      
      send_proc.index.resize (send_proc.npoints);
      fill_with_poison (send_proc.index);
      // TODO: This is not parallel!  Loop over comps instead?
      // #pragma omp parallel for
      for (int n=0; n<send_proc.npoints; ++n) {
        fasterp_iloc_t const & iloc = gathered_ilocs.AT(send_proc.offset + n);
        int const mrc = iloc.mrc.get_ind();
        int const comp = mrc2comp.AT(mrc);
        send_comp_t & send_comp = send_proc.comps.AT(comp);
        
        send_proc.index.AT(n) = send_comp.offset + int(send_comp.locs.size());
        
        fasterp_src_loc_t sloc;
        sloc.calc_stencil (iloc, send_comp.lsh, order);
        send_comp.locs.push_back (sloc);
      }
      
#ifdef CARPET_DEBUG
      // Ensure that the send indices are a permutation
      vector<bool> used (send_proc.npoints, false);
      for (int n=0; n<send_proc.npoints; ++n) {
        assert (not used.AT(send_proc.index.AT(n)));
        used.AT(send_proc.index.AT(n)) = true;
      }
      for (int n=0; n<send_proc.npoints; ++n) {
        assert (used.AT(n));
      }
#endif
    } // for pp
    
    // Self-check the communication schedule
    {
      vector<int> recv_count (dist::size());
      fill (recv_count, 0);
      for (size_t pp=0; pp<recv_descr.procs.size(); ++pp) {
        recv_proc_t const & recv_proc = recv_descr.procs.AT(pp);
        assert (recv_count.AT(recv_proc.p) == 0);
        assert (recv_proc.npoints > 0);
        recv_count.AT(recv_proc.p) = recv_proc.npoints;
      }
      
      vector<int> send_count (dist::size());
      fill (send_count, 0);
      for (size_t pp=0; pp<send_descr.procs.size(); ++pp) {
        send_proc_t const & send_proc = send_descr.procs.AT(pp);
        assert (send_count.AT(send_proc.p) == 0);
        assert (send_proc.npoints > 0);
        send_count.AT(send_proc.p) = send_proc.npoints;
      }
      
      {
        vector<int> tmp_count (dist::size());
        MPI_Alltoall (&send_count.front(), 1, MPI_INT,
                      &tmp_count.front(), 1, MPI_INT,
                      dist::comm());
        swap (send_count, tmp_count);
      }
      bool error = false;
      for (int p=0; p<dist::size(); ++p) {
        if (recv_count.AT(p) != send_count.AT(p)) {
          error = true;
        }
      }
      assert (not error);
    }
  }
  
  
  
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////
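  
  // Illustrative sketch (not part of the original thorn, disabled):
  // the mixed-radix encoding behind mrc_t, as used by the send
  // descriptors above.  A (map, reflevel, component) triple is packed
  // into the single integer
  //    ind = c + components * (rl + reflevels * m),
  // which the mrc_t(int) constructor inverts with modulo and division;
  // maps, reflevels, and components are set by determine_mrc_info.
  // The parameters here stand in for those static members.
#if 0
  int example_pack_mrc (int const m, int const rl, int const c,
                        int const reflevels, int const components)
  {
    return c + components * (rl + reflevels * m);
  }
#endif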
  
  
  // Interpolate the given variables at the previously set-up locations
  void
  fasterp_setup_t::
  interpolate (cGH const * restrict const cctkGH,
               vector<int> const & varinds,
               vector<CCTK_REAL *> & values)
    const
  {
    DECLARE_CCTK_PARAMETERS;
    
    // Desired time level
    int const tl = 0;           // current time
    
    size_t const nvars = varinds.size();
    if (verbose) CCTK_VInfo (CCTK_THORNSTRING,
                             "Interpolating %d variables", int(nvars));
    assert (values.size() == nvars);
    for (size_t v=0; v<nvars; ++v) {
      int const vi = varinds.AT(v);
      assert (vi >= 0 and vi < CCTK_NumVars());
      int const gi = CCTK_GroupIndexFromVarI (vi);
      assert (gi >= 0);
      cGroup group;
      {
        int const ierr = CCTK_GroupData (gi, &group);
        assert (not ierr);
      }
      assert (group.grouptype == CCTK_GF);
      assert (group.vartype == CCTK_VARIABLE_REAL);
      assert (group.dim == dim);
#if 0
      // Cannot be called in level mode
      cGroupDynamicData dyndata;
      {
        int const ierr = CCTK_GroupDynamicData (cctkGH, gi, &dyndata);
        assert (not ierr);
      }
      assert (dyndata.activetimelevels > tl);
#endif
      assert (values.AT(v) != NULL);
    }
    
    MPI_Comm & comm_world = * (MPI_Comm *) GetMPICommWorld (cctkGH);
    int const mpi_tag = 0;
    
    // Post Irecvs
    if (verbose) CCTK_INFO ("Posting MPI_Irecvs");
    vector<CCTK_REAL> recv_points (recv_descr.npoints * nvars);
    fill_with_poison (recv_points);
    vector<MPI_Request> recv_reqs (recv_descr.procs.size());
#ifdef CARPET_DEBUG
    vector<pn_t> recv_pn (recv_descr.npoints);
    vector<MPI_Request> recv_reqs_pn (recv_descr.procs.size());
#endif
    for (size_t pp=0; pp<recv_descr.procs.size(); ++pp) {
      recv_proc_t const & recv_proc = recv_descr.procs.AT(pp);
      
      MPI_Irecv (& recv_points.AT(recv_proc.offset * nvars),
                 recv_proc.npoints * nvars,
                 dist::datatype<CCTK_REAL>(), recv_proc.p, mpi_tag,
                 comm_world, & recv_reqs.AT(pp));
#ifdef CARPET_DEBUG
      MPI_Irecv (& recv_pn.AT(recv_proc.offset),
                 recv_proc.npoints * 2,
                 dist::datatype<int>(), recv_proc.p, mpi_tag,
                 comm_world, & recv_reqs_pn.AT(pp));
#endif
    }
    
    // Interpolate data and post Isends
    if (verbose) CCTK_INFO ("Interpolating and posting MPI_Isends");
    // TODO: Use one array per processor?
    vector<CCTK_REAL> send_points (send_descr.npoints * nvars);
    fill_with_poison (send_points);
    vector<MPI_Request> send_reqs (send_descr.procs.size());
#ifdef CARPET_DEBUG
    vector<pn_t> send_pn (send_descr.npoints);
    vector<MPI_Request> send_reqs_pn (send_descr.procs.size());
#endif
    for (size_t pp=0; pp<send_descr.procs.size(); ++pp) {
      send_proc_t const & send_proc = send_descr.procs.AT(pp);
      
      vector<CCTK_REAL> computed_points (send_proc.npoints * nvars);
      fill_with_poison (computed_points);
#ifdef CARPET_DEBUG
      vector<pn_t> computed_pn (send_descr.npoints);
#endif
      for (size_t comp=0; comp<send_proc.comps.size(); ++comp) {
        send_comp_t const & send_comp = send_proc.comps.AT(comp);
        
        int const m  = send_comp.mrc.m;
        int const rl = send_comp.mrc.rl;
        int const c  = send_comp.mrc.c;
        
        // Get pointers to the variable data
        vector<CCTK_REAL const *> varptrs (nvars);
        for (size_t v=0; v<nvars; ++v) {
          int const gi = CCTK_GroupIndexFromVarI (varinds.AT(v));
          assert (gi >= 0);
          int const vi = varinds.AT(v) - CCTK_FirstVarIndexI (gi);
          assert (vi >= 0);
          varptrs.AT(v) =
            (CCTK_REAL const *)
            (* Carpet::arrdata.AT(gi).AT(m).data.AT(vi))
            (tl, rl, c, Carpet::mglevel)->storage();
          assert (varptrs.AT(v));
        }
        
#pragma omp parallel for
        for (int n=0; n<send_comp.npoints; ++n) {
          size_t const ind = (send_comp.offset + n) * nvars;
          send_comp.locs.AT(n).interpolate
            (send_comp.lsh, order, varptrs, & computed_points.AT(ind));
#ifdef CARPET_DEBUG
          computed_pn.AT(send_comp.offset + n) = send_comp.locs.AT(n).pn;
#endif
        }
      } // for comps
      
      // Gather the computed points into the send buffer, undoing the
      // per-component ordering
#pragma omp parallel for
      for (int n=0; n<send_proc.npoints; ++n) {
        size_t const nn = send_proc.index.AT(n);
        for (size_t v=0; v<nvars; ++v) {
          send_points.AT((send_proc.offset + n) * nvars + v) =
            computed_points.AT(nn * nvars + v);
        }
#ifdef CARPET_DEBUG
        send_pn.AT(send_proc.offset + n) = computed_pn.AT(nn);
#endif
      }
#ifdef CARPET_DEBUG
      for (int n=0; n<send_proc.npoints; ++n) {
        assert (send_pn.AT(send_proc.offset + n).p == send_proc.p);
        if (n>0) {
          assert (send_pn.AT(send_proc.offset + n  ).n >
                  send_pn.AT(send_proc.offset + n-1).n);
        }
      }
#endif
      
      MPI_Isend (& send_points.AT(send_proc.offset * nvars),
                 send_proc.npoints * nvars,
                 dist::datatype<CCTK_REAL>(), send_proc.p, mpi_tag,
                 comm_world, & send_reqs.AT(pp));
#ifdef CARPET_DEBUG
      MPI_Isend (& send_pn.AT(send_proc.offset),
                 send_proc.npoints * 2,
                 dist::datatype<int>(), send_proc.p, mpi_tag,
                 comm_world, & send_reqs_pn.AT(pp));
#endif
    } // for pp
    
    // Wait for Irecvs to complete
    if (verbose) CCTK_INFO ("Waiting for MPI_Irecvs to complete");
    MPI_Waitall (recv_reqs.size(), & recv_reqs.front(), MPI_STATUSES_IGNORE);
#ifdef CARPET_DEBUG
    MPI_Waitall (recv_reqs.size(), & recv_reqs_pn.front(),
                 MPI_STATUSES_IGNORE);
#endif
    
    // Gather data
    if (verbose) CCTK_INFO ("Gathering data");
#pragma omp parallel for
    for (int n=0; n<recv_descr.npoints; ++n) {
      size_t const nn = recv_descr.index.AT(n);
      for (size_t v=0; v<nvars; ++v) {
        values.AT(v)[n] = recv_points.AT(nn * nvars + v);
      }
#ifdef CARPET_DEBUG
      assert (recv_pn.AT(nn).p == dist::rank());
      assert (recv_pn.AT(nn).n == n);
#endif
    }
    
    // Wait for Isends to complete
    if (verbose) CCTK_INFO ("Waiting for MPI_Isends to complete");
    MPI_Waitall (send_reqs.size(), & send_reqs.front(), MPI_STATUSES_IGNORE);
#ifdef CARPET_DEBUG
    MPI_Waitall (send_reqs.size(), & send_reqs_pn.front(),
                 MPI_STATUSES_IGNORE);
#endif
  }
  
} // namespace CarpetInterp2
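

// Illustrative sketch (not part of the original thorn, disabled): the
// receive-side index permutation in miniature.  interpolate() receives
// points grouped by source process and uses recv_descr.index to put
// them back into the user's point order; with the hypothetical
// permutation {2,0,1}, user point 0 reads receive-buffer slot 2.
#if 0
void example_gather_permutation ()
{
  std::vector<CCTK_REAL> buffer (3);   // as received, process-ordered
  buffer[0] = 10.0; buffer[1] = 20.0; buffer[2] = 30.0;
  std::vector<int> index (3);          // user order -> buffer slot
  index[0] = 2; index[1] = 0; index[2] = 1;
  std::vector<CCTK_REAL> result (3);   // in the user's point order
  for (int n=0; n<3; ++n) {
    result[n] = buffer[index[n]];      // result = {30, 10, 20}
  }
}
#endif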