diff options
-rw-r--r-- | Carpet/CarpetLib/param.ccl | 32 | ||||
-rw-r--r-- | Carpet/CarpetLib/src/commstate.cc | 144 | ||||
-rw-r--r-- | Carpet/CarpetLib/src/gdata.cc | 40 | ||||
-rw-r--r-- | Carpet/CarpetLib/src/timestat.cc | 4 | ||||
-rw-r--r-- | Carpet/CarpetLib/src/timestat.hh | 2 |
5 files changed, 190 insertions, 32 deletions
diff --git a/Carpet/CarpetLib/param.ccl b/Carpet/CarpetLib/param.ccl index 2292280c6..8d736ed3f 100644 --- a/Carpet/CarpetLib/param.ccl +++ b/Carpet/CarpetLib/param.ccl @@ -82,6 +82,38 @@ STRING memstat_file "File name in which memstat output is collected (because std +# Experimental communication parameters + +BOOLEAN interleave_communications "Try to interleave communications with each other; each processor begins to communicate with its 'right neighbour' in rank, instead of with the root processor" STEERABLE=always +{ +} "no" + +BOOLEAN vary_tags "Use different tags for each communication" STEERABLE=always +{ +} "no" + +BOOLEAN barrier_between_stages "Add a barrier between the communication stages (slows down, but may make timing numbers easier to interpret)" STEERABLE=always +{ +} "no" + +BOOLEAN combine_sends "Send data together and in order of processor ranks" STEERABLE=always +{ +} "no" + +BOOLEAN reduce_mpi_waitall "Call MPI_Waitall only for requests that are not null" STEERABLE=always +{ +} "no" + +BOOLEAN use_mpi_send "Use MPI_Send instead of MPI_Isend" STEERABLE=always +{ +} "no" + +BOOLEAN use_mpi_ssend "Use MPI_Ssend instead of MPI_Isend" STEERABLE=always +{ +} "no" + + + SHARES: IO USES STRING out_dir diff --git a/Carpet/CarpetLib/src/commstate.cc b/Carpet/CarpetLib/src/commstate.cc index 7a5caca3b..0ac7b21de 100644 --- a/Carpet/CarpetLib/src/commstate.cc +++ b/Carpet/CarpetLib/src/commstate.cc @@ -1,3 +1,4 @@ +#include <cassert> #include <cstring> #include "cctk.h" @@ -81,15 +82,18 @@ void comm_state::step () // (a clever MPI layer may take advantage of such early posting). num_posted_recvs = num_completed_recvs = 0; - for (size_t type = 0; type < typebufs.size(); type++) { - - // skip unused datatype buffers - if (not typebufs[type].in_use) { - continue; - } + for (int proc1 = 0; proc1 < dist::size(); ++ proc1) { + size_t const proc = + interleave_communications + ? (proc1 + dist::rank()) % dist::size() + : proc1; + + for (size_t type = 0; type < typebufs.size(); type++) { + + // skip unused datatype buffers + if (not typebufs[type].in_use) continue; - int& datatypesize = typebufs[type].datatypesize; - for (size_t proc = 0; proc < typebufs[type].procbufs.size(); proc++) { + int datatypesize = typebufs[type].datatypesize; procbufdesc& procbuf = typebufs[type].procbufs[proc]; procbuf.sendbufbase = new char[procbuf.sendbufsize*datatypesize]; @@ -105,8 +109,12 @@ void comm_state::step () if (procbuf.recvbufsize > 0) { wtime_commstate_sizes_irecv.start(); + int const tag = + vary_tags + ? (dist::rank() + dist::size() * (proc + dist::size() * type)) % 32768 + : type; MPI_Irecv (procbuf.recvbufbase, procbuf.recvbufsize, - typebufs[type].mpi_datatype, proc, type, + typebufs[type].mpi_datatype, proc, tag, dist::comm(), &rrequests[dist::size()*type + proc]); wtime_commstate_sizes_irecv.stop(); num_posted_recvs++; @@ -114,6 +122,13 @@ void comm_state::step () } } + if (barrier_between_stages) { + // Add a barrier, to try to ensure that all Irecvs are posted + // before the first Isends are made + // (Alternative: Use MPI_Alltoallv instead) + MPI_Barrier (dist::comm()); + } + // Now go and get the send buffers filled with data. // Once a buffer is full it will be posted right away // (see gdata::copy_into_sendbuffer() and @@ -122,6 +137,61 @@ void comm_state::step () break; case state_fill_send_buffers: + if (combine_sends) { + // Send the data. Do not send them sequentially, but try to + // intersperse the communications + for (int proc1 = 0; proc1 < dist::size(); ++ proc1) { + int const proc = + interleave_communications + ? (proc1 + dist::size() - dist::rank()) % dist::size() + : proc1; + + for (size_t type = 0; type < typebufs.size(); type++) { + // skip unused datatype buffers + if (not typebufs[type].in_use) continue; + + int const datatypesize = typebufs[type].datatypesize; + procbufdesc const & procbuf = typebufs[type].procbufs[proc]; + + int const fillstate = procbuf.sendbuf - procbuf.sendbufbase; + assert (fillstate == (int)procbuf.sendbufsize * datatypesize); + + if (procbuf.sendbufsize > 0) { + int const tag = + vary_tags + ? (proc + dist::size() * (dist::rank() + dist::size() * type)) % 32768 + : type; + if (use_mpi_send) { + // use MPI_Send + wtime_commstate_send.start(); + MPI_Send (procbuf.sendbufbase, procbuf.sendbufsize, + typebufs[type].mpi_datatype, proc, tag, + dist::comm()); + srequests[dist::size()*type + proc] = MPI_REQUEST_NULL; + wtime_commstate_send.stop(procbuf.sendbufsize * datatypesize); + } else if (use_mpi_ssend) { + // use MPI_Ssend + wtime_commstate_ssend.start(); + MPI_Ssend (procbuf.sendbufbase, procbuf.sendbufsize, + typebufs[type].mpi_datatype, proc, tag, + dist::comm()); + srequests[dist::size()*type + proc] = MPI_REQUEST_NULL; + wtime_commstate_ssend.stop(procbuf.sendbufsize * datatypesize); + } else { + // use MPI_Isend + wtime_commstate_isend.start(); + MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize, + typebufs[type].mpi_datatype, proc, tag, + dist::comm(), &srequests[dist::size()*type + proc]); + wtime_commstate_isend.stop(procbuf.sendbufsize * datatypesize); + } + } + + } // for type + + } // for proc + } + // Now fall through to the next state in which the recv buffers // are emptied as soon as data has arrived. thestate = state_empty_recv_buffers; @@ -180,9 +250,30 @@ bool comm_state::AllPostedCommunicationsFinished () // check if all outstanding receives have been completed already if (num_posted_recvs == num_completed_recvs) { // finalize the outstanding sends in one go - wtime_commstate_waitall_final.start(); - MPI_Waitall (srequests.size(), &srequests.front(), MPI_STATUSES_IGNORE); - wtime_commstate_waitall_final.stop(); + if (reduce_mpi_waitall) { + size_t nreqs = 0; + for (size_t i=0; i<srequests.size(); ++i) { + if (srequests.at(i) != MPI_REQUEST_NULL) { + ++nreqs; + } + } + vector<MPI_Request> reqs(nreqs); + nreqs = 0; + for (size_t i=0; i<srequests.size(); ++i) { + if (srequests.at(i) != MPI_REQUEST_NULL) { + reqs.at(nreqs) = srequests.at(i); + ++nreqs; + } + } + assert (nreqs == reqs.size()); + wtime_commstate_waitall_final.start(); + MPI_Waitall (reqs.size(), &reqs.front(), MPI_STATUSES_IGNORE); + wtime_commstate_waitall_final.stop(); + } else { + wtime_commstate_waitall_final.start(); + MPI_Waitall (srequests.size(), &srequests.front(), MPI_STATUSES_IGNORE); + wtime_commstate_waitall_final.stop(); + } return true; } @@ -193,13 +284,34 @@ bool comm_state::AllPostedCommunicationsFinished () if (use_waitall) { // mark all posted recveive buffers as ready for (size_t i = 0; i < recvbuffers_ready.size(); i++) { - recvbuffers_ready[i] = rrequests[i] != MPI_REQUEST_NULL; + recvbuffers_ready.at(i) = rrequests.at(i) != MPI_REQUEST_NULL; } // wait for completion of all posted receive operations - wtime_commstate_waitall.start(); - MPI_Waitall (rrequests.size(), &rrequests.front(), MPI_STATUSES_IGNORE); - wtime_commstate_waitall.stop(); + if (reduce_mpi_waitall) { + size_t nreqs = 0; + for (size_t i=0; i<rrequests.size(); ++i) { + if (rrequests.at(i) != MPI_REQUEST_NULL) { + ++nreqs; + } + } + vector<MPI_Request> reqs(nreqs); + nreqs = 0; + for (size_t i=0; i<rrequests.size(); ++i) { + if (rrequests.at(i) != MPI_REQUEST_NULL) { + reqs.at(nreqs) = rrequests.at(i); + ++nreqs; + } + } + assert (nreqs == reqs.size()); + wtime_commstate_waitall.start(); + MPI_Waitall (reqs.size(), &reqs.front(), MPI_STATUSES_IGNORE); + wtime_commstate_waitall.stop(); + } else { + wtime_commstate_waitall.start(); + MPI_Waitall (rrequests.size(), &rrequests.front(), MPI_STATUSES_IGNORE); + wtime_commstate_waitall.stop(); + } num_completed_recvs = num_posted_recvs; } else { int num_completed_recvs_ = 0; diff --git a/Carpet/CarpetLib/src/gdata.cc b/Carpet/CarpetLib/src/gdata.cc index d31d917d2..f275ef02a 100644 --- a/Carpet/CarpetLib/src/gdata.cc +++ b/Carpet/CarpetLib/src/gdata.cc @@ -283,6 +283,8 @@ void gdata::copy_from_wait (comm_state& state, void gdata::copy_into_sendbuffer (comm_state& state, const gdata* src, const ibbox& box) { + DECLARE_CCTK_PARAMETERS; + if (proc() == src->proc()) { // copy on same processor copy_from_innerloop (src, box); @@ -315,14 +317,16 @@ void gdata::copy_into_sendbuffer (comm_state& state, } } - // post the send if the buffer is full - if (fillstate == (int)procbuf.sendbufsize * datatypesize) { - wtime_commstate_isend.start(); - MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize, - state.typebufs.at(c_datatype()).mpi_datatype, - proc(), c_datatype(), dist::comm(), - &state.srequests.at(dist::size()*c_datatype() + proc())); - wtime_commstate_isend.stop(procbuf.sendbufsize * datatypesize); + if (not combine_sends) { + // post the send if the buffer is full + if (fillstate == (int)procbuf.sendbufsize * datatypesize) { + wtime_commstate_isend.start(); + MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize, + state.typebufs.at(c_datatype()).mpi_datatype, + proc(), c_datatype(), dist::comm(), + &state.srequests.at(dist::size()*c_datatype() + proc())); + wtime_commstate_isend.stop(procbuf.sendbufsize * datatypesize); + } } } } @@ -506,6 +510,8 @@ void gdata const int order_space, const int order_time) { + DECLARE_CCTK_PARAMETERS; + if (proc() == srcs.at(0)->proc()) { // interpolate on same processor interpolate_from_innerloop (srcs, times, box, time, @@ -534,14 +540,16 @@ void gdata // advance send buffer to point to the next ibbox slot procbuf.sendbuf += datatypesize * box.size(); - // post the send if the buffer is full - if (fillstate == (int)procbuf.sendbufsize*datatypesize) { - wtime_commstate_interpolate_to_isend.start(); - MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize, - state.typebufs.at(c_datatype()).mpi_datatype, - proc(), c_datatype(), dist::comm(), - &state.srequests.at(dist::size()*c_datatype() + proc())); - wtime_commstate_interpolate_to_isend.stop(procbuf.sendbufsize*datatypesize); + if (not combine_sends) { + // post the send if the buffer is full + if (fillstate == (int)procbuf.sendbufsize*datatypesize) { + wtime_commstate_interpolate_to_isend.start(); + MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize, + state.typebufs.at(c_datatype()).mpi_datatype, + proc(), c_datatype(), dist::comm(), + &state.srequests.at(dist::size()*c_datatype() + proc())); + wtime_commstate_interpolate_to_isend.stop(procbuf.sendbufsize*datatypesize); + } } } } diff --git a/Carpet/CarpetLib/src/timestat.cc b/Carpet/CarpetLib/src/timestat.cc index b126471be..893c5664d 100644 --- a/Carpet/CarpetLib/src/timestat.cc +++ b/Carpet/CarpetLib/src/timestat.cc @@ -130,6 +130,8 @@ timestat wtime_commstate_sizes_irecv; timestat wtime_commstate_waitall_final; timestat wtime_commstate_waitall; timestat wtime_commstate_waitsome; +timestat wtime_commstate_send; +timestat wtime_commstate_ssend; timestat wtime_commstate_isend; timestat wtime_commstate_memcpy; timestat wtime_commstate_interpolate_irecv; @@ -243,6 +245,8 @@ void CarpetLib_printtimestats (CCTK_ARGUMENTS) << " wtime_commstate_waitall_final: " << wtime_commstate_waitall_final << endl << " wtime_commstate_waitall: " << wtime_commstate_waitall << endl << " wtime_commstate_waitsome: " << wtime_commstate_waitsome << endl + << " wtime_commstate_send: " << wtime_commstate_send << endl + << " wtime_commstate_ssend: " << wtime_commstate_ssend << endl << " wtime_commstate_isend: " << wtime_commstate_isend << endl << " wtime_commstate_memcpy: " << wtime_commstate_memcpy << endl << " wtime_commstate_interpolate_irecv: " << wtime_commstate_interpolate_irecv << endl diff --git a/Carpet/CarpetLib/src/timestat.hh b/Carpet/CarpetLib/src/timestat.hh index 9e22a5635..a262c4dbf 100644 --- a/Carpet/CarpetLib/src/timestat.hh +++ b/Carpet/CarpetLib/src/timestat.hh @@ -80,6 +80,8 @@ extern timestat wtime_commstate_sizes_irecv; extern timestat wtime_commstate_waitall_final; extern timestat wtime_commstate_waitall; extern timestat wtime_commstate_waitsome; +extern timestat wtime_commstate_send; +extern timestat wtime_commstate_ssend; extern timestat wtime_commstate_isend; extern timestat wtime_commstate_memcpy; extern timestat wtime_commstate_interpolate_irecv; |