aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Carpet/CarpetLib/param.ccl32
-rw-r--r--Carpet/CarpetLib/src/commstate.cc144
-rw-r--r--Carpet/CarpetLib/src/gdata.cc40
-rw-r--r--Carpet/CarpetLib/src/timestat.cc4
-rw-r--r--Carpet/CarpetLib/src/timestat.hh2
5 files changed, 190 insertions, 32 deletions
diff --git a/Carpet/CarpetLib/param.ccl b/Carpet/CarpetLib/param.ccl
index 2292280c6..8d736ed3f 100644
--- a/Carpet/CarpetLib/param.ccl
+++ b/Carpet/CarpetLib/param.ccl
@@ -82,6 +82,38 @@ STRING memstat_file "File name in which memstat output is collected (because std
+# Experimental communication parameters
+
+BOOLEAN interleave_communications "Try to interleave communications with each other; each processor begins to communicate with its 'right neighbour' in rank, instead of with the root processor" STEERABLE=always
+{
+} "no"
+
+BOOLEAN vary_tags "Use different tags for each communication" STEERABLE=always
+{
+} "no"
+
+BOOLEAN barrier_between_stages "Add a barrier between the communication stages (slows down, but may make timing numbers easier to interpret)" STEERABLE=always
+{
+} "no"
+
+BOOLEAN combine_sends "Send data together and in order of processor ranks" STEERABLE=always
+{
+} "no"
+
+BOOLEAN reduce_mpi_waitall "Call MPI_Waitall only for requests that are not null" STEERABLE=always
+{
+} "no"
+
+BOOLEAN use_mpi_send "Use MPI_Send instead of MPI_Isend" STEERABLE=always
+{
+} "no"
+
+BOOLEAN use_mpi_ssend "Use MPI_Ssend instead of MPI_Isend" STEERABLE=always
+{
+} "no"
+
+
+
SHARES: IO
USES STRING out_dir
diff --git a/Carpet/CarpetLib/src/commstate.cc b/Carpet/CarpetLib/src/commstate.cc
index 7a5caca3b..0ac7b21de 100644
--- a/Carpet/CarpetLib/src/commstate.cc
+++ b/Carpet/CarpetLib/src/commstate.cc
@@ -1,3 +1,4 @@
+#include <cassert>
#include <cstring>
#include "cctk.h"
@@ -81,15 +82,18 @@ void comm_state::step ()
// (a clever MPI layer may take advantage of such early posting).
num_posted_recvs = num_completed_recvs = 0;
- for (size_t type = 0; type < typebufs.size(); type++) {
-
- // skip unused datatype buffers
- if (not typebufs[type].in_use) {
- continue;
- }
+ for (int proc1 = 0; proc1 < dist::size(); ++ proc1) {
+ size_t const proc =
+ interleave_communications
+ ? (proc1 + dist::rank()) % dist::size()
+ : proc1;
+
+ for (size_t type = 0; type < typebufs.size(); type++) {
+
+ // skip unused datatype buffers
+ if (not typebufs[type].in_use) continue;
- int& datatypesize = typebufs[type].datatypesize;
- for (size_t proc = 0; proc < typebufs[type].procbufs.size(); proc++) {
+ int datatypesize = typebufs[type].datatypesize;
procbufdesc& procbuf = typebufs[type].procbufs[proc];
procbuf.sendbufbase = new char[procbuf.sendbufsize*datatypesize];
@@ -105,8 +109,12 @@ void comm_state::step ()
if (procbuf.recvbufsize > 0) {
wtime_commstate_sizes_irecv.start();
+ int const tag =
+ vary_tags
+ ? (dist::rank() + dist::size() * (proc + dist::size() * type)) % 32768
+ : type;
MPI_Irecv (procbuf.recvbufbase, procbuf.recvbufsize,
- typebufs[type].mpi_datatype, proc, type,
+ typebufs[type].mpi_datatype, proc, tag,
dist::comm(), &rrequests[dist::size()*type + proc]);
wtime_commstate_sizes_irecv.stop();
num_posted_recvs++;
@@ -114,6 +122,13 @@ void comm_state::step ()
}
}
+ if (barrier_between_stages) {
+ // Add a barrier, to try to ensure that all Irecvs are posted
+ // before the first Isends are made
+ // (Alternative: Use MPI_Alltoallv instead)
+ MPI_Barrier (dist::comm());
+ }
+
// Now go and get the send buffers filled with data.
// Once a buffer is full it will be posted right away
// (see gdata::copy_into_sendbuffer() and
@@ -122,6 +137,61 @@ void comm_state::step ()
break;
case state_fill_send_buffers:
+ if (combine_sends) {
+ // Send the data. Do not send them sequentially, but try to
+ // intersperse the communications
+ for (int proc1 = 0; proc1 < dist::size(); ++ proc1) {
+ int const proc =
+ interleave_communications
+ ? (proc1 + dist::size() - dist::rank()) % dist::size()
+ : proc1;
+
+ for (size_t type = 0; type < typebufs.size(); type++) {
+ // skip unused datatype buffers
+ if (not typebufs[type].in_use) continue;
+
+ int const datatypesize = typebufs[type].datatypesize;
+ procbufdesc const & procbuf = typebufs[type].procbufs[proc];
+
+ int const fillstate = procbuf.sendbuf - procbuf.sendbufbase;
+ assert (fillstate == (int)procbuf.sendbufsize * datatypesize);
+
+ if (procbuf.sendbufsize > 0) {
+ int const tag =
+ vary_tags
+ ? (proc + dist::size() * (dist::rank() + dist::size() * type)) % 32768
+ : type;
+ if (use_mpi_send) {
+ // use MPI_Send
+ wtime_commstate_send.start();
+ MPI_Send (procbuf.sendbufbase, procbuf.sendbufsize,
+ typebufs[type].mpi_datatype, proc, tag,
+ dist::comm());
+ srequests[dist::size()*type + proc] = MPI_REQUEST_NULL;
+ wtime_commstate_send.stop(procbuf.sendbufsize * datatypesize);
+ } else if (use_mpi_ssend) {
+ // use MPI_Ssend
+ wtime_commstate_ssend.start();
+ MPI_Ssend (procbuf.sendbufbase, procbuf.sendbufsize,
+ typebufs[type].mpi_datatype, proc, tag,
+ dist::comm());
+ srequests[dist::size()*type + proc] = MPI_REQUEST_NULL;
+ wtime_commstate_ssend.stop(procbuf.sendbufsize * datatypesize);
+ } else {
+ // use MPI_Isend
+ wtime_commstate_isend.start();
+ MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize,
+ typebufs[type].mpi_datatype, proc, tag,
+ dist::comm(), &srequests[dist::size()*type + proc]);
+ wtime_commstate_isend.stop(procbuf.sendbufsize * datatypesize);
+ }
+ }
+
+ } // for type
+
+ } // for proc
+ }
+
// Now fall through to the next state in which the recv buffers
// are emptied as soon as data has arrived.
thestate = state_empty_recv_buffers;
@@ -180,9 +250,30 @@ bool comm_state::AllPostedCommunicationsFinished ()
// check if all outstanding receives have been completed already
if (num_posted_recvs == num_completed_recvs) {
// finalize the outstanding sends in one go
- wtime_commstate_waitall_final.start();
- MPI_Waitall (srequests.size(), &srequests.front(), MPI_STATUSES_IGNORE);
- wtime_commstate_waitall_final.stop();
+ if (reduce_mpi_waitall) {
+ size_t nreqs = 0;
+ for (size_t i=0; i<srequests.size(); ++i) {
+ if (srequests.at(i) != MPI_REQUEST_NULL) {
+ ++nreqs;
+ }
+ }
+ vector<MPI_Request> reqs(nreqs);
+ nreqs = 0;
+ for (size_t i=0; i<srequests.size(); ++i) {
+ if (srequests.at(i) != MPI_REQUEST_NULL) {
+ reqs.at(nreqs) = srequests.at(i);
+ ++nreqs;
+ }
+ }
+ assert (nreqs == reqs.size());
+ wtime_commstate_waitall_final.start();
+ MPI_Waitall (reqs.size(), &reqs.front(), MPI_STATUSES_IGNORE);
+ wtime_commstate_waitall_final.stop();
+ } else {
+ wtime_commstate_waitall_final.start();
+ MPI_Waitall (srequests.size(), &srequests.front(), MPI_STATUSES_IGNORE);
+ wtime_commstate_waitall_final.stop();
+ }
return true;
}
@@ -193,13 +284,34 @@ bool comm_state::AllPostedCommunicationsFinished ()
if (use_waitall) {
// mark all posted recveive buffers as ready
for (size_t i = 0; i < recvbuffers_ready.size(); i++) {
- recvbuffers_ready[i] = rrequests[i] != MPI_REQUEST_NULL;
+ recvbuffers_ready.at(i) = rrequests.at(i) != MPI_REQUEST_NULL;
}
// wait for completion of all posted receive operations
- wtime_commstate_waitall.start();
- MPI_Waitall (rrequests.size(), &rrequests.front(), MPI_STATUSES_IGNORE);
- wtime_commstate_waitall.stop();
+ if (reduce_mpi_waitall) {
+ size_t nreqs = 0;
+ for (size_t i=0; i<rrequests.size(); ++i) {
+ if (rrequests.at(i) != MPI_REQUEST_NULL) {
+ ++nreqs;
+ }
+ }
+ vector<MPI_Request> reqs(nreqs);
+ nreqs = 0;
+ for (size_t i=0; i<rrequests.size(); ++i) {
+ if (rrequests.at(i) != MPI_REQUEST_NULL) {
+ reqs.at(nreqs) = rrequests.at(i);
+ ++nreqs;
+ }
+ }
+ assert (nreqs == reqs.size());
+ wtime_commstate_waitall.start();
+ MPI_Waitall (reqs.size(), &reqs.front(), MPI_STATUSES_IGNORE);
+ wtime_commstate_waitall.stop();
+ } else {
+ wtime_commstate_waitall.start();
+ MPI_Waitall (rrequests.size(), &rrequests.front(), MPI_STATUSES_IGNORE);
+ wtime_commstate_waitall.stop();
+ }
num_completed_recvs = num_posted_recvs;
} else {
int num_completed_recvs_ = 0;
diff --git a/Carpet/CarpetLib/src/gdata.cc b/Carpet/CarpetLib/src/gdata.cc
index d31d917d2..f275ef02a 100644
--- a/Carpet/CarpetLib/src/gdata.cc
+++ b/Carpet/CarpetLib/src/gdata.cc
@@ -283,6 +283,8 @@ void gdata::copy_from_wait (comm_state& state,
void gdata::copy_into_sendbuffer (comm_state& state,
const gdata* src, const ibbox& box)
{
+ DECLARE_CCTK_PARAMETERS;
+
if (proc() == src->proc()) {
// copy on same processor
copy_from_innerloop (src, box);
@@ -315,14 +317,16 @@ void gdata::copy_into_sendbuffer (comm_state& state,
}
}
- // post the send if the buffer is full
- if (fillstate == (int)procbuf.sendbufsize * datatypesize) {
- wtime_commstate_isend.start();
- MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize,
- state.typebufs.at(c_datatype()).mpi_datatype,
- proc(), c_datatype(), dist::comm(),
- &state.srequests.at(dist::size()*c_datatype() + proc()));
- wtime_commstate_isend.stop(procbuf.sendbufsize * datatypesize);
+ if (not combine_sends) {
+ // post the send if the buffer is full
+ if (fillstate == (int)procbuf.sendbufsize * datatypesize) {
+ wtime_commstate_isend.start();
+ MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize,
+ state.typebufs.at(c_datatype()).mpi_datatype,
+ proc(), c_datatype(), dist::comm(),
+ &state.srequests.at(dist::size()*c_datatype() + proc()));
+ wtime_commstate_isend.stop(procbuf.sendbufsize * datatypesize);
+ }
}
}
}
@@ -506,6 +510,8 @@ void gdata
const int order_space,
const int order_time)
{
+ DECLARE_CCTK_PARAMETERS;
+
if (proc() == srcs.at(0)->proc()) {
// interpolate on same processor
interpolate_from_innerloop (srcs, times, box, time,
@@ -534,14 +540,16 @@ void gdata
// advance send buffer to point to the next ibbox slot
procbuf.sendbuf += datatypesize * box.size();
- // post the send if the buffer is full
- if (fillstate == (int)procbuf.sendbufsize*datatypesize) {
- wtime_commstate_interpolate_to_isend.start();
- MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize,
- state.typebufs.at(c_datatype()).mpi_datatype,
- proc(), c_datatype(), dist::comm(),
- &state.srequests.at(dist::size()*c_datatype() + proc()));
- wtime_commstate_interpolate_to_isend.stop(procbuf.sendbufsize*datatypesize);
+ if (not combine_sends) {
+ // post the send if the buffer is full
+ if (fillstate == (int)procbuf.sendbufsize*datatypesize) {
+ wtime_commstate_interpolate_to_isend.start();
+ MPI_Isend (procbuf.sendbufbase, procbuf.sendbufsize,
+ state.typebufs.at(c_datatype()).mpi_datatype,
+ proc(), c_datatype(), dist::comm(),
+ &state.srequests.at(dist::size()*c_datatype() + proc()));
+ wtime_commstate_interpolate_to_isend.stop(procbuf.sendbufsize*datatypesize);
+ }
}
}
}
diff --git a/Carpet/CarpetLib/src/timestat.cc b/Carpet/CarpetLib/src/timestat.cc
index b126471be..893c5664d 100644
--- a/Carpet/CarpetLib/src/timestat.cc
+++ b/Carpet/CarpetLib/src/timestat.cc
@@ -130,6 +130,8 @@ timestat wtime_commstate_sizes_irecv;
timestat wtime_commstate_waitall_final;
timestat wtime_commstate_waitall;
timestat wtime_commstate_waitsome;
+timestat wtime_commstate_send;
+timestat wtime_commstate_ssend;
timestat wtime_commstate_isend;
timestat wtime_commstate_memcpy;
timestat wtime_commstate_interpolate_irecv;
@@ -243,6 +245,8 @@ void CarpetLib_printtimestats (CCTK_ARGUMENTS)
<< " wtime_commstate_waitall_final: " << wtime_commstate_waitall_final << endl
<< " wtime_commstate_waitall: " << wtime_commstate_waitall << endl
<< " wtime_commstate_waitsome: " << wtime_commstate_waitsome << endl
+ << " wtime_commstate_send: " << wtime_commstate_send << endl
+ << " wtime_commstate_ssend: " << wtime_commstate_ssend << endl
<< " wtime_commstate_isend: " << wtime_commstate_isend << endl
<< " wtime_commstate_memcpy: " << wtime_commstate_memcpy << endl
<< " wtime_commstate_interpolate_irecv: " << wtime_commstate_interpolate_irecv << endl
diff --git a/Carpet/CarpetLib/src/timestat.hh b/Carpet/CarpetLib/src/timestat.hh
index 9e22a5635..a262c4dbf 100644
--- a/Carpet/CarpetLib/src/timestat.hh
+++ b/Carpet/CarpetLib/src/timestat.hh
@@ -80,6 +80,8 @@ extern timestat wtime_commstate_sizes_irecv;
extern timestat wtime_commstate_waitall_final;
extern timestat wtime_commstate_waitall;
extern timestat wtime_commstate_waitsome;
+extern timestat wtime_commstate_send;
+extern timestat wtime_commstate_ssend;
extern timestat wtime_commstate_isend;
extern timestat wtime_commstate_memcpy;
extern timestat wtime_commstate_interpolate_irecv;