aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author eschnett <eschnett@d0051148-8e13-4bef-be1d-f6c572c85f9f> 2010-05-12 18:43:21 +0000
committer eschnett <eschnett@d0051148-8e13-4bef-be1d-f6c572c85f9f> 2010-05-12 18:43:21 +0000
commit f2c4311f68fbc4ed034edbc8854983c5c7344fe1 (patch)
tree 56709ec899194488a8266397d042b87a2aac1aed
parent 6fa9d6e5a2bcd4ad4e81540d7be2cbba18d77de8 (diff)
TimerReport contains a function CollectTimerInfo which performs a
reduction of timer values across processes. Currently, this function can
fail with an error from MPI_ReduceAll if the number of timers is different
on the different processes. TimerReport assumes that the timers are the
same on all processes. This is not something which is enforced by Cactus,
but it is usually true. This patch adds a check that the number of timers
is the same, and prints a warning and disables output of the corresponding
timer files if it is not, rather than aborting with an MPI error.

(Patch from Ian Hinder)

git-svn-id: http://svn.cactuscode.org/arrangements/CactusUtils/TimerReport/trunk@30 d0051148-8e13-4bef-be1d-f6c572c85f9f
-rw-r--r-- src/Output.c 45
1 files changed, 37 insertions, 8 deletions
diff --git a/src/Output.c b/src/Output.c
index e58c196..5de203d 100644
--- a/src/Output.c
+++ b/src/Output.c
@@ -48,8 +48,8 @@ static void OutputAllTimersTogether (CCTK_ARGUMENTS);
static void OutputAllTimersReadable (CCTK_ARGUMENTS);
static void PrintTopTimers (CCTK_ARGUMENTS);
-static void CollectTimerInfo (cGH const * restrict const cctkGH,
- struct timer_stats * restrict const timers);
+static int CollectTimerInfo (cGH const * restrict const cctkGH,
+ struct timer_stats * restrict const timers);
static char *QuoteForCSV (const char*);
static char *QuoteForTSV (const char*);
@@ -332,7 +332,8 @@ static void OutputAllTimersTogether (CCTK_ARGUMENTS)
DECLARE_CCTK_PARAMETERS;
struct timer_stats timers;
- CollectTimerInfo (cctkGH, &timers);
+ if (!CollectTimerInfo (cctkGH, &timers))
+ return;
if (CCTK_MyProc(cctkGH) == 0)
{
@@ -452,8 +453,8 @@ static void OutputAllTimersReadable (CCTK_ARGUMENTS)
DECLARE_CCTK_PARAMETERS;
struct timer_stats timers;
- CollectTimerInfo (cctkGH, &timers);
-
+ if (!CollectTimerInfo (cctkGH, &timers))
+ return;
if (CCTK_MyProc(cctkGH) == 0)
{
@@ -613,11 +614,29 @@ static void PrintTopTimers (CCTK_ARGUMENTS)
return;
}
+static int integer_same_on_all_procs(cGH const * restrict const cctkGH, const CCTK_INT i)
+{
+ /* There is no "equals" reduction operator, so we check that
+ * minimum and maximum are the same */
+ const int reduce_min = CCTK_ReductionArrayHandle ("minimum");
+ const int reduce_max = CCTK_ReductionArrayHandle ("maximum");
+ CCTK_INT min_i = 0;
+ CCTK_INT max_i = 0;
+
+ if (CCTK_ReduceLocScalar(cctkGH, -1 /* All processors */, reduce_min,
+ &i, &min_i, CCTK_VARIABLE_INT))
+ CCTK_WARN (CCTK_WARN_ABORT, "Error in calling min reduction operator");
+
+ if (CCTK_ReduceLocScalar(cctkGH, -1 /* All processors */, reduce_max,
+ &i, &max_i, CCTK_VARIABLE_INT))
+ CCTK_WARN (CCTK_WARN_ABORT, "Error in calling max reduction operator");
+ return min_i == max_i;
+}
/* Collect timer information onto the root processor */
-static void CollectTimerInfo (cGH const * restrict const cctkGH,
- struct timer_stats * restrict const timers)
+static int CollectTimerInfo (cGH const * restrict const cctkGH,
+ struct timer_stats * restrict const timers)
{
DECLARE_CCTK_PARAMETERS;
@@ -625,7 +644,16 @@ static void CollectTimerInfo (cGH const * restrict const cctkGH,
timers->ntimers = CCTK_NumTimers();
assert (timers->ntimers >= 0);
-
+
+ /* Check that the number of timers is consistent across processors */
+ if (!integer_same_on_all_procs(cctkGH, timers->ntimers))
+ {
+ CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING,
+ "Number of timers is inconsistent across processes; cannot collect timer information. Number of timers on processor %d: %d",
+ CCTK_MyProc(cctkGH), timers->ntimers);
+ return 0;
+ }
+
timers->secs_local = malloc(timers->ntimers * sizeof *timers->secs_local);
assert (timers->ntimers==0 || timers->secs_local);
if (CCTK_MyProc(cctkGH) == 0)
@@ -685,6 +713,7 @@ static void CollectTimerInfo (cGH const * restrict const cctkGH,
CCTK_WARN (CCTK_WARN_ABORT,
"Error in calling reduction operators");
}
+ return 1;
}