diff options
author | eschnett <eschnett@d0051148-8e13-4bef-be1d-f6c572c85f9f> | 2010-05-12 18:43:21 +0000 |
---|---|---|
committer | eschnett <eschnett@d0051148-8e13-4bef-be1d-f6c572c85f9f> | 2010-05-12 18:43:21 +0000 |
commit | f2c4311f68fbc4ed034edbc8854983c5c7344fe1 (patch) | |
tree | 56709ec899194488a8266397d042b87a2aac1aed | |
parent | 6fa9d6e5a2bcd4ad4e81540d7be2cbba18d77de8 (diff) |
TimerReport contains a function CollectTimerInfo which performs a
reduction of timer values across processes. Currently, this function
can fail with an error from MPI_ReduceAll if the number of timers
differs between processes, because TimerReport assumes that the
timers are the same on all processes. Cactus does not enforce this,
although it is usually true. This patch adds a check that the number
of timers is the same on all processes; if it is not, TimerReport
prints a warning and skips output of the corresponding timer files
rather than aborting with an MPI error.
(Patch from Ian Hinder)
git-svn-id: http://svn.cactuscode.org/arrangements/CactusUtils/TimerReport/trunk@30 d0051148-8e13-4bef-be1d-f6c572c85f9f
-rw-r--r-- | src/Output.c | 45 |
1 file changed, 37 insertions, 8 deletions
diff --git a/src/Output.c b/src/Output.c index e58c196..5de203d 100644 --- a/src/Output.c +++ b/src/Output.c @@ -48,8 +48,8 @@ static void OutputAllTimersTogether (CCTK_ARGUMENTS); static void OutputAllTimersReadable (CCTK_ARGUMENTS); static void PrintTopTimers (CCTK_ARGUMENTS); -static void CollectTimerInfo (cGH const * restrict const cctkGH, - struct timer_stats * restrict const timers); +static int CollectTimerInfo (cGH const * restrict const cctkGH, + struct timer_stats * restrict const timers); static char *QuoteForCSV (const char*); static char *QuoteForTSV (const char*); @@ -332,7 +332,8 @@ static void OutputAllTimersTogether (CCTK_ARGUMENTS) DECLARE_CCTK_PARAMETERS; struct timer_stats timers; - CollectTimerInfo (cctkGH, &timers); + if (!CollectTimerInfo (cctkGH, &timers)) + return; if (CCTK_MyProc(cctkGH) == 0) { @@ -452,8 +453,8 @@ static void OutputAllTimersReadable (CCTK_ARGUMENTS) DECLARE_CCTK_PARAMETERS; struct timer_stats timers; - CollectTimerInfo (cctkGH, &timers); - + if (!CollectTimerInfo (cctkGH, &timers)) + return; if (CCTK_MyProc(cctkGH) == 0) { @@ -613,11 +614,29 @@ static void PrintTopTimers (CCTK_ARGUMENTS) return; } +static int integer_same_on_all_procs(cGH const * restrict const cctkGH, const CCTK_INT i) +{ + /* There is no "equals" reduction operator, so we check that + * minimum and maximum are the same */ + const int reduce_min = CCTK_ReductionArrayHandle ("minimum"); + const int reduce_max = CCTK_ReductionArrayHandle ("maximum"); + CCTK_INT min_i = 0; + CCTK_INT max_i = 0; + + if (CCTK_ReduceLocScalar(cctkGH, -1 /* All processors */, reduce_min, + &i, &min_i, CCTK_VARIABLE_INT)) + CCTK_WARN (CCTK_WARN_ABORT, "Error in calling min reduction operator"); + + if (CCTK_ReduceLocScalar(cctkGH, -1 /* All processors */, reduce_max, + &i, &max_i, CCTK_VARIABLE_INT)) + CCTK_WARN (CCTK_WARN_ABORT, "Error in calling max reduction operator"); + return min_i == max_i; +} /* Collect timer information onto the root processor */ -static void 
CollectTimerInfo (cGH const * restrict const cctkGH, - struct timer_stats * restrict const timers) +static int CollectTimerInfo (cGH const * restrict const cctkGH, + struct timer_stats * restrict const timers) { DECLARE_CCTK_PARAMETERS; @@ -625,7 +644,16 @@ static void CollectTimerInfo (cGH const * restrict const cctkGH, timers->ntimers = CCTK_NumTimers(); assert (timers->ntimers >= 0); - + + /* Check that the number of timers is consistent across processors */ + if (!integer_same_on_all_procs(cctkGH, timers->ntimers)) + { + CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, + "Number of timers is inconsistent across processes; cannot collect timer information. Number of timers on processor %d: %d", + CCTK_MyProc(cctkGH), timers->ntimers); + return 0; + } + timers->secs_local = malloc(timers->ntimers * sizeof *timers->secs_local); assert (timers->ntimers==0 || timers->secs_local); if (CCTK_MyProc(cctkGH) == 0) @@ -685,6 +713,7 @@ static void CollectTimerInfo (cGH const * restrict const cctkGH, CCTK_WARN (CCTK_WARN_ABORT, "Error in calling reduction operators"); } + return 1; } |