aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/documentation.tex18
-rw-r--r--param.ccl8
-rw-r--r--src/CheckpointRecovery.c82
3 files changed, 69 insertions, 39 deletions
diff --git a/doc/documentation.tex b/doc/documentation.tex
index 9be2e93..8a7cb24 100644
--- a/doc/documentation.tex
+++ b/doc/documentation.tex
@@ -285,7 +285,7 @@ parameters. The most important ones are:
boolean flag for checkpointing only initial data
\item {\tt IO::recover}\\
keyword parameter telling if/how to recover.\\
- Choices are {\tt "auto"}, {\tt "manual"}, and {\tt "no"}.
+ Choices are {\tt "no"}, {\tt "manual"}, {\tt "auto"}, and {\tt "autoprobe"}.
\item {\tt IO::recover\_file}\\
filename of the recovery file
\item {\tt IO::recovery\_dir}\\
@@ -300,11 +300,23 @@ Checkpoint filenames consist of a basename (as specified in
plus the file extension indicating the file format ({\tt ".ieee"} for IEEEIO
data from {\tt CactusPUGHIO/IOFlexIO}, or {\tt ".h5"} for HDF5 data from
{\tt CactusPUGHIO/IOHDF5}).\\
+Use the {\tt "manual"} mode to recover from a specific checkpoint file
+by adding the iteration number to the basename parameter.
+
The {\tt "auto"} recovery mode will automatically
recover from the latest checkpoint file found in the recovery directory.
In this case {\tt IO::recover\_file} should contain the basename only.
-Use the {\tt "manual"} mode to recover from a specific checkpoint file
-by adding the iteration number to the basename parameter.\\
+
+The {\tt "autoprobe"} recovery mode is similar to the {\tt "auto"} mode
+except that it does not stop the code if no checkpoint file is
+found; it only prints a warning message and then continues with the
+simulation. This mode allows you to enable checkpointing and recovery
+in the same parameter file and to use that file unchanged to restart your
+simulation. On the other hand, you are then responsible for making the
+checkpoint/recovery directory/file parameters match -- Cactus will not
+detect a mismatch and terminate the run. Instead, the simulation will
+always start from initial data without any recovery.\\
+
Because the same I/O methods implement both output of 3D data and
checkpoint files, the same I/O modes are used (see Section~\ref{iomodes}).
Note that the recovery routines in Cactus can process both chunked and
diff --git a/param.ccl b/param.ccl
index 39f2d83..5117187 100644
--- a/param.ccl
+++ b/param.ccl
@@ -105,10 +105,10 @@ BOOLEAN checkpoint_keep_all "Keep all checkpoint files ?" STEERABLE = RECOVER
} "no"
KEYWORD recover "Recover from a checkpoint file ?" STEERABLE = RECOVER
{
- "no" :: "Don't recover"
- "manual" :: "Recover from the checkpoint file given as <recovery_dir>/<recover_file>"
- "yes" :: "Same as \"manual\" (this value is deprecated)"
- "auto" :: "Automatically recover from the latest checkpoint file found in <recovery_dir>"
+ "no" :: "Don't recover"
+ "manual" :: "Recover from the checkpoint file given as <recovery_dir>/<recover_file>"
+ "auto" :: "Automatically recover from the latest checkpoint file found in <recovery_dir>"
+ "autoprobe" :: "Probe for checkpoint files and automatically recover, continue as usual if nothing was found"
} "no"
INT checkpoint_every "Checkpoint every x iterations" STEERABLE = ALWAYS
{
diff --git a/src/CheckpointRecovery.c b/src/CheckpointRecovery.c
index 6cb2582..71e98c1 100644
--- a/src/CheckpointRecovery.c
+++ b/src/CheckpointRecovery.c
@@ -42,6 +42,7 @@ void IOUtil_RecoverIDFromDatafiles (cGH *GH);
/* Local data holding info on Recover Functions */
static cHandledData *RecoverFunctions = NULL;
static int num_functions = 0;
+static int checkpoint_file_exists = 0;
#ifdef HAVE_SCANDIR
/* prefix and extension of potential recovery files */
@@ -330,12 +331,19 @@ void IOUtil_RecoverGH (cGH *GH)
myGH = (ioGH *) CCTK_GHExtension (GH, "IO");
- myGH->recovered = IOUtil_RecoverFromFile (GH, NULL, CP_RECOVER_DATA) >= 0;
+ if (checkpoint_file_exists)
+ {
+ myGH->recovered = IOUtil_RecoverFromFile (GH, NULL, CP_RECOVER_DATA) >= 0;
- /* stop if recovery failed */
- if (! myGH->recovered)
+ /* stop if recovery failed */
+ if (! myGH->recovered)
+ {
+ CCTK_WARN (0, "Failed to restart from recovery !");
+ }
+ }
+ else
{
- CCTK_WARN (0, "Failed to restart from recovery !");
+ myGH->recovered = 0;
}
}
@@ -474,26 +482,34 @@ void IOUtil_RecoverIDFromDatafiles (cGH *GH)
It is called by the IO thorns' parameter recovery routines
scheduled at CCTK_RECOVER_PARAMETERS, and simply calls
the given callback routine with its arguments
- plus a recovery file name.
+ plus a checkpoint filename.
@enddesc
@var recoverFn
@vdesc callback function for recovery of parameters
- from a given recovery file
+ from a given checkpoint file
@vtype int (*) (cGH *, const char *, int)
@vio in
@endvar
@var fileExtension
- @vdesc extension of valid recovery files for given callback
+ @vdesc extension of valid checkpoint files for given callback
@vtype const char *
@vio in
@endvar
@var fileType
- @vdesc string to describe the type of recovery file
+ @vdesc string to describe the type of checkpoint file
(used for warning/info messages)
@vtype const char *
@vio in
@endvar
+
+ @returntype int
+ @returndesc
+ 0 if in "autoprobe" mode and no checkpoint files were found, or<BR>
+ +1 if parameter recovery was successful for some checkpoint file,<BR>
+ -1 if in "auto" mode and no checkpoint files were found,
+ or if parameter recovery failed for some checkpoint file
+ @endreturndesc
@@*/
int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
const char *basefilename,
@@ -501,20 +517,20 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
const char *fileExtension,
const char *fileType)
{
- DECLARE_CCTK_PARAMETERS
- int retval = -1; /* the return value */
+ int retval; /* the return value */
cGH *dummyGH = NULL; /* there's no GH yet but the callback routine
expects a GH pointer */
+#ifdef HAVE_SCANDIR
+ int i, nRecoverFiles;
+ struct dirent **recoverFileList = NULL;
+#endif
+ DECLARE_CCTK_PARAMETERS
- if (CCTK_Equals (recover, "auto"))
+ if (CCTK_Equals (recover, "auto") || CCTK_Equals (recover, "autoprobe"))
{
-
+ retval = CCTK_Equals (recover, "auto") ? -1 : 0;
#ifdef HAVE_SCANDIR
- int i, nRecoverFiles;
- struct dirent **recoverFileList = NULL;
-
-
if (verbose)
{
CCTK_VInfo (CCTK_THORNSTRING, "Searching for %s checkpoint files "
@@ -522,7 +538,7 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
fileType, recover_file, recovery_dir);
}
- /* set the file prefix and extension for selecting valid recovery files */
+ /* set the file prefix and extension for selecting valid checkpoint files */
/* we have to pass it via global variables to the select() routine
because it doesn't receive user-supplied arguments */
recoverFilePrefix = (char *) malloc (strlen (recover_file) + 5);
@@ -536,30 +552,28 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
if (nRecoverFiles <= 0)
{
- CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING,
+ CCTK_VWarn (retval ? 1 : 3, __LINE__, __FILE__, CCTK_THORNSTRING,
"No %s checkpoint files with basefilename '%s' found in "
"recovery directory '%s'",
fileType, recover_file, recovery_dir);
}
-
- /* loop over all recovery files found and call the callback routine
- skip all following files after the first successful recovery */
- for (i = 0; i < nRecoverFiles; i++)
+ else
{
- if (retval < 0)
+ /* loop over all recovery files found and call the callback routine;
+ skip all following files after the first successful recovery (when
+ recoverFn() returned a positive value) */
+ for (i = 0; i < nRecoverFiles; i++)
{
- retval = (*recoverFn) (dummyGH, recoverFileList [i]->d_name,
- CP_RECOVER_PARAMETERS);
+ if (retval <= 0)
+ {
+ retval = recoverFn (dummyGH, recoverFileList[i]->d_name,
+ CP_RECOVER_PARAMETERS);
+ }
+ free (recoverFileList[i]);
}
- free (recoverFileList [i]);
- }
- if (recoverFileList)
- {
free (recoverFileList);
}
-
#else
-
/* no scandir(3) ? give up ! */
CCTK_WARN (0, "You cannot use 'IO::recover = \"auto\"' on "
"this architecture because it doesn't provide scandir(3) to "
@@ -567,7 +581,6 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
"Please use 'IO::recover = \"manual\"' instead !");
#endif
-
}
else
{
@@ -582,6 +595,11 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
"with basefilename '%s' in recovery directory '%s'",
fileType, recover_file, recovery_dir);
}
+
+ /* remember parameter recovery status for later evaluation in
+ IOUtil_RecoverGH() */
+ checkpoint_file_exists = retval > 0;
+
return (retval);
}