aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortradke <tradke@b32723a9-ab3a-4a60-88e2-2e5d99d7c17a>2001-12-16 20:39:46 +0000
committertradke <tradke@b32723a9-ab3a-4a60-88e2-2e5d99d7c17a>2001-12-16 20:39:46 +0000
commit04dfc890087a068d5769a570fefa7223460543ed (patch)
treee71eee9f8bee86ade7434fbf76f9b2530bab6683
parentbc6291eb873b0261bbcdd43abc9d22d30d774026 (diff)
Added new recovery mode "autoprobe" which is like "auto" but lets the user
continue with the simulation if no checkpoint file was found. Closes PR Cactus/300. git-svn-id: http://svn.cactuscode.org/arrangements/CactusBase/IOUtil/trunk@139 b32723a9-ab3a-4a60-88e2-2e5d99d7c17a
-rw-r--r--doc/documentation.tex18
-rw-r--r--param.ccl8
-rw-r--r--src/CheckpointRecovery.c82
3 files changed, 69 insertions, 39 deletions
diff --git a/doc/documentation.tex b/doc/documentation.tex
index 9be2e93..8a7cb24 100644
--- a/doc/documentation.tex
+++ b/doc/documentation.tex
@@ -285,7 +285,7 @@ parameters. The most important ones are:
boolean flag for checkpointing only initial data
\item {\tt IO::recover}\\
keyword parameter telling if/how to recover.\\
- Choices are {\tt "auto"}, {\tt "manual"}, and {\tt "no"}.
+ Choices are {\tt "no"}, {\tt "manual"}, {\tt "auto"}, and {\tt "autoprobe"}.
\item {\tt IO::recover\_file}\\
filename of the recovery file
\item {\tt IO::recovery\_dir}\\
@@ -300,11 +300,23 @@ Checkpoint filenames consist of a basename (as specified in
plus the file extension indicating the file format ({\tt ".ieee"} for IEEEIO
data from {\tt CactusPUGHIO/IOFlexIO}, or {\tt ".h5"} for HDF5 data from
{\tt CactusPUGHIO/IOHDF5}).\\
+Use the {\tt "manual"} mode to recover from a specific checkpoint file
+by adding the iteration number to the basename parameter.
+
The {\tt "auto"} recovery mode will automatically
recover from the latest checkpoint file found in the recovery directory.
In this case {\tt IO::recover\_file} should contain the basename only.
-Use the {\tt "manual"} mode to recover from a specific checkpoint file
-by adding the iteration number to the basename parameter.\\
+
+The {\tt "autoprobe"} recovery mode is similar to the {\tt "auto"} mode,
+except that it does not stop the code if no checkpoint file is
+found; it only prints a warning message and then continues with the
+simulation. This mode allows you to enable checkpointing and recovery
+in the same parameter file and to reuse that file unchanged to restart your
+simulation. On the other hand, you are now responsible for making the
+checkpoint/recovery directory/file parameters match -- Cactus will not
+detect a mismatch and terminate. Instead, the simulation will then
+always start from initial data without any recovery.\\
+
Because the same I/O methods implement both output of 3D data and
checkpoint files, the same I/O modes are used (see Section~\ref{iomodes}).
Note that the recovery routines in Cactus can process both chunked and
diff --git a/param.ccl b/param.ccl
index 39f2d83..5117187 100644
--- a/param.ccl
+++ b/param.ccl
@@ -105,10 +105,10 @@ BOOLEAN checkpoint_keep_all "Keep all checkpoint files ?" STEERABLE = RECOVER
} "no"
KEYWORD recover "Recover from a checkpoint file ?" STEERABLE = RECOVER
{
- "no" :: "Don't recover"
- "manual" :: "Recover from the checkpoint file given as <recovery_dir>/<recover_file>"
- "yes" :: "Same as \"manual\" (this value is deprecated)"
- "auto" :: "Automatically recover from the latest checkpoint file found in <recovery_dir>"
+ "no" :: "Don't recover"
+ "manual" :: "Recover from the checkpoint file given as <recovery_dir>/<recover_file>"
+ "auto" :: "Automatically recover from the latest checkpoint file found in <recovery_dir>"
+ "autoprobe" :: "Probe for checkpoint files and automatically recover, continue as usual if nothing was found"
} "no"
INT checkpoint_every "Checkpoint every x iterations" STEERABLE = ALWAYS
{
diff --git a/src/CheckpointRecovery.c b/src/CheckpointRecovery.c
index 6cb2582..71e98c1 100644
--- a/src/CheckpointRecovery.c
+++ b/src/CheckpointRecovery.c
@@ -42,6 +42,7 @@ void IOUtil_RecoverIDFromDatafiles (cGH *GH);
/* Local data holding info on Recover Functions */
static cHandledData *RecoverFunctions = NULL;
static int num_functions = 0;
+static int checkpoint_file_exists = 0;
#ifdef HAVE_SCANDIR
/* prefix and extension of potential recovery files */
@@ -330,12 +331,19 @@ void IOUtil_RecoverGH (cGH *GH)
myGH = (ioGH *) CCTK_GHExtension (GH, "IO");
- myGH->recovered = IOUtil_RecoverFromFile (GH, NULL, CP_RECOVER_DATA) >= 0;
+ if (checkpoint_file_exists)
+ {
+ myGH->recovered = IOUtil_RecoverFromFile (GH, NULL, CP_RECOVER_DATA) >= 0;
- /* stop if recovery failed */
- if (! myGH->recovered)
+ /* stop if recovery failed */
+ if (! myGH->recovered)
+ {
+ CCTK_WARN (0, "Failed to restart from recovery !");
+ }
+ }
+ else
{
- CCTK_WARN (0, "Failed to restart from recovery !");
+ myGH->recovered = 0;
}
}
@@ -474,26 +482,34 @@ void IOUtil_RecoverIDFromDatafiles (cGH *GH)
It is called by the IO thorns' parameter recovery routines
scheduled at CCTK_RECOVER_PARAMETERS, and simply calls
the given callback routine with its arguments
- plus a recovery file name.
+ plus a checkpoint filename.
@enddesc
@var recoverFn
@vdesc callback function for recovery of parameters
- from a given recovery file
+ from a given checkpoint file
@vtype int (*) (cGH *, const char *, int)
@vio in
@endvar
@var fileExtension
- @vdesc extension of valid recovery files for given callback
+ @vdesc extension of valid checkpoint files for given callback
@vtype const char *
@vio in
@endvar
@var fileType
- @vdesc string to describe the type of recovery file
+ @vdesc string to describe the type of checkpoint file
(used for warning/info messages)
@vtype const char *
@vio in
@endvar
+
+ @returntype int
+ @returndesc
+ 0 if in "autoprobe" mode and no cp files were found, or<BR>
+ +1 if parameter recovery was successful for some cp file,<BR>
+ -1 if in "auto" mode and no checkpoint files were found,
+ or if parameter recovery failed for some cp file
+ @endreturndesc
@@*/
int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
const char *basefilename,
@@ -501,20 +517,20 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
const char *fileExtension,
const char *fileType)
{
- DECLARE_CCTK_PARAMETERS
- int retval = -1; /* the return value */
+ int retval; /* the return value */
cGH *dummyGH = NULL; /* there's no GH yet but the callback routine
expects a GH pointer */
+#ifdef HAVE_SCANDIR
+ int i, nRecoverFiles;
+ struct dirent **recoverFileList = NULL;
+#endif
+ DECLARE_CCTK_PARAMETERS
- if (CCTK_Equals (recover, "auto"))
+ if (CCTK_Equals (recover, "auto") || CCTK_Equals (recover, "autoprobe"))
{
-
+ retval = CCTK_Equals (recover, "auto") ? -1 : 0;
#ifdef HAVE_SCANDIR
- int i, nRecoverFiles;
- struct dirent **recoverFileList = NULL;
-
-
if (verbose)
{
CCTK_VInfo (CCTK_THORNSTRING, "Searching for %s checkpoint files "
@@ -522,7 +538,7 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
fileType, recover_file, recovery_dir);
}
- /* set the file prefix and extension for selecting valid recovery files */
+ /* set the file prefix and extension for selecting valid checkpoint files */
/* we have to pass it via global variables to the select() routine
because it doesn't receive user-supplied arguments */
recoverFilePrefix = (char *) malloc (strlen (recover_file) + 5);
@@ -536,30 +552,28 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
if (nRecoverFiles <= 0)
{
- CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING,
+ CCTK_VWarn (retval ? 1 : 3, __LINE__, __FILE__, CCTK_THORNSTRING,
"No %s checkpoint files with basefilename '%s' found in "
"recovery directory '%s'",
fileType, recover_file, recovery_dir);
}
-
- /* loop over all recovery files found and call the callback routine
- skip all following files after the first successful recovery */
- for (i = 0; i < nRecoverFiles; i++)
+ else
{
- if (retval < 0)
+ /* loop over all recovery files found and call the callback routine;
+ skip all following files after the first successful recovery (when
+ recoverFn() returned a positive value) */
+ for (i = 0; i < nRecoverFiles; i++)
{
- retval = (*recoverFn) (dummyGH, recoverFileList [i]->d_name,
- CP_RECOVER_PARAMETERS);
+ if (retval <= 0)
+ {
+ retval = recoverFn (dummyGH, recoverFileList[i]->d_name,
+ CP_RECOVER_PARAMETERS);
+ }
+ free (recoverFileList[i]);
}
- free (recoverFileList [i]);
- }
- if (recoverFileList)
- {
free (recoverFileList);
}
-
#else
-
/* no scandir(3) ? give up ! */
CCTK_WARN (0, "You cannot use 'IO::recover = \"auto\"' on "
"this architecture because it doesn't provide scandir(3) to "
@@ -567,7 +581,6 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
"Please use 'IO::recover = \"manual\"' instead !");
#endif
-
}
else
{
@@ -582,6 +595,11 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH,
"with basefilename '%s' in recovery directory '%s'",
fileType, recover_file, recovery_dir);
}
+
+ /* remember parameter recovery status for later evaluation in
+ IOUtil_RecoverGH() */
+ checkpoint_file_exists = retval > 0;
+
return (retval);
}