diff options
author | tradke <tradke@b32723a9-ab3a-4a60-88e2-2e5d99d7c17a> | 2001-12-16 20:39:46 +0000 |
---|---|---|
committer | tradke <tradke@b32723a9-ab3a-4a60-88e2-2e5d99d7c17a> | 2001-12-16 20:39:46 +0000 |
commit | 04dfc890087a068d5769a570fefa7223460543ed (patch) | |
tree | e71eee9f8bee86ade7434fbf76f9b2530bab6683 | |
parent | bc6291eb873b0261bbcdd43abc9d22d30d774026 (diff) |
Added a new recovery mode "autoprobe", which behaves like "auto" but lets the
simulation continue (starting from initial data) if no checkpoint file is found.
Closes PR Cactus/300.
git-svn-id: http://svn.cactuscode.org/arrangements/CactusBase/IOUtil/trunk@139 b32723a9-ab3a-4a60-88e2-2e5d99d7c17a
-rw-r--r-- | doc/documentation.tex | 18 |
-rw-r--r-- | param.ccl | 8 |
-rw-r--r-- | src/CheckpointRecovery.c | 82 |
3 files changed, 69 insertions, 39 deletions
diff --git a/doc/documentation.tex b/doc/documentation.tex index 9be2e93..8a7cb24 100644 --- a/doc/documentation.tex +++ b/doc/documentation.tex @@ -285,7 +285,7 @@ parameters. The most important ones are: boolean flag for checkpointing only initial data \item {\tt IO::recover}\\ keyword parameter telling if/how to recover.\\ - Choices are {\tt "auto"}, {\tt "manual"}, and {\tt "no"}. + Choices are {\tt "no"}, {\tt "manual"}, {\tt "auto"}, and {\tt "autoprobe"}. \item {\tt IO::recover\_file}\\ filename of the recovery file \item {\tt IO::recovery\_dir}\\ @@ -300,11 +300,23 @@ Checkpoint filenames consist of a basename (as specified in plus the file extension indicating the file format ({\tt ".ieee"} for IEEEIO data from {\tt CactusPUGHIO/IOFlexIO}, or {\tt ".h5} for HDF5 data from {\tt CactusPUGHIO/IOHDF5}).\\ +Use the {\tt "manual"} mode to recover from a specific checkpoint file +by adding the iteration number to the basename parameter. + The {\tt "auto"} recovery mode will automatically recover from the latest checkpoint file found in the recovery directory. In this case {\tt IO::recover\_file} should contain the basename only. -Use the {\tt "manual"} mode to recover from a specific checkpoint file -by adding the iteration number to the basename parameter.\\ + +The {\tt "autoprobe"} recovery mode is similar to the {\tt "auto"} mode +except that it would not stop the code if no checkpoint file was +found but only print a warning message and then continue with the +simulation. This mode allows you to enable checkpointing and recovery +in the same parameter file and use that without any changes to restart your +simulation. On the other hand, you are responsible now for making the +checkpoint/recovery directory/file parameters match -- a mismatch will not +be detected by Cactus in order to terminate it. 
Instead the simulation would +always start from initial data without any recovery.\\ + Because the same I/O methods implement both output of 3D data and checkpoint files, the same I/O modes are used (see Section~\ref{iomodes}). Note that the recovery routines in Cactus can process both chunked and @@ -105,10 +105,10 @@ BOOLEAN checkpoint_keep_all "Keep all checkpoint files ?" STEERABLE = RECOVER } "no" KEYWORD recover "Recover from a checkpoint file ?" STEERABLE = RECOVER { - "no" :: "Don't recover" - "manual" :: "Recover from the checkpoint file given as <recovery_dir>/<recover_file>" - "yes" :: "Same as \"manual\" (this value is deprecated)" - "auto" :: "Automatically recover from the latest checkpoint file found in <recovery_dir>" + "no" :: "Don't recover" + "manual" :: "Recover from the checkpoint file given as <recovery_dir>/<recover_file>" + "auto" :: "Automatically recover from the latest checkpoint file found in <recovery_dir>" + "autoprobe" :: "Probe for checkpoint files and automatically recover, continue as usual if nothing was found" } "no" INT checkpoint_every "Checkpoint every x iterations" STEERABLE = ALWAYS { diff --git a/src/CheckpointRecovery.c b/src/CheckpointRecovery.c index 6cb2582..71e98c1 100644 --- a/src/CheckpointRecovery.c +++ b/src/CheckpointRecovery.c @@ -42,6 +42,7 @@ void IOUtil_RecoverIDFromDatafiles (cGH *GH); /* Local data holding info on Recover Functions */ static cHandledData *RecoverFunctions = NULL; static int num_functions = 0; +static int checkpoint_file_exists = 0; #ifdef HAVE_SCANDIR /* prefix and extension of potential recovery files */ @@ -330,12 +331,19 @@ void IOUtil_RecoverGH (cGH *GH) myGH = (ioGH *) CCTK_GHExtension (GH, "IO"); - myGH->recovered = IOUtil_RecoverFromFile (GH, NULL, CP_RECOVER_DATA) >= 0; + if (checkpoint_file_exists) + { + myGH->recovered = IOUtil_RecoverFromFile (GH, NULL, CP_RECOVER_DATA) >= 0; - /* stop if recovery failed */ - if (! myGH->recovered) + /* stop if recovery failed */ + if (! 
myGH->recovered) + { + CCTK_WARN (0, "Failed to restart from recovery !"); + } + } + else { - CCTK_WARN (0, "Failed to restart from recovery !"); + myGH->recovered = 0; } } @@ -474,26 +482,34 @@ void IOUtil_RecoverIDFromDatafiles (cGH *GH) It is called by the IO thorns' parameter recovery routines scheduled at CCTK_RECOVER_PARAMETERS, and simply calls the given callback routine with its arguments - plus a recovery file name. + plus a checkpoint filename. @enddesc @var recoverFn @vdesc callback function for recovery of parameters - from a given recovery file + from a given checkpoint file @vtype int (*) (cGH *, const char *, int) @vio in @endvar @var fileExtension - @vdesc extension of valid recovery files for given callback + @vdesc extension of valid checkpoint files for given callback @vtype const char * @vio in @endvar @var fileType - @vdesc string to describe the type of recovery file + @vdesc string to describe the type of checkpoint file (used for warning/info messages) @vtype const char * @vio in @endvar + + @returntype int + @returndesc + 0 if in "autoprobe" mode and no cp files were found, or<BR> + +1 if parameter recovery was successful for some cp file,<BR> + -1 if in "auto" mode and no checkpoint files were found, + or if parameter recovery failed for some cp file + @endreturndesc @@*/ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH, const char *basefilename, @@ -501,20 +517,20 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH, const char *fileExtension, const char *fileType) { - DECLARE_CCTK_PARAMETERS - int retval = -1; /* the return value */ + int retval; /* the return value */ cGH *dummyGH = NULL; /* there's no GH yet but the callback routine expects a GH pointer */ +#ifdef HAVE_SCANDIR + int i, nRecoverFiles; + struct dirent **recoverFileList = NULL; +#endif + DECLARE_CCTK_PARAMETERS - if (CCTK_Equals (recover, "auto")) + if (CCTK_Equals (recover, "auto") || CCTK_Equals (recover, "autoprobe")) { - + retval = CCTK_Equals (recover, 
"auto") ? -1 : 0; #ifdef HAVE_SCANDIR - int i, nRecoverFiles; - struct dirent **recoverFileList = NULL; - - if (verbose) { CCTK_VInfo (CCTK_THORNSTRING, "Searching for %s checkpoint files " @@ -522,7 +538,7 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH, fileType, recover_file, recovery_dir); } - /* set the file prefix and extension for selecting valid recovery files */ + /* set the file prefix and extension for selecting valid checkpoint files */ /* we have to pass it via global variables to the select() routine because it doesn't receive user-supplied arguments */ recoverFilePrefix = (char *) malloc (strlen (recover_file) + 5); @@ -536,30 +552,28 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH, if (nRecoverFiles <= 0) { - CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, + CCTK_VWarn (retval ? 1 : 3, __LINE__, __FILE__, CCTK_THORNSTRING, "No %s checkpoint files with basefilename '%s' found in " "recovery directory '%s'", fileType, recover_file, recovery_dir); } - - /* loop over all recovery files found and call the callback routine - skip all following files after the first successful recovery */ - for (i = 0; i < nRecoverFiles; i++) + else { - if (retval < 0) + /* loop over all recovery files found and call the callback routine; + skip all following files after the first successful recovery (when + recoverFn() returned a positive value) */ + for (i = 0; i < nRecoverFiles; i++) { - retval = (*recoverFn) (dummyGH, recoverFileList [i]->d_name, - CP_RECOVER_PARAMETERS); + if (retval <= 0) + { + retval = recoverFn (dummyGH, recoverFileList[i]->d_name, + CP_RECOVER_PARAMETERS); + } + free (recoverFileList[i]); } - free (recoverFileList [i]); - } - if (recoverFileList) - { free (recoverFileList); } - #else - /* no scandir(3) ? give up ! 
*/ CCTK_WARN (0, "You cannot use 'IO::recover = \"auto\"' on " "this architecture because it doesn't provide scandir(3) to " @@ -567,7 +581,6 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH, "Please use 'IO::recover = \"manual\"' instead !"); #endif - } else { @@ -582,6 +595,11 @@ int IOUtil_RecoverParameters (int (*recoverFn) (cGH *GH, "with basefilename '%s' in recovery directory '%s'", fileType, recover_file, recovery_dir); } + + /* remember parameter recovery status for later evaluation in + IOUtil_RecoverGH() */ + checkpoint_file_exists = retval > 0; + return (retval); } |