diff options
Diffstat (limited to 'src/RecoverGH.c')
-rw-r--r-- | src/RecoverGH.c | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/src/RecoverGH.c b/src/RecoverGH.c new file mode 100644 index 0000000..0391b30 --- /dev/null +++ b/src/RecoverGH.c @@ -0,0 +1,343 @@ + /*@@ + @file RecoverGH.c + @date Fri Jun 19 09:14:22 1998 + @author Tom Goodale + @desc + Contains the routines to do the internal checkpoint recovery. + + Currently can recover from: + (1) One file containing recombined data + (2) Multiple unrecombined files, where the current + number of processors and outputing processors + match those used to write the data. + @enddesc + @history + @hauthor Gabrielle Allen @hdate 19 Oct 1998 + @hdesc Changed names ready for thorn_IO + @endhistory + @version $Id$ + @@*/ + +static char *rcsid = "$Id$"; + +#include <stdio.h> + +#include "cctk.h" +#include "flesh.h" +#include "declare_parameters.h" +#include "GHExtensions.h" +#include "WarnLevel.h" +#include "Comm.h" +#ifdef CACTUSBASE_PUGH +#include "CactusBase/pugh/src/include/pugh.h" +#endif +#include "CactusBase/IOUtil/src/ioGH.h" +#include "ioFlexGH.h" + + +/* this one comes from RestoreFile.c */ +int IOFlexIO_RestoreIEEEIOfile (cGH *GH, IOFile ifp, + int IOrecover_ioproc, + int IOrecover_ioproc_every, + int IOrecover_unchunked); + + + /*@@ + @routine IOFlexIO_RecoverGH + @date Fri Jun 19 09:22:52 1998 + @author Tom Goodale + @desc + Recovers a GH. + @enddesc + @calls IOUtil_PrepareFilename IOFlexIO_RestoreIEEEIOfile + @calledby + @history + @hauthor Gabrielle Allen + @hdate Thu Jul 2 18:17:59 1998 + @hdesc Restore the physical time and iteration count, pass myproc to + IEEEIOparamRestore + Derives the filename from IOUtil_PrepareFilename (in thorn IOUtil) + @hauthor Gabrielle Allen @hdate Oct 17 1998 + @hdesc Added input of (some) GH structure variables + @endhistory + +@@*/ + +int IOFlexIO_RecoverGH (cGH *GH, const char *basename, int called_from) +{ +#ifdef CACTUSBASE_PUGH + + DECLARE_PARAMETERS + IOFile ifp; + char ftmp [1024], fname [1024]; + int proc, nprocs, myproc; + int index; + int is_IEEEIO_file; + Long nels_stored; + int nt_stored; + int iteration_stored; + int file_ioproc, file_ioproc_every; + int file_nprocs, file_unchunked; + CCTK_INT4 tmpInt; + char msg [512]; + pGH *pughGH; + ioGH *ioUtilGH; + cTimer total_time, dataset_time, param_time; +#ifdef MPI + CCTK_INT info [3]; +#endif + + + /* Get the handles for PUGH and IOUtil extensions */ + pughGH = (pGH *) GH->extensions [CCTK_GetGHExtensionHandle ("PUGH")]; + ioUtilGH = (ioGH *) GH->extensions [CCTK_GetGHExtensionHandle ("IO")]; + + /* identify myself */ + nprocs = CCTK_GetnProcs (GH); + myproc = CCTK_GetMyProc (GH); + + /* initialize timers */ + CactusResetTimer (&total_time); + CactusResetTimer (&dataset_time); + CactusResetTimer (¶m_time); + + CactusStartTimer (&total_time); + + /* Examine base file to find whether recovering from + * one (recombined) file or from multiple files + */ + + if (myproc == 0) { + + /* Determine name of base file + NOTE: As we don't know whether the file is chunked or not + we need to try both file names. */ + /* at first try with current chunking mode */ + file_unchunked = ioUtilGH->unchunked; + IOUtil_PrepareFilename(GH, basename, fname, called_from, 0, file_unchunked); + if (called_from == CP_RECOVER_DATA) + strcat (fname, ".chkpt"); + strcat (fname, ".ieee"); + + if (IO_verbose) + printf ("Opening file %s\n", fname); + + /* Open file, make sure the file is valid */ + ifp = IEEEopen (fname, "r"); + if (IOisValid (ifp)) + is_IEEEIO_file = 1; + else { + if (IO_verbose) + printf ("Cannot open file '%s'\n", fname); + + /* now try with the other chunking mode */ + file_unchunked = ! ioUtilGH->unchunked; + IOUtil_PrepareFilename (GH, basename, fname, called_from, + 0, file_unchunked); + if (called_from == CP_RECOVER_DATA) + strcat (fname, ".chkpt"); + strcat (fname, ".ieee"); + if (IO_verbose) + printf ("Trying now file '%s'...\n", fname); + + /* Open file, make sure the file is valid */ + ifp = IEEEopen (fname, "r"); + is_IEEEIO_file = IOisValid (ifp); + } + } + + if (myproc == 0 && is_IEEEIO_file) { + /* Now determine how the data was written */ + + /* Read nioprocs used to write data */ + index = IOreadAttributeInfo (ifp, "GH$ioproc_every", &nt_stored, &nels_stored); + if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { + IOreadAttribute (ifp, index, &tmpInt); + file_ioproc_every = tmpInt; + } else { + CCTK_WARN (1, "Unable to restore GH$ioproc_every. " + "Assuming it is nprocs and continuing"); + file_ioproc_every = nprocs; + } + + /* Read nprocs used to write data */ + index = IOreadAttributeInfo (ifp, "GH$nprocs", &nt_stored, &nels_stored); + if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { + IOreadAttribute (ifp, index, &tmpInt); + file_nprocs = tmpInt; + } else { + CCTK_WARN (1, "Unable to restore GH$nprocs. " + "Assuming it is 1 and continuing"); + file_nprocs = 1; + } + + /* Determine whether data is chunked or unchunked + We could derive this from the filename itself but just to be sure ... */ + index = IOreadAttributeInfo (ifp, "unchunked", &nt_stored, &nels_stored); + if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { + IOreadAttribute (ifp, index, &tmpInt); + file_unchunked = tmpInt; + } else { + sprintf (msg, "Unable to restore 'unchunked' attribute. " + "Assuming it is %s and continuing", + file_unchunked ? "true" : "false"); + CCTK_WARN (1, msg); + } + + /* If we restore from multiple files + * the number of processors must match. + */ + + if (file_ioproc_every == nprocs || file_unchunked) { + if (IO_verbose) + printf ("Recovering from one %s file\n", + file_unchunked ? "unchunked" : "chunked"); + } else { + if (file_nprocs != nprocs) { + sprintf (msg, "Must restart on %d processors with multiple files " + "or recombine them", file_nprocs); + CCTK_WARN (0, msg); + } + if (IO_verbose) + printf ("Recovering from %d chunked files\n", + nprocs / file_ioproc_every + (nprocs % file_ioproc_every?1:0)); + } + } + +#ifdef MPI + /* Broadcast chunking mode to all processors from processor zero */ + info [0] = is_IEEEIO_file; + info [1] = unchunked; + info [2] = file_ioproc_every; + CACTUS_MPI_ERROR (MPI_Bcast (info, 3, PUGH_MPI_INT, 0, + pughGH->PUGH_COMM_WORLD)); + is_IEEEIO_file = info [0]; + unchunked = info [1]; + file_ioproc_every = info [2]; +#endif + + /* return here to IOUtil if no valid file could be found */ + if (! is_IEEEIO_file) { + if (myproc == 0) { + sprintf (msg, "No valid IEEEIO file '%s' found !", fname); + CCTK_WARN (2, msg); + } + return (-1); + } + + /* Determine the IO processors for each node and the corresponding + checkpoint file */ + file_ioproc = myproc - (myproc % file_ioproc_every); + IOUtil_PrepareFilename (GH, basename, fname, called_from, + file_ioproc/file_ioproc_every, file_unchunked); + if (called_from == CP_RECOVER_DATA) + strcat (fname, ".chkpt"); + strcat (fname, ".ieee"); + + /* Open chunked files on other IO processors */ + if (myproc != 0 && myproc == file_ioproc) { + + if (IO_verbose) + printf ("Opening chunked file '%s' on processor %d.\n", + fname, myproc); + + /* Open file, make sure the file is valid */ + ifp = IEEEopen (fname, "r"); + if (! IOisValid (ifp)) { + sprintf (msg, "Cannot open file '%s' on processor %d", + fname, myproc); + CCTK_WARN (0, msg); + } + } + + /* Restore the data */ + if (IO_verbose && myproc == 0) + printf ("Recovering %schunked data with ioproc %d, ioproc_every %d.\n", + file_unchunked ? "un" : "", file_ioproc, file_ioproc_every); + + CactusStartTimer (&dataset_time); + IOFlexIO_RestoreIEEEIOfile (GH, ifp, file_ioproc, file_ioproc_every, + file_unchunked); + CactusStopTimer (&dataset_time); + + /* Close the file. */ + if (myproc == file_ioproc) { + if (IO_verbose) + printf ("Closing file '%s' after recovery.\n", fname); + IOclose (ifp); + } + + if (called_from == CP_RECOVER_DATA) { + /* Must read in parameters and scalars on all processors. */ + CactusStartTimer (¶m_time); + for (proc = file_ioproc; + proc < file_ioproc+file_ioproc_every && proc < nprocs; + proc++) { + + /* Only have the file open by one proc at any time. */ + if (proc == myproc) { + + /* Open file, make sure the file is valid */ + ifp = IEEEopen (fname, "r"); + if (! IOisValid (ifp)) { + sprintf (msg, "Cannot open checkpoint file '%s' on processor %d", + fname, myproc); + CCTK_WARN (0, msg); + } + + /* Restore the parameters. */ + if (IO_verbose) + printf ("Recovering parameters on processor %d.\n", myproc); +/*** FIXME ***/ +#if 0 + IO_IEEEIOparamRestore (ifp, myproc); + + /* Restore the structure variables. */ + if (IO_verbose) + printf ("Recovering GH variables.\n"); + IO_IEEEIOStructRestore (GH, ifp); +#endif + + /* Restore global variables */ + + /* Get the iteration number. */ + if (IO_verbose) + printf ("Recovering iteration number.\n"); + index = IOreadAttributeInfo (ifp, "iteration", &nt_stored, &nels_stored); + + if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { + IOreadAttribute (ifp, index, &iteration_stored); + if (IO_verbose) + printf ("Iteration number is %d\n", (int) iteration_stored); + GH->iteration = iteration_stored; + } else + printf ("*Warning* Unable to restore iteration number\n"); + + /* Close the file. */ + if (IO_verbose) + printf ("Closing '%s' after recovery.\n", fname); + IOclose (ifp); + } + + /* Synchronise all processors */ + CCTK_Barrier (GH); + } + CactusStopTimer (¶m_time); + } + + /* print timing output */ + if (IO_verbose && called_from == CP_RECOVER_DATA && myproc == 0) { + printf ( + "----------------------------------------------------------------\n"); +/*** FIXME: choose right component of basic[] ***/ + printf ("Time to restore data: %10.3lf sec\n", + dataset_time.total.basic [0]); + printf ("Time to restore parameters: %10.3lf sec\n", + param_time.total.basic [0]); + printf ("Time to recover from checkpoint: %10.3lf sec\n", + total_time.total.basic [0]); + } + + return (0); +#endif /* CACTUSBASE_PUGH */ +} |