/*@@ @file RecoverGH.c @date Fri Jun 19 09:14:22 1998 @author Tom Goodale @desc Contains the routines to do the internal checkpoint recovery. Currently can recover from: (1) One file containing recombined data (2) Multiple unrecombined files, where the current number of processors and outputing processors match those used to write the data. @enddesc @history @hauthor Gabrielle Allen @hdate 19 Oct 1998 @hdesc Changed names ready for thorn_IO @endhistory @version $Id$ @@*/ static char *rcsid = "$Id$"; #include #include "cctk.h" #include "cctk_Flesh.h" #include "cctk_parameters.h" #include "cctk_GHExtensions.h" #include "cctk_WarnLevel.h" #include "cctk_Comm.h" #include "CactusPUGH/PUGH/src/include/pugh.h" #include "CactusBase/IOUtil/src/ioGH.h" #include "ioFlexGH.h" /* this one comes from RestoreFile.c */ int IOFlexIO_RestoreIEEEIOfile (cGH *GH, IOFile ifp, int IOrecover_ioproc, int IOrecover_ioproc_every, int IOrecover_unchunked); /* Local function prototypes */ int IOFlexIO_restoreParams (IOFile ifp, int myproc); /*@@ @routine IOFlexIO_RecoverGH @date Fri Jun 19 09:22:52 1998 @author Tom Goodale @desc Recovers a GH. @enddesc @calls IOUtil_PrepareFilename IOFlexIO_RestoreIEEEIOfile @calledby @history @hauthor Gabrielle Allen @hdate Thu Jul 2 18:17:59 1998 @hdesc Restore the physical time and iteration count, pass myproc to IEEEIOparamRestore Derives the filename from IOUtil_PrepareFilename (in thorn IOUtil) @hauthor Gabrielle Allen @hdate Oct 17 1998 @hdesc Added input of (some) GH structure variables @endhistory @@*/ int IOFlexIO_RecoverGH (cGH *GH, const char *basename, int called_from) { DECLARE_CCTK_PARAMETERS IOFile ifp; char fname [1024]; int proc, nprocs, myproc; int index; int is_IEEEIO_file; Long nels_stored; int nt_stored; int main_loop_index_stored; int cctk_iteration_stored; CCTK_REAL cctk_time_stored; int file_ioproc, file_ioproc_every; int file_nprocs, file_unchunked; CCTK_INT4 tmpInt; CCTK_REAL tmpDouble; char msg [512]; pGH *pughGH; ioGH *ioUtilGH; flexioGH *myGH; #ifdef MPI CCTK_INT info [3]; #endif /* just to make the compiler happy */ ifp = (IOFile) -1; is_IEEEIO_file = 0; file_ioproc_every = 0; file_unchunked = 0; /* Get the handles for PUGH, IOUtil, and IOFlexIO extensions */ pughGH = (pGH *) GH->extensions [CCTK_GHExtensionHandle ("PUGH")]; ioUtilGH = (ioGH *) GH->extensions [CCTK_GHExtensionHandle ("IO")]; myGH = (flexioGH *) GH->extensions [CCTK_GHExtensionHandle ("IOFlexIO")]; /* identify myself */ nprocs = CCTK_nProcs (GH); myproc = CCTK_MyProc (GH); /* start the total timer */ if (print_timing_info) CCTK_TimerStartI (myGH->recoverTotalTimer); /* Examine base file to find whether recovering from * one (recombined) file or from multiple files */ if (myproc == 0) { /* Determine name of base file NOTE: As we don't know whether the file is chunked or not we need to try both file names. */ /* at first try with current chunking mode */ file_unchunked = ioUtilGH->unchunked; IOUtil_PrepareFilename(GH, basename, fname, called_from, 0, file_unchunked); if (called_from == CP_RECOVER_DATA) strcat (fname, ".chkpt"); strcat (fname, ".ieee"); if (verbose) printf ("Opening file %s\n", fname); /* Open file, make sure the file is valid */ ifp = IEEEopen (fname, "r"); if (IOisValid (ifp)) is_IEEEIO_file = 1; else { if (verbose) printf ("Cannot open file '%s'\n", fname); /* now try with the other chunking mode */ file_unchunked = ! ioUtilGH->unchunked; IOUtil_PrepareFilename (GH, basename, fname, called_from, 0, file_unchunked); if (called_from == CP_RECOVER_DATA) strcat (fname, ".chkpt"); strcat (fname, ".ieee"); if (verbose) printf ("Trying now file '%s'...\n", fname); /* Open file, make sure the file is valid */ ifp = IEEEopen (fname, "r"); is_IEEEIO_file = IOisValid (ifp); } } if (myproc == 0 && is_IEEEIO_file) { /* Now determine how the data was written */ /* Read nioprocs used to write data */ index = IOreadAttributeInfo (ifp, "GH$ioproc_every", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (ifp, index, &tmpInt); file_ioproc_every = tmpInt; } else { CCTK_WARN (1, "Unable to restore GH$ioproc_every. " "Assuming it is nprocs and continuing"); file_ioproc_every = nprocs; } /* Read nprocs used to write data */ index = IOreadAttributeInfo (ifp, "GH$nprocs", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (ifp, index, &tmpInt); file_nprocs = tmpInt; } else { CCTK_WARN (1, "Unable to restore GH$nprocs. " "Assuming it is 1 and continuing"); file_nprocs = 1; } /* Determine whether data is chunked or unchunked We could derive this from the filename itself but just to be sure ... */ index = IOreadAttributeInfo (ifp, "unchunked", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (ifp, index, &tmpInt); file_unchunked = tmpInt; } else { sprintf (msg, "Unable to restore 'unchunked' attribute. " "Assuming it is %s and continuing", file_unchunked ? "true" : "false"); CCTK_WARN (1, msg); } /* If we restore from multiple files * the number of processors must match. */ if (file_ioproc_every == nprocs || file_unchunked) { if (verbose) printf ("Recovering from one %s file\n", file_unchunked ? "unchunked" : "chunked"); file_ioproc_every = nprocs; } else { if (file_nprocs != nprocs) { sprintf (msg, "Must restart on %d processors with multiple files " "or recombine them", file_nprocs); CCTK_WARN (0, msg); } if (verbose) printf ("Recovering from %d chunked files\n", nprocs / file_ioproc_every + (nprocs % file_ioproc_every?1:0)); } } #ifdef MPI /* Broadcast chunking mode to all processors from processor zero */ info [0] = is_IEEEIO_file; info [1] = file_unchunked; info [2] = file_ioproc_every; CACTUS_MPI_ERROR (MPI_Bcast (info, 3, PUGH_MPI_INT, 0, pughGH->PUGH_COMM_WORLD)); is_IEEEIO_file = info [0]; /* FIXME: Parameter on LHS */ file_unchunked = info [1]; file_ioproc_every = info [2]; #endif /* return here to IOUtil if no valid file could be found */ if (! is_IEEEIO_file) { if (myproc == 0) { sprintf (msg, "No valid IEEEIO file '%s' found !", fname); CCTK_WARN (2, msg); } return (-1); } /* Determine the IO processors for each node and the corresponding checkpoint file */ file_ioproc = myproc - (myproc % file_ioproc_every); IOUtil_PrepareFilename (GH, basename, fname, called_from, file_ioproc/file_ioproc_every, file_unchunked); if (called_from == CP_RECOVER_DATA) strcat (fname, ".chkpt"); strcat (fname, ".ieee"); /* Open chunked files on other IO processors */ if (myproc != 0 && myproc == file_ioproc) { if (verbose) printf ("Opening chunked file '%s' on processor %d.\n", fname, myproc); /* Open file, make sure the file is valid */ ifp = IEEEopen (fname, "r"); if (! IOisValid (ifp)) { sprintf (msg, "Cannot open file '%s' on processor %d", fname, myproc); CCTK_WARN (0, msg); } } /* Restore the data */ if (verbose && myproc == 0) printf ("Recovering %schunked data with ioproc %d, ioproc_every %d.\n", file_unchunked ? "un" : "", file_ioproc, file_ioproc_every); /* start timer for restoring variables */ if (print_timing_info) { CCTK_TimerResetI (myGH->recoverVarsTimer); CCTK_TimerStartI (myGH->recoverVarsTimer); } /* restore all variables from checkpoint file */ IOFlexIO_RestoreIEEEIOfile (GH, ifp, file_ioproc, file_ioproc_every, file_unchunked); /* stop variable timer */ if (print_timing_info) CCTK_TimerStopI (myGH->recoverVarsTimer); /* Close the file. */ if (myproc == file_ioproc) { if (verbose) printf ("Closing file '%s' after recovery.\n", fname); IOclose (ifp); } if (called_from == CP_RECOVER_DATA) { /* Must read in parameters and scalars on all processors. */ for (proc = file_ioproc; proc < file_ioproc+file_ioproc_every && proc < nprocs; proc++) { /* Only have the file open by one proc at any time. */ if (proc == myproc) { /* start timer for restoring parameters */ if (print_timing_info) { CCTK_TimerResetI (myGH->recoverParamsTimer); CCTK_TimerStartI (myGH->recoverParamsTimer); } /* Open file, make sure the file is valid */ ifp = IEEEopen (fname, "r"); if (! IOisValid (ifp)) { sprintf (msg, "Cannot open checkpoint file '%s' on processor %d", fname, myproc); CCTK_WARN (0, msg); } /* Restore the parameters. */ if (verbose) printf ("Recovering parameters on processor %d.\n", myproc); IOFlexIO_restoreParams (ifp, myproc); /* Restore global variables */ /* Get the main loop index. */ if (verbose) printf ("Recovering main loop index.\n"); index = IOreadAttributeInfo (ifp, "main loop index", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (ifp, index, &tmpInt); main_loop_index_stored = (int) tmpInt; if (verbose) printf ("main loop index is %d\n", (int) main_loop_index_stored); CCTK_SetMainLoopIndex (main_loop_index_stored); } else printf ("*Warning* Unable to restore main loop index\n"); /* Get the iteration number. */ if (verbose) printf ("Recovering GH->cctk_iteration.\n"); index = IOreadAttributeInfo (ifp, "GH$iteration", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (ifp, index, &tmpInt); cctk_iteration_stored = (int) tmpInt; if (verbose) printf ("cctk_iteration is %d\n", (int) cctk_iteration_stored); GH->cctk_iteration = cctk_iteration_stored; } else printf ("*Warning* Unable to restore GH->cctk_iteration\n"); /* Get cctk_time. */ if (verbose) printf ("Recovering GH->cctk_time.\n"); index = IOreadAttributeInfo (ifp, "GH$time", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_REAL && nels_stored == 1) { IOreadAttribute (ifp, index, &tmpDouble); cctk_time_stored = (double) tmpDouble; if (verbose) printf ("GH->cctk_time is %f\n", cctk_time_stored); GH->cctk_time = cctk_time_stored; } else printf ("*Warning* Unable to restore GH->cctk_time\n"); /* Close the file. */ if (verbose) printf ("Closing '%s' after recovery.\n", fname); IOclose (ifp); /* stop timer for restoring parameters */ if (print_timing_info) CCTK_TimerStopI (myGH->recoverParamsTimer); } /* Synchronise all processors */ CCTK_Barrier (GH); } } /* stop total recovery timer and print timing info */ if (print_timing_info && called_from == CP_RECOVER_DATA) { t_TimerInfo *info; CCTK_TimerStopI (myGH->recoverTotalTimer); printf ("%s timing information for recovery:\n", CCTK_THORNSTRING); /*** FIXME: select timer to get info from at runtime ***/ #ifdef HAVE_TIME_GETTIMEOFDAY info = CCTK_TimerCreateInfo (); if (info) { if (myGH->recoverVarsTimer >= 0) { CCTK_TimerGetI (myGH->recoverVarsTimer, info); printf ("Time to recover datasets: %5.1f sec\n", info->vals[0].val.d); } if (myGH->recoverParamsTimer >= 0) { CCTK_TimerGetI (myGH->recoverParamsTimer, info); printf ("Time to recover parameters: %5.1f sec\n", info->vals[0].val.d); } if (myGH->recoverTotalTimer >= 0) { CCTK_TimerGetI (myGH->recoverTotalTimer, info); printf ("Total time to recover: %5.1f sec\n", info->vals[0].val.d); } CCTK_TimerDestroyInfo (info); } else CCTK_WARN (1, "Couldn't create timer info structure ! No timing output " "available."); #else CCTK_WARN (1, "No GETTIMEOFDAY timer available."); #endif printf ("----------------------------------------------------------------" "---------\n"); } return (0); } int IOFlexIO_restoreParams (IOFile ifp, int myproc) { CCTK_INFO ("Called IOFlexIO_restoreParams()\n"); return (0); }