/*@@ @file RecoverGH.c @date Fri Jun 19 09:14:22 1998 @author Tom Goodale @desc Contains the routines to do the internal checkpoint recovery. Currently can recover from: (1) One file containing recombined data (2) Multiple unrecombined files, where the current number of processors and outputing processors match those used to write the data. @enddesc @history @hauthor Gabrielle Allen @hdate 19 Oct 1998 @hdesc Changed names ready for thorn_IO @endhistory @version $Id$ @@*/ #include #include #include #include "cctk.h" #include "cctk_Parameters.h" #include "CactusPUGH/PUGH/src/include/pugh.h" #include "CactusBase/IOUtil/src/ioGH.h" #include "ioFlexGH.h" /* the rcs ID and its dummy funtion to use it */ static char *rcsid = "$Id$"; CCTK_FILEVERSION (CactusPUGHIO_IOFlexIO_RecoverGH_c); /* maximum length of an attribute name */ #define MAX_ATTRNAME_LEN 256 /* local function prototypes */ static int IOFlexIOi_RecoverParameters (fileinfo_t *file); static int IOFlexIOi_RecoverGHextensions (cGH *GH, fileinfo_t *file); static int IOFlexIOi_OpenFile (cGH *GH, const char *basename, int called_from, fileinfo_t *file); /* this one comes from RestoreFile.c */ int IOFlexIOi_RecoverVariables (cGH *GH, fileinfo_t *file); /* check for scandir(3) which is used to evaluate IO::recover = "auto" */ #ifdef HAVE_SCANDIR #include /* prefix for selecting checkpoint files */ static char *CPfilePrefix; static int IOFlexIOi_CPfileSelect (struct dirent *entry); static int IOFlexIOi_CPfileCompare (struct dirent **a, struct dirent **b); #endif /*@@ @routine IOFlexIO_Recover @date Fri Jun 19 09:22:52 1998 @author Tom Goodale @desc Recovers a GH from an IEEEIO file. This routine is registered with IOUtil as IOFlexIO's recovery routine. @enddesc @calledby IOUtil_RecoverFromFile @history @endhistory @var GH @vdesc Pointer to CCTK grid hierarchy @vtype cGH @vio in @endvar @var basename @vdesc the basename of the file to recover from The file suffix is appended by the routine. @vtype const char * @vio in @endvar @var called_from @vdesc flag indicating where this routine was called from (either RECOVER or filereader) @vtype int @vio in @endvar @history @hauthor Gabrielle Allen @hdate Thu Jul 2 18:17:59 1998 @hdesc Restore the physical time and iteration count, pass myproc to IEEEIOparamRestore Derives the filename from IOUtil_PrepareFilename (in thorn IOUtil) @hauthor Gabrielle Allen @hdate Oct 17 1998 @hdesc Added input of (some) GH structure variables @endhistory @@*/ int IOFlexIO_Recover (cGH *GH, const char *basename, int called_from) { DECLARE_CCTK_PARAMETERS int result; flexioGH *myGH; static fileinfo_t file; /* this is static because info is passed from CP_RECOVERY_PARAMETERS to CP_RECOVERY_DATA */ /* to make the compiler happy */ myGH = NULL; /* start the total timer */ if (GH && print_timing_info) { myGH = (flexioGH *) GH->extensions [CCTK_GHExtensionHandle ("IOFlexIO")]; CCTK_TimerStartI (myGH->recoverTotalTimer); } /* open the file if it wasn't already opened at CCTK_RECOVER_PARAMETERS */ /* FIXME Gab ... asymmetric levfac */ if (called_from == CP_RECOVER_PARAMETERS || called_from == FILEREADER_DATA || (GH && (GH->cctk_levfac[0] > 1 || GH->cctk_convlevel > 0))) { if (IOFlexIOi_OpenFile (GH, basename, called_from, &file) < 0) return (-1); } else { /* This is the case for CP_RECOVER_DATA. CCTK_RECOVER_PARAMETERS must have been called before and set up the file info structure. */ if (! file.is_IEEEIO_file) return (-1); } /* Recover parameters (and return) */ if (called_from == CP_RECOVER_PARAMETERS) return (IOFlexIOi_RecoverParameters (&file)); /* Recover variables */ if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Recovering %schunked data with ioproc %d, " "ioproc_every %d", file.unchunked ? "un" : "", file.ioproc, file.ioproc_every); result = IOFlexIOi_RecoverVariables (GH, &file); /* Recover GH extensions */ if (result == 0 && called_from == CP_RECOVER_DATA) { if (verbose) CCTK_INFO ("Recovering GH extensions"); result = IOFlexIOi_RecoverGHextensions (GH, &file); } /* Close the file */ if (CCTK_MyProc (GH) == file.ioproc) { if (verbose) { if (called_from == CP_RECOVER_DATA) CCTK_VInfo (CCTK_THORNSTRING, "Closing checkpoint file '%s' after " "recovery", file.filename); else CCTK_VInfo (CCTK_THORNSTRING, "Closing data file '%s'", file.filename); } CACTUS_IEEEIO_ERROR (IOclose (file.fid)); } /* stop total recovery timer and print timing info */ if (print_timing_info && called_from == CP_RECOVER_DATA) { cTimerData *info; CCTK_TimerStopI (myGH->recoverTotalTimer); CCTK_INFO ("timing information for recovery:"); /*** FIXME: select timer to get info from at runtime ***/ #ifdef HAVE_TIME_GETTIMEOFDAY info = CCTK_TimerCreateData (); if (info) { if (myGH->recoverTotalTimer >= 0) { CCTK_TimerI (myGH->recoverTotalTimer, info); CCTK_VInfo (CCTK_THORNSTRING, "Time to recover: %5.1f sec", info->vals[0].val.d); } CCTK_TimerDestroyData (info); } else CCTK_WARN (1, "Couldn't create timer info structure ! No timing output " "available."); #else CCTK_WARN (1, "No GETTIMEOFDAY timer available."); #endif CCTK_INFO ("--------------------------------------------------------------" "-----------"); } return (result); } /*@@ @routine IOFlexIO_RecoverParameters @date Thu Apr 13 2000 @author Thomas Radke @desc Recovers the parameters from an IEEEIO checkpoint file. This routine is registered at CCTK_RECOVER_PARAMETERS. Note that it doesn't get passed any parameters, not even a GH. This is not yet created at CCTK_RECOVER_PARAMETERS. @enddesc @calledby flesh's scheduler @history @endhistory @@*/ int IOFlexIO_RecoverParameters (void) { DECLARE_CCTK_PARAMETERS int retval; cGH *dummyGH = NULL; retval = -1; if (CCTK_Equals (recover, "auto")) { #ifdef HAVE_SCANDIR int i, nCPfiles; struct dirent **CPfilelist = NULL; /* set the CP file prefix for selecting file names */ /* we have to pass it in a global variable because the select() routine doesn't get user-supplied arguments */ CPfilePrefix = (char *) malloc (strlen (recover_file) + 5); sprintf (CPfilePrefix, "%s.it_", recover_file); nCPfiles = scandir (recovery_dir, &CPfilelist, IOFlexIOi_CPfileSelect, IOFlexIOi_CPfileCompare); for (i = 0; i < nCPfiles; i++) { if (retval < 0) retval = IOFlexIO_Recover (dummyGH, CPfilelist [i]->d_name, CP_RECOVER_PARAMETERS); free (CPfilelist [i]); } if (CPfilelist) free (CPfilelist); #else /* no scandir(3) ? give up ! */ CCTK_WARN (0, "You cannot use 'IO::recover = \"auto\"' on " "this architecture because it doesn't provide scandir(3) to " "automatically look for checkpoint files.\n" "Please use 'IO::recover = \"manual\"' instead !"); #endif } else retval = IOFlexIO_Recover (dummyGH, NULL, CP_RECOVER_PARAMETERS); return (retval); } /**************************** local routines ******************************/ static int IOFlexIOi_OpenFile (cGH *GH, const char *basename, int called_from, fileinfo_t *file) { DECLARE_CCTK_PARAMETERS int index; int nprocs, myproc; Long nels_stored; int nt_stored; CCTK_INT4 tmpInt; #ifdef CCTK_MPI MPI_Comm comm; CCTK_INT4 info [3]; #endif #ifdef CCTK_MPI /* Get the communicator for broadcasting the info structure */ /* NOTE: When recovering parameters thorn PUGH is not yet initialized so that we have to use MPI_COMM_WORLD in this case */ comm = CCTK_GHExtensionHandle ("PUGH") < 0 ? MPI_COMM_WORLD : pugh_pGH (GH)->PUGH_COMM_WORLD; #endif /* identify myself */ nprocs = CCTK_nProcs (GH); myproc = CCTK_MyProc (GH); /* Examine basefile to find out whether we are recovering from * one or multiple files and whether the data are chunked or not. * * This is done by processor 0 only since this is always an IO processor * and a corresponding file must exist in all cases. */ /* Determine name of base file NOTE: As we don't know whether the file is chunked or not we need to try both file names. */ /* at first try with current chunking mode */ file->unchunked = out3D_unchunked; IOUtil_PrepareFilename (GH, basename, file->filename, called_from, 0, file->unchunked); if (called_from == CP_RECOVER_DATA || called_from == CP_RECOVER_PARAMETERS) strcat (file->filename, ".chkpt"); strcat (file->filename, ".ieee"); if (myproc == 0) { if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Opening file '%s'", file->filename); /* Open file, make sure the file is valid */ file->fid = IEEEopen (file->filename, "r"); if (IOisValid (file->fid)) file->is_IEEEIO_file = 1; else { if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "No valid IEEEIO file '%s' found, " "trying other file", file->filename); /* now try with the other chunking mode */ file->unchunked = ! out3D_unchunked; IOUtil_PrepareFilename (GH, basename, file->filename, called_from, 0, file->unchunked); if (called_from == CP_RECOVER_DATA || called_from ==CP_RECOVER_PARAMETERS) strcat (file->filename, ".chkpt"); strcat (file->filename, ".ieee"); if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Trying now file '%s'...",file->filename); /* Open file, make sure the file is valid */ file->fid = IEEEopen (file->filename, "r"); file->is_IEEEIO_file = IOisValid (file->fid); } } /* Okay, we have the complete filename. Let's read the file now. */ if (myproc == 0 && file->is_IEEEIO_file) { file->is_IEEEIO_file = 0; /* Determine how the data was written by reading the GH extensions */ index = IOreadAttributeInfo (file->fid, "GH$ioproc_every", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (file->fid, index, &tmpInt); file->ioproc_every = tmpInt; } else { CCTK_WARN (1, "Unable to restore GH$ioproc_every. " "Assuming it is nprocs and continuing"); file->ioproc_every = nprocs; } /* Read nprocs used to write data */ index = IOreadAttributeInfo (file->fid, "GH$nprocs", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (file->fid, index, &tmpInt); file->nprocs = tmpInt; } else { CCTK_WARN (1, "Unable to restore GH$nprocs. " "Assuming it is 1 and continuing"); file->nprocs = 1; } /* Determine whether data is chunked or unchunked We could derive this from the filename itself but just to be sure ... */ index = IOreadAttributeInfo (file->fid, "unchunked", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) { IOreadAttribute (file->fid, index, &tmpInt); file->unchunked = tmpInt; } else CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, "Unable to restore 'unchunked' attribute. Assuming it is %s " "and continuing", file->unchunked ? "true" : "false"); /* If we recover from multiple files the number of * writing processors must match the number of reading * processors, and the total number of processors must match. */ if ((file->ioproc_every == nprocs && nprocs > 1) || file->unchunked) { if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Recovering from one %schunked file", file->unchunked ? "un":""); file->ioproc_every = nprocs; file->is_IEEEIO_file = 1; } else { if (file->nprocs != nprocs) { CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, "Must restart on %d processors with chunked files " "or recombine them", file->nprocs); } else { if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Recovering from %d chunked files", nprocs / file->ioproc_every + (nprocs % file->ioproc_every ? 1 : 0)); file->is_IEEEIO_file = 1; } } } if (myproc == 0 && ! file->is_IEEEIO_file) CCTK_VWarn (2, __LINE__, __FILE__, CCTK_THORNSTRING, "No valid IEEEIO file '%s' found", file->filename); #ifdef CCTK_MPI /* Broadcast the file information to all processors */ info [0] = file->is_IEEEIO_file; info [1] = file->unchunked; info [2] = file->ioproc_every; CACTUS_MPI_ERROR (MPI_Bcast (info, 3, PUGH_MPI_INT, 0, comm)); file->is_IEEEIO_file = info [0]; file->unchunked = info [1]; file->ioproc_every = info [2]; #endif if (file->is_IEEEIO_file) { /* Determine the IO processors for each node and the corresponding checkpoint file */ file->ioproc = myproc - (myproc % file->ioproc_every); IOUtil_PrepareFilename (GH, basename, file->filename, called_from, file->ioproc/file->ioproc_every, file->unchunked); if (called_from == CP_RECOVER_DATA || called_from == CP_RECOVER_PARAMETERS) strcat (file->filename, ".chkpt"); strcat (file->filename, ".ieee"); /* Open chunked files on other IO processors */ if (myproc == file->ioproc && myproc != 0) { if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Opening chunked file '%s' on " "processor %d", file->filename, myproc); /* Open file, make sure the file is valid */ file->fid = IEEEopen (file->filename, "r"); if (! IOisValid (file->fid)) { CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, "Cannot open file '%s' on processor %d", file->filename, myproc); file->is_IEEEIO_file = 0; } } #ifdef CCTK_MPI /* Finally check whether all files have valid recovery files */ info [0] = file->is_IEEEIO_file; CACTUS_MPI_ERROR (MPI_Allreduce (&info [0], &info [1], 1, PUGH_MPI_INT4, MPI_LAND, comm)); file->is_IEEEIO_file = info [1]; #endif } /* Return 0 for success otherwise negative */ return (file->is_IEEEIO_file ? 0 : -1); } /* NOTE: Although we could read the GH extensions from multiple recovery files in parallel, this is done only on by processor 0 here. Broadcasting the GH extensions is found faster than sending it in a loop from each IO processor to all the non IOs (don't have subcommunicators yet) */ static int IOFlexIOi_RecoverGHextensions (cGH *GH, fileinfo_t *file) { Long nels_stored; int index, nt_stored; CCTK_REAL realBuffer; CCTK_INT4 int4Buffer [2]; extern int PUGH_SetMainLoopIndex (int main_loop_index); if (CCTK_MyProc (GH) == 0) { /* rewind to the first dataset where the GH extensions are attached to */ CACTUS_IEEEIO_ERROR (IOseek (file->fid, 0)); /* Get the iteration number. */ index = IOreadAttributeInfo (file->fid, "GH$iteration", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) IOreadAttribute (file->fid, index, &int4Buffer [0]); else { CCTK_WARN (1, "Unable to restore GH->cctk_iteration, defaulting to 0"); int4Buffer [0] = 0; } /* Get the main loop index. */ index = IOreadAttributeInfo (file->fid, "main loop index", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_INT4 && nels_stored == 1) IOreadAttribute (file->fid, index, &int4Buffer [1]); else { CCTK_WARN (1, "Unable to restore main loop index, defaulting to 0"); int4Buffer [1] = 0; } /* Get cctk_time. */ index = IOreadAttributeInfo (file->fid, "GH$time", &nt_stored, &nels_stored); if (index >= 0 && nt_stored == FLEXIO_REAL && nels_stored == 1) IOreadAttribute (file->fid, index, &realBuffer); else { CCTK_WARN (1, "Unable to restore GH->cctk_time, defaulting to 0.0"); realBuffer = 0.0; } } #ifdef CCTK_MPI /* Broadcast the GH extensions to all processors */ /* NOTE: We have to use MPI_COMM_WORLD here because PUGH_COMM_WORLD is not yet set up at parameter recovery time. We also assume that PUGH_MPI_INT4 is a compile-time defined datatype. */ CACTUS_MPI_ERROR (MPI_Bcast (int4Buffer, 2, PUGH_MPI_INT4, 0,MPI_COMM_WORLD)); CACTUS_MPI_ERROR (MPI_Bcast (&realBuffer, 1, PUGH_MPI_REAL, 0, MPI_COMM_WORLD)); #endif GH->cctk_time = realBuffer; GH->cctk_iteration = (int) int4Buffer [0]; PUGH_SetMainLoopIndex ((int) int4Buffer [1]); return (0); } /* NOTE: Although we could read the parameters from multiple recovery files in parallel, this is done only on by processor 0 here. Broadcasting the complete parameter string is found faster than sending it in a loop from each IO processor to all the non IOs (don't have subcommunicators yet) */ static int IOFlexIOi_RecoverParameters (fileinfo_t *file) { DECLARE_CCTK_PARAMETERS int index, atype; Long asize; char *parameters; CCTK_INT4 parameterSize; cGH *GH = NULL; /* There's no cGH set up yet so we pass a NULL pointer to CCTK_MyProc() */ /* To make the compiler happy */ parameterSize = -1; parameters = NULL; if (CCTK_MyProc (GH) == 0) { if (verbose) CCTK_VInfo (CCTK_THORNSTRING, "Recovering parameters from checkpoint " "file '%s'", file->filename); /* Get the parameters attribute. */ index = IOreadAttributeInfo (file->fid, GLOBAL_PARAMETERS, &atype, &asize); if (index >= 0 && atype == FLEXIO_CHAR && asize > 0) { parameterSize = (CCTK_INT4) asize; parameters = (char *) malloc (parameterSize); IOreadAttribute (file->fid, index, parameters); } else CCTK_WARN (1, "Can't read global parameters. " "Is this really a Cactus IEEEIO checkpoint file ?"); } #ifdef CCTK_MPI /* Broadcast the parameter buffer size to all processors */ /* NOTE: We have to use MPI_COMM_WORLD here because PUGH_COMM_WORLD is not yet set up at parameter recovery time. We also assume that PUGH_MPI_INT4 is a compile-time defined datatype. */ CACTUS_MPI_ERROR (MPI_Bcast (¶meterSize, 1, PUGH_MPI_INT4, 0, MPI_COMM_WORLD)); #endif if (parameterSize > 0) { #ifdef CCTK_MPI if (CCTK_MyProc (GH) != 0) parameters = (char *) malloc (parameterSize + 1); CACTUS_MPI_ERROR (MPI_Bcast (parameters, parameterSize + 1, PUGH_MPI_CHAR, 0, MPI_COMM_WORLD)); #endif IOUtil_SetAllParameters (parameters); free (parameters); } /* Return 0 for success otherwise negative */ return (parameterSize > 0 ? 0 : -1); } #ifdef HAVE_SCANDIR /* function to be called by scandir(2) to select potential IEEEIO checkpoint files in the given recovery directory (see IOFlexIO_RecoverParameters()) */ static int IOFlexIOi_CPfileSelect (struct dirent *entry) { char *p; int len, prefixLen; len = strlen (entry->d_name); prefixLen = strlen (CPfilePrefix); /* At first check for CPfilePrefix in the beginning of the filename */ if (strncmp (entry->d_name, CPfilePrefix, prefixLen)) return (0); /* Now check if there is an iteration number following the file prefix. */ for (p = (char *) entry->d_name + prefixLen; *p && *p != '.'; p++) if (! isdigit (*p)) return (0); /* Check for a '.file_' suffix for chunked output. We only select the chunked output file of processor 0 in that case. */ if (p - entry->d_name < len) if (! strncmp (p, "file_", 5) && strncmp (p, "file_0", 6)) return (0); /* Finally check the suffix */ if (len < 5 || strcmp (entry->d_name + len - 5, ".ieee")) return (0); /* Cut the file name after the iteration number field because we only need the basename later on. */ *p = 0; return (1); } /* function to be called by scandir(2) to sort the list of potential checkpoint files by their iteration number (see IOFlexIO_RecoverParameters()) */ static int IOFlexIOi_CPfileCompare (struct dirent **a, struct dirent **b) { int it_a, it_b; int len = strlen (CPfilePrefix); /* extract the iteration numbers */ it_a = atoi ((*a)->d_name + len); it_b = atoi ((*b)->d_name + len); /* note that this causes the file list to be sorted in descendent order */ if (it_a > it_b) return (-1); if (it_a < it_b) return (+1); return (0); } #endif /* HAVE_SCANDIR */