/*@@ @file RecoverGH.c @date Fri Jun 19 09:14:22 1998 @author Tom Goodale @desc Routines to recover variables from a given IEEEIO data or checkpoint file. @enddesc @version $Id$ @@*/ #include #include #include #include "cctk.h" #include "cctk_Parameters.h" #include "CactusPUGH/PUGH/src/include/pugh.h" #include "CactusBase/IOUtil/src/ioutil_CheckpointRecovery.h" #include "ioFlexGH.h" /* the rcs ID and its dummy function to use it */ static const char *rcsid = "$Id$"; CCTK_FILEVERSION(CactusPUGHIO_IOFlexIO_RecoverGH_c) /******************************************************************** ******************** Macro Definitions ************************ ********************************************************************/ /* maximum length of an attribute name */ #define MAX_ATTRNAME_LEN 256 /******************************************************************** ******************** External Routines ************************ ********************************************************************/ int IOFlexIO_RecoverParameters (void); /******************************************************************** ******************** Internal Routines ************************ ********************************************************************/ static int RecoverParameters (fileinfo_t *file); static int RecoverGHextensions (cGH *GH, fileinfo_t *file); static int OpenFile (cGH *GH, const char *basefilename, int called_from, fileinfo_t *file); /*@@ @routine IOFlexIO_Recover @date Fri Jun 19 09:22:52 1998 @author Tom Goodale @desc Recovers a GH from an IEEEIO file. This routine is registered with IOUtil as IOFlexIO's recovery routine. @enddesc @calls OpenFile RecoverParameters RecoverGHextensions IOFlexIOi_RecoverVariables IOUtil_PrintTimings @var GH @vdesc Pointer to CCTK grid hierarchy @vtype cGH * @vio in @endvar @var basefilename @vdesc the basefilename of the file to recover from The file suffix is appended by the routine. @vtype const char * @vio in @endvar @var called_from @vdesc flag indicating where this routine was called from (either CP_RECOVER_DATA or FILEREADER_DATA) @vtype int @vio in @endvar @returntype int @returndesc >0 = success -1 = recovery failed @endreturndesc @@*/ int IOFlexIO_Recover (cGH *GH, const char *basefilename, int called_from) { int result; flexioGH *myGH; static fileinfo_t file; /* this is static because info is passed from CP_RECOVERY_PARAMETERS to CP_RECOVERY_DATA */ const char *timer_description = "Time to recover:"; DECLARE_CCTK_PARAMETERS /* to make the compiler happy */ myGH = NULL; result = 0; /* start the total timer */ if (GH) { myGH = CCTK_GHExtension (GH, "IOFlexIO"); if (myGH->print_timing_info) { CCTK_TimerStartI (myGH->timers[3]); } } /* open the file if it wasn't already opened at CCTK_RECOVER_PARAMETERS */ /* FIXME Gab ... asymmetric levfac */ if (called_from == CP_RECOVER_PARAMETERS || called_from == FILEREADER_DATA || (GH && (GH->cctk_levfac[0] > 1 || GH->cctk_convlevel > 0))) { memset (&file, 0, sizeof (file)); if (OpenFile (GH, basefilename, called_from, &file) < 0) { return (-1); } } else { /* This is the case for CP_RECOVER_DATA. CCTK_RECOVER_PARAMETERS must have been called before and set up the file info structure. */ if (! file.is_IEEEIO_file) { return (-1); } } /* Recover parameters (and return) */ if (called_from == CP_RECOVER_PARAMETERS) { return (RecoverParameters (&file)); } /* Recover GH extensions */ if (called_from == CP_RECOVER_DATA) { if (CCTK_Equals (verbose, "full")) { CCTK_INFO ("Recovering GH extensions"); } result = RecoverGHextensions (GH, &file); } if (! result) { /* Recover variables */ if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Recovering %schunked data with ioproc %d, " "ioproc_every %d", file.unchunked ? "un" : "", file.ioproc, file.ioproc_every); } result = IOFlexIOi_RecoverVariables (GH, &file); } /* Close the file and remove it if requested by the user */ if (CCTK_MyProc (GH) == file.ioproc) { if (CCTK_Equals (verbose, "full")) { if (called_from == CP_RECOVER_DATA) { CCTK_VInfo (CCTK_THORNSTRING, "Closing checkpoint file '%s' after " "recovery", file.filename); } else { CCTK_VInfo (CCTK_THORNSTRING, "Closing data file '%s'", file.filename); } } FLEXIO_ERROR (IOclose (file.fid)); if (called_from == CP_RECOVER_DATA && recover_and_remove) { if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Old checkpoint file '%s' will be removed" " after next IO::checkpoint_keep " "successful checkpoints", file.filename); } myGH->cp_filenames[myGH->cp_fileindex] = strdup (file.filename); myGH->cp_fileindex = (myGH->cp_fileindex+1) % checkpoint_keep; } } /* stop total recovery timer and print timing info */ if (called_from == CP_RECOVER_DATA && myGH->print_timing_info) { CCTK_TimerStopI (myGH->timers[3]); IOUtil_PrintTimings ("Timing information for recovery in IOFlexIO:", 1, &myGH->timers[3], &timer_description); } /* print an info message */ if (called_from == CP_RECOVER_DATA) { CCTK_VInfo (CCTK_THORNSTRING, "Restarting simulation at iteration %d (physical time %g)", GH->cctk_iteration, (double) GH->cctk_time); } return (result); } /*@@ @routine IOFlexIO_RecoverParameters @date Thu Apr 13 2000 @author Thomas Radke @desc @desc Recovers the parameters from an IEEEIO checkpoint file. This routine is scheduled at CCTK_RECOVER_PARAMETERS. Note that it cannot be registered with IOUtil to be scheduled from there (as done with the IOFlexIO_Recover routine) because the registration mechanism isn't activated yet at CCTK_RECOVER_PARAMETERS. Instead we call the generic parameter recovery routine from IOUtil here, and just pass the necessary callback function and its arguments. Note also that this routine doesn't get passed any parameters, not even a GH, because this doesn't exist yet at the time it is being called. @enddesc @calls IOUtil_RecoverParameters @returntype int @returndesc return code of @seeroutine IOUtil_RecoverParameters, ie. positive for successful parameter recovery, or
0 if recovery wasn't requested, or
negative if parameter recovery failed @endreturndesc @@*/ int IOFlexIO_RecoverParameters (void) { return (IOUtil_RecoverParameters (IOFlexIO_Recover, ".ieee", "IEEEIO")); } /******************************************************************** ******************** Internal Routines ************************ ********************************************************************/ static int OpenFile (cGH *GH, const char *basefilename, int called_from, fileinfo_t *file) { int i, myproc, nprocs, type, retval; Long dim; #ifdef CCTK_MPI MPI_Comm comm; CCTK_INT4 info[4]; #endif DECLARE_CCTK_PARAMETERS /* identify myself */ nprocs = CCTK_nProcs (GH); myproc = CCTK_MyProc (GH); /* Examine basefile to find out whether we are recovering from * one or multiple files and whether the data are chunked or not. * * This is done by processor 0 only since this is always an IO processor * and a corresponding file must exist in all cases. */ /* Determine name of base file NOTE: As we don't know whether the file is chunked or not we need to try both file names. */ /* at first try with unchunked mode */ file->unchunked = 1; free (file->filename); file->filename = IOUtil_AssembleFilename (GH, basefilename, "", ".ieee", called_from, 0, file->unchunked); if (myproc == 0) { if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Opening file '%s'", file->filename); } /* Open file, make sure the file is valid */ file->fid = IEEEopen (file->filename, "r"); file->is_IEEEIO_file = IOisValid (file->fid); if (! file->is_IEEEIO_file) { CCTK_VWarn (2, __LINE__, __FILE__, CCTK_THORNSTRING, "No valid IEEEIO file '%s' found", file->filename); /* now try with chunked mode */ file->unchunked = 0; free (file->filename); file->filename = IOUtil_AssembleFilename (GH, basefilename, "", ".ieee", called_from, 0,file->unchunked); if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Trying now file '%s'...",file->filename); } /* Open file, make sure the file is valid */ file->fid = IEEEopen (file->filename, "r"); file->is_IEEEIO_file = IOisValid (file->fid); } } /* okay, we have the complete filename. Let's read the file now. */ if (myproc == 0 && file->is_IEEEIO_file) { file->is_IEEEIO_file = 0; /* determine how the data was written by reading the GH extensions */ i = IOreadAttributeInfo (file->fid, "GH$ioproc_every", &type, &dim); if (i >= 0 && type == FLEXIO_INT4 && dim == 1) { IOreadAttribute (file->fid, i, &file->ioproc_every); } else { CCTK_WARN (1, "Unable to restore GH$ioproc_every. " "Assuming it is nprocs and continuing"); file->ioproc_every = nprocs; } /* read nprocs used to write data */ i = IOreadAttributeInfo (file->fid, "GH$nprocs", &type, &dim); if (i >= 0 && type == FLEXIO_INT4 && dim == 1) { IOreadAttribute (file->fid, i, &file->nprocs); } else { CCTK_WARN (1, "Unable to restore GH$nprocs. " "Assuming it is 1 and continuing"); file->nprocs = 1; } /* determine whether data is chunked or unchunked We could derive this from the filename itself but just to be sure ... */ i = IOreadAttributeInfo (file->fid, "unchunked", &type, &dim); if (i >= 0 && type == FLEXIO_INT4 && dim == 1) { IOreadAttribute (file->fid, i, &file->unchunked); } else { CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, "Unable to restore 'unchunked' attribute. Assuming it is %s " "and continuing", file->unchunked ? "true" : "false"); } /* determine whether data was written using the old timelevel naming scheme. New files contain a Cactus version string attribute... */ i = IOreadAttributeInfo (file->fid, "Cactus version", &type, &dim); file->has_version = i >= 0 && type == CHAR; if (! file->has_version) { CCTK_WARN (4, "Unable to restore 'Cactus version' attribute. " "Assuming data was written using the the old timelevel " "naming scheme and continuing"); } /* If we recover from multiple files the number of * writing processors must match the number of reading * processors, and the total number of processors must match. */ if ((file->ioproc_every == nprocs && nprocs > 1) || file->unchunked) { if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Recovering from one %schunked file", file->unchunked ? "un":""); } file->ioproc_every = nprocs; file->is_IEEEIO_file = 1; } else { if (file->nprocs != nprocs) { CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, "Must restart on %d processors with chunked files " "or recombine them", file->nprocs); } else { if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Recovering from %d chunked files", nprocs / file->ioproc_every + (nprocs % file->ioproc_every ? 1 : 0)); } file->is_IEEEIO_file = 1; } } } if (myproc == 0 && ! file->is_IEEEIO_file) { CCTK_VWarn (2, __LINE__, __FILE__, CCTK_THORNSTRING, "No valid IEEEIO file '%s' found", file->filename); } #ifdef CCTK_MPI /* Get the communicator for broadcasting the info structure */ /* NOTE: When recovering parameters thorn PUGH is not yet initialized so that we have to use MPI_COMM_WORLD in this case */ comm = CCTK_GHExtensionHandle ("PUGH") < 0 ? MPI_COMM_WORLD : PUGH_pGH (GH)->PUGH_COMM_WORLD; /* Broadcast the file information to all processors */ info[0] = file->is_IEEEIO_file; info[1] = file->unchunked; info[2] = file->ioproc_every; info[3] = file->has_version; CACTUS_MPI_ERROR (MPI_Bcast (info, 4, PUGH_MPI_INT, 0, comm)); file->is_IEEEIO_file = info[0]; file->unchunked = info[1]; file->ioproc_every = info[2]; file->has_version = info[3]; #endif if (file->is_IEEEIO_file) { /* Determine the IO processors for each node and the corresponding checkpoint file */ file->ioproc = myproc - (myproc % file->ioproc_every); free (file->filename); file->filename = IOUtil_AssembleFilename (GH, basefilename, "", ".ieee", called_from, file->ioproc / file->ioproc_every, file->unchunked); /* Open chunked files on other IO processors */ if (myproc == file->ioproc && myproc != 0) { if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Opening chunked file '%s' on " "processor %d", file->filename, myproc); } /* Open file, make sure the file is valid */ file->fid = IEEEopen (file->filename, "r"); if (! IOisValid (file->fid)) { CCTK_VWarn (1, __LINE__, __FILE__, CCTK_THORNSTRING, "Cannot open file '%s' on processor %d", file->filename, myproc); file->is_IEEEIO_file = 0; } } #ifdef CCTK_MPI /* Finally check whether all files have valid recovery files */ info[0] = file->is_IEEEIO_file; CACTUS_MPI_ERROR (MPI_Allreduce (&info[0], &info[1], 1, PUGH_MPI_INT4, MPI_LAND, comm)); file->is_IEEEIO_file = info[1]; #endif } /* Return 0 for success otherwise negative */ retval = (file->is_IEEEIO_file ? 0 : -1); return (retval); } /* NOTE: Although we could read the GH extensions from multiple recovery files in parallel, this is done only on by processor 0 here. Broadcasting the GH extensions is found faster than sending it in a loop from each IO processor to all the non IOs (don't have subcommunicators yet) */ static int RecoverGHextensions (cGH *GH, fileinfo_t *file) { Long dim; int i, type; CCTK_REAL realBuffer; CCTK_INT4 int4Buffer[2]; if (CCTK_MyProc (GH) == 0) { /* rewind to the first dataset where the GH extensions are attached to */ FLEXIO_ERROR (IOseek (file->fid, 0)); /* get the iteration number */ i = IOreadAttributeInfo (file->fid, "GH$iteration", &type, &dim); if (i >= 0 && type == FLEXIO_INT4 && dim == 1) { IOreadAttribute (file->fid, i, &int4Buffer[0]); } else { CCTK_WARN (1, "Unable to restore GH->cctk_iteration, defaulting to 0"); int4Buffer[0] = 0; } /* get the main loop index */ i = IOreadAttributeInfo (file->fid, "main loop index", &type, &dim); if (i >= 0 && type == FLEXIO_INT4 && dim == 1) { IOreadAttribute (file->fid, i, &int4Buffer[1]); } else { CCTK_WARN (1, "Unable to restore main loop index, defaulting to 0"); int4Buffer[1] = 0; } /* get cctk_time */ i = IOreadAttributeInfo (file->fid, "GH$time", &type, &dim); if (i >= 0 && type == FLEXIO_REAL && dim == 1) { IOreadAttribute (file->fid, i, &realBuffer); } else { CCTK_WARN (1, "Unable to restore GH->cctk_time, defaulting to 0.0"); realBuffer = 0.0; } } #ifdef CCTK_MPI /* Broadcast the GH extensions to all processors */ /* NOTE: We have to use MPI_COMM_WORLD here because PUGH_COMM_WORLD is not yet set up at parameter recovery time. We also assume that PUGH_MPI_INT4 is a compile-time defined datatype. */ CACTUS_MPI_ERROR (MPI_Bcast (int4Buffer, 2, PUGH_MPI_INT4, 0,MPI_COMM_WORLD)); CACTUS_MPI_ERROR (MPI_Bcast (&realBuffer, 1, PUGH_MPI_REAL,0,MPI_COMM_WORLD)); #endif GH->cctk_time = realBuffer; GH->cctk_iteration = (int) int4Buffer[0]; CCTK_SetMainLoopIndex ((int) int4Buffer[1]); return (0); } /* NOTE: Although we could read the parameters from multiple recovery files in parallel, this is done only on by processor 0 here. Broadcasting the complete parameter string is found faster than sending it in a loop from each IO processor to all the non IOs (don't have subcommunicators yet) */ static int RecoverParameters (fileinfo_t *file) { int i, myproc, atype, retval; Long asize; char *parameters; CCTK_INT4 parameterSize; DECLARE_CCTK_PARAMETERS /* To make the compiler happy */ parameterSize = -1; parameters = NULL; myproc = CCTK_MyProc (NULL); if (myproc == 0) { if (CCTK_Equals (verbose, "full")) { CCTK_VInfo (CCTK_THORNSTRING, "Recovering parameters from checkpoint " "file '%s'", file->filename); } /* get the parameters attribute. */ i = IOreadAttributeInfo (file->fid, GLOBAL_PARAMETERS, &atype, &asize); if (i >= 0 && atype == CHAR && asize > 0) { parameterSize = (CCTK_INT4) asize; parameters = malloc (parameterSize + 1); IOreadAttribute (file->fid, i, parameters); } else { CCTK_WARN (1, "Can't read global parameters. " "Is this really a Cactus IEEEIO checkpoint file ?"); } } #ifdef CCTK_MPI /* Broadcast the parameter buffer size to all processors */ /* NOTE: We have to use MPI_COMM_WORLD here because PUGH_COMM_WORLD is not yet set up at parameter recovery time. We also assume that PUGH_MPI_INT4 is a compile-time defined datatype. */ CACTUS_MPI_ERROR (MPI_Bcast (¶meterSize, 1, PUGH_MPI_INT4, 0, MPI_COMM_WORLD)); #endif if (parameterSize > 0) { #ifdef CCTK_MPI if (myproc) { parameters = malloc (parameterSize + 1); } CACTUS_MPI_ERROR (MPI_Bcast (parameters, parameterSize + 1, PUGH_MPI_CHAR, 0, MPI_COMM_WORLD)); #endif IOUtil_SetAllParameters (parameters); free (parameters); } /* return positive value for success otherwise negative */ retval = (parameterSize > 0 ? 1 : -1); return (retval); }