From d69c71d3983cee0cfd3793734a5c9fa8b1391b21 Mon Sep 17 00:00:00 2001 From: swhite Date: Wed, 26 Apr 2006 14:49:06 +0000 Subject: Christian's TriggerTerminationManual, renamed and expanded git-svn-id: http://svn.aei.mpg.de/numrel/AEIThorns/ManualTermination/trunk@2 e5a5a894-0e4f-0410-be11-d22c8b0a171a --- README | 10 ++++ doc/documentation.tex | 100 +++++++++++++++++++++++++++++++ interface.ccl | 8 +++ param.ccl | 35 +++++++++++ schedule.ccl | 30 ++++++++++ src/ManualTermination.c | 140 ++++++++++++++++++++++++++++++++++++++++++++ src/ManualTermination.h | 8 +++ src/ManualTerminationFile.c | 48 +++++++++++++++ src/make.code.defn | 8 +++ 9 files changed, 387 insertions(+) create mode 100644 README create mode 100644 doc/documentation.tex create mode 100644 interface.ccl create mode 100644 param.ccl create mode 100644 schedule.ccl create mode 100644 src/ManualTermination.c create mode 100644 src/ManualTermination.h create mode 100644 src/ManualTerminationFile.c create mode 100644 src/make.code.defn diff --git a/README b/README new file mode 100644 index 0000000..d299580 --- /dev/null +++ b/README @@ -0,0 +1,10 @@ +CVS info : $Header$ + +Cactus Code Thorn ManualTermination, based on TriggerTerminationManual +Thorn Author(s) : Christian D. Ott +Thorn Maintainer(s) : Christian D. Ott +-------------------------------------------------------------------------- + +Purpose of the thorn: + +Watch the elapsed walltime. Trigger termination if only n minutes are left to some limit set by the user. 
diff --git a/doc/documentation.tex b/doc/documentation.tex new file mode 100644 index 0000000..33668ca --- /dev/null +++ b/doc/documentation.tex @@ -0,0 +1,100 @@ +% $Header$ + +\documentclass{article} + +% Use the Cactus ThornGuide style file +% (Automatically used from Cactus distribution, if you have a +% thorn without the Cactus Flesh download this from the Cactus +% homepage at www.cactuscode.org) +\usepackage{../../../../doc/latex/cactus} +\RequirePackage{alltt} +\RequirePackage{fancyvrb} + +\begin{document} + +% The author of the documentation +\author{Steve White \textless swhite@aei.mpg.de\textgreater} + +% The title of the document (not necessarily the name of the Thorn) +\title{ManualTermination\\ + Manual Termination of Cactus Simulations} + +% the date your document was last changed, if your document is in CVS, +% please use: +\date{$ $Date$ $} + +\maketitle + +% Do not delete next line +% START CACTUS THORNGUIDE + +\begin{abstract} +Thorn \textbf{ManualTermination} safely terminates Cactus +simulation jobs, and can be configured to allow other users to +terminate the job. + +The thorn can also be configured to terminate a certain number of minutes +before a given maximum walltime has elapsed. Also, it can be configured +to periodically check the contents of a given file, and terminate based +on the contents of that file. + +In either case, the job should be checkpointed. +\end{abstract} + + + +\section{Requirements} + +The program must be set up for checkpointing. (It can be argued that +checkpointing functionality is common sense and good etiquette for +long-running programs in a multi-user environment.) + +For termination from a file, the PBS batch system is used. 
+ +\section{Setup} + + +\begin{Verbatim}[commandchars=\\\{\},frame=single] +cmd="mpirun \textsl{command...}" +/opt/NumRelScript/chain\_job "$0" "$cmd" +\end{Verbatim} + + +\begin{Verbatim}[commandchars=\\\{\},frame=single] +# # # # # # # # # # # # # # # Checkpointing / Recovery +ActiveThorns = "IOHDF5Util IOHDF5" + +IO::checkpoint_dir = "cpr/" +IO::checkpoint_file = "chain" # Name to taste +IO::checkpoint_on_terminate = "yes" +IO::recover_dir = "cpr/" +IO::recover_file = "chain" # Same name +IO::recover = "autoprobe" +IOHDF5::checkpoint = "yes" + +# # # # # # # # # # # # # # # Termination +ActiveThorns = "ManualTermination" + +ManualTermination::on_remaining_walltime=1400 #minutes before termination +ManualTermination::max_walltime=12 # hours +ManualTermination::termination_from_file=yes +ManualTermination::check_file_every=10 #evolution steps +ManualTermination::output_remtime_every_minutes=2 # how often to remind user + +\end{Verbatim} + +\section{Use} + +\section{Licensing and Support} + +Thorn \textbf{ManualTermination} is distributed under the GNU Lesser General +Public License. +For details please see the file \texttt{COPYING.LIB} in the top-level +directory of this thorn. + +Please send any suggestions or comments to the maintainer of the thorn. 
+ +% Do not delete next line +% END CACTUS THORNGUIDE + +\end{document} diff --git a/interface.ccl b/interface.ccl new file mode 100644 index 0000000..b2d0b22 --- /dev/null +++ b/interface.ccl @@ -0,0 +1,8 @@ +# Interface definition for thorn ManualTermination +# $Header$ + +IMPLEMENTS: ManualTermination + + + +CCTK_REAL watchminutes TYPE=scalar diff --git a/param.ccl b/param.ccl new file mode 100644 index 0000000..d4024ff --- /dev/null +++ b/param.ccl @@ -0,0 +1,35 @@ +# Parameter definitions for thorn ManualTermination +# $Header$ + +private: + +INT on_remaining_walltime "When to trigger termination in MINUTES" STEERABLE = ALWAYS +{ + 0:0 :: "Don't trigger termination" + 1:* :: "So many minutes before your job walltime is over" +} 0 + +CCTK_REAL max_walltime "Walltime in HOURS allocated for this job" STEERABLE = ALWAYS +{ + 0.:* :: "Should be positive, right" +} 0.0 + +BOOLEAN termination_from_file "Use termination file; specified by termination_filename" STEERABLE = ALWAYS +{ +} "no" + +STRING termination_file "Termination file name (full path)" STEERABLE = ALWAYS +{ + .* :: "Termination file" +} "" + +INT check_file_every "Check termination file every n timesteps" STEERABLE = ALWAYS +{ + 1: :: "Should be greater than or equal to one" +} 1 + +INT output_remtime_every_minutes "Output remaining time every n minutes" STEERABLE = ALWAYS +{ + 0:0 :: "No output" + 1: :: "Positive..." 
+} 60 diff --git a/schedule.ccl b/schedule.ccl new file mode 100644 index 0000000..99684d3 --- /dev/null +++ b/schedule.ccl @@ -0,0 +1,30 @@ +# Schedule definitions for thorn ManualTermination +# $Header$ + +STORAGE: watchminutes + +if (on_remaining_walltime > 0) +{ + schedule ManualTermination_StartTimer at WRAGH + { + LANG:C + } "Start timer" + + schedule ManualTermination_ResetMinutes at POST_RECOVER_VARIABLES + { + LANG:C + } "Reset Watchtime" + + schedule ManualTermination_CheckWalltime at EVOL + { + LANG:C + } "Check elapsed job walltime" +} + +if (termination_from_file) +{ + schedule ManualTerminationFile at EVOL + { + LANG:C + } "Check termination file" +} diff --git a/src/ManualTermination.c b/src/ManualTermination.c new file mode 100644 index 0000000..0dce2fe --- /dev/null +++ b/src/ManualTermination.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include + +#include "cctk.h" +#include "cctk_Arguments.h" +#include "cctk_Parameters.h" +#include "cctk_Termination.h" +#include "cctk_Timers.h" +#include "ManualTermination.h" + +enum{ BUFLEN = 128 }; + +/* On first call, pass parameter terminate_filename. + If it is null will construct file name in /tmp based on PBS_JOBID. + Subsequent calls ignore the argument, and return a static buffer. 
+*/ +const char * MT_get_terminate_filename( CCTK_STRING terminate_filename ) +{ + static char buf[BUFLEN]; + + if( strlen( buf ) != 0 ) + return buf; + + if( strlen( terminate_filename ) == 0 ) + { + const char * pbs_jobid = getenv("PBS_JOBID"); + snprintf( buf, BUFLEN, "/tmp/cactus_terminate.%s", pbs_jobid ); + } + else + { + snprintf( buf, BUFLEN, "%s", terminate_filename ); + } + return buf; +} + +int ManualTermination_StartTimer (CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + int retval = 0; + int ierr; + int TimerIndex; + + /* only one processor needs to query the elapsed runtime */ + if (CCTK_MyProc (cctkGH) != 0) + { + return (0); + } + /* Create timer */ + TimerIndex = CCTK_TimerCreate("WatchWalltime"); + /* Start timer */ + ierr = CCTK_TimerStart("WatchWalltime"); + + *watchminutes = output_remtime_every_minutes*1.0e0; + + CCTK_VInfo (CCTK_THORNSTRING, "Started Timer"); + CCTK_VInfo (CCTK_THORNSTRING, "Reminding you every %d " + "minutes about remaining walltime.", + output_remtime_every_minutes); + + if( termination_from_file ) + { + FILE *termfile = fopen( MT_get_terminate_filename(termination_file), "w" ); + if( termfile != NULL ) + { + fprintf( termfile, "%d", 0 ); + fclose( termfile ); + } + else + { + CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination", + "Could not open termination file '%s'. 
Error: %d", + MT_get_terminate_filename(NULL), strerror(errno)); + } + } + return (retval); +} + +int ManualTermination_ResetMinutes (CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + *watchminutes = output_remtime_every_minutes*1.0e0; + + return 0; +} + +int ManualTermination_CheckWalltime (CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + int retval,ierr; + cTimerData *info; + const cTimerVal *walltime; + CCTK_REAL time; + + /* only one processor needs to query the elapsed runtime */ + if (CCTK_MyProc (cctkGH) != 0) + { + return (0); + } + + info = CCTK_TimerCreateData(); + ierr = CCTK_Timer("WatchWalltime",info); + + /* stop timer */ + ierr = CCTK_TimerStop("WatchWalltime"); + /* get walltime */ + walltime = CCTK_GetClockValue("gettimeofday",info); + time = CCTK_TimerClockSeconds(walltime); + CCTK_TimerDestroyData(info); + /* Start timer */ + ierr = CCTK_TimerStart("WatchWalltime"); + + if ( (time/60.0e0 > *watchminutes) && *watchminutes != 0) + { + *watchminutes = (*watchminutes)+output_remtime_every_minutes*1.0e0; + CCTK_INFO ("***********************************************************"); + CCTK_VInfo (CCTK_THORNSTRING, "Remaining wallclock time for your job " + "is %1.2f minutes. :-)", (max_walltime*60.0-time/60.0)); + CCTK_INFO ("***********************************************************"); + } + + if (time/60.0e0 >= (max_walltime*60.0e0 - on_remaining_walltime*1.0e0)) + { + CCTK_VInfo (CCTK_THORNSTRING, "Remaining wallclock time for your job " + "is %1.2f minutes. 
Triggering termination ...", + (max_walltime*60.0-time/60.0)); + CCTK_TerminateNext (cctkGH); + } + + retval = 0; + + return (retval); +} diff --git a/src/ManualTermination.h b/src/ManualTermination.h new file mode 100644 index 0000000..bca3f3c --- /dev/null +++ b/src/ManualTermination.h @@ -0,0 +1,8 @@ +#ifndef MANUALTERMINATION_H +#define MANUALTERMINATION_H + +#include "cctk.h" + +const char * MT_get_terminate_filename( CCTK_STRING ); + +#endif diff --git a/src/ManualTerminationFile.c b/src/ManualTerminationFile.c new file mode 100644 index 0000000..342ac3f --- /dev/null +++ b/src/ManualTerminationFile.c @@ -0,0 +1,48 @@ +#include +#include +#include + +#include "cctk.h" +#include "cctk_Arguments.h" +#include "cctk_Parameters.h" +#include "cctk_Termination.h" +#include "cctk_Timers.h" +#include "ManualTermination.h" + + +int ManualTerminationFile (CCTK_ARGUMENTS) +{ + int retval = 0, terminate; + FILE *terminationfile; + + DECLARE_CCTK_PARAMETERS; + + /* only one processor needs to query the elapsed runtime */ + if (CCTK_MyProc (cctkGH) != 0) + { + return (0); + } + + if ( ((cctkGH->cctk_iteration-1) % check_file_every*1.0e0) != 0) + { + return(0); + } + + terminationfile = fopen(MT_get_terminate_filename(NULL),"r"); + + if(terminationfile!=NULL) + { + terminate=0; + fscanf(terminationfile,"%d",&terminate); + fclose(terminationfile); + + if (terminate==1) + { + CCTK_VInfo (CCTK_THORNSTRING, "OH MY GOD! Found termination signal " + "in termination file! TERMINATION NOW!!!!"); + CCTK_TerminateNext (cctkGH); + } + } + + return (retval); +} diff --git a/src/make.code.defn b/src/make.code.defn new file mode 100644 index 0000000..bf7f9e4 --- /dev/null +++ b/src/make.code.defn @@ -0,0 +1,8 @@ +# Main make.code.defn file for thorn ManualTermination +# $Header$ + +# Source files in this directory +SRCS = ManualTermination.c ManualTerminationFile.c + +# Subdirectories containing source files +SUBDIRS = -- cgit v1.2.3