From 50ac02734e80dc84326274e2a77bb03bf626e03a Mon Sep 17 00:00:00 2001 From: swhite Date: Thu, 27 Apr 2006 15:20:42 +0000 Subject: Made it work. git-svn-id: http://svn.aei.mpg.de/numrel/AEIThorns/ManualTermination/trunk@3 e5a5a894-0e4f-0410-be11-d22c8b0a171a --- README | 26 +++++++++++-- doc/documentation.tex | 34 ++++++++++++----- schedule.ccl | 10 +++++ src/ManualTermination.c | 80 ++++++++++------------------------------ src/ManualTermination.h | 8 ---- src/ManualTerminationFile.c | 89 +++++++++++++++++++++++++++++++++++++++------ 6 files changed, 153 insertions(+), 94 deletions(-) delete mode 100644 src/ManualTermination.h diff --git a/README b/README index d299580..0a0ed5e 100644 --- a/README +++ b/README @@ -1,10 +1,30 @@ CVS info : $Header$ Cactus Code Thorn ManualTermination, based on TriggerTerminationManual -Thorn Author(s) : Christian D. Ott -Thorn Maintainer(s) : Christian D. Ott +Thorn Author(s) : Christian D. Ott , + Steve White +Thorn Maintainer(s) : Steve White -------------------------------------------------------------------------- Purpose of the thorn: -Watch the gone by walltime. Trigger termination if only n minutes are left to some limit set by the user. +Watch the gone by walltime. Trigger termination (and checkpoint) if only n +minutes are left to some limit set by the user. Alternatively, terminate if a +1 appears in a specified file. + +Copyright (C) 2004 Christian Ott + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + diff --git a/doc/documentation.tex b/doc/documentation.tex index 33668ca..4efbd95 100644 --- a/doc/documentation.tex +++ b/doc/documentation.tex @@ -49,17 +49,9 @@ The program must be set up for checkpointing. (It can be argued that checkpointing functionality is common sense and good etiquette for long-running programs in a multi-user environment.) -For termination from a file, the PBS batch system is used. - \section{Setup} -\begin{Verbatim}[commandchars=\\\{\},frame=single] -cmd="mpirun \textsl{command...}" -/opt/NumRelScript/chain\_job "$0" "$cmd" -\end{Verbatim} - - \begin{Verbatim}[commandchars=\\\{\},frame=single] # # # # # # # # # # # # # # # Checkpointing / Recovery ActiveThorns = "IOHDF5Util IOHDF5" @@ -75,8 +67,11 @@ IOHDF5::checkpoint = "yes" # # # # # # # # # # # # # # # Termination ActiveThorns = "ManualTermination" + # termination by wall time ManualTermination::on_remaining_walltime=1400 #minutes before termination ManualTermination::max_walltime=12 # hours + + # termination from a file ManualTermination::termination_from_file=yes ManualTermination::check_file_every=10 #evolution steps ManualTermination::output_remtime_every_minutes=2 # how often to remind user @@ -85,11 +80,30 @@ ManualTermination::output_remtime_every_minutes=2 # how often to remind user \section{Use} +The two modes, termination by wall time and termination from file, are +meant to be independent and can be used together or separately. + +The default file checked is +\texttt{/tmp//cactus\_terminate.\textit{job\_id}}, +where by default, \texttt{\textit{job\_id}} is gotten from the \texttt{PBS\_JOBID} +environment variable. If the environment variable +\texttt{MANUAL\_TERMINATION\_JOB\_ID} is set, that will be used instead +as the \texttt{\textit{job\_id}}. 
+ +In this configuration, any user may terminate the run by putting a '1' into +the specified file. + +The termination file is removed when the run shuts down. + +It should be possible to use thorn \textbf{ManualTermination} with thorn +\textbf{JobChaining}. If a job is terminated by \textbf{ManualTermination}, +\textbf{JobChaining} will not attempt to re-queue the simulation. + \section{Licensing and Support} -Thorn \textbf{JobChaining} is distributed under the GNU Lesser Public +Thorn \textbf{ManualTermination} is distributed under the GNU Lesser General Public License. -For details please see the file \texttt{COPYING.LIB} in the top-level +For details please see the file \texttt{README} in the top-level directory of this thorn. Please send any suggestions or comments to the maintainer of the thorn. diff --git a/schedule.ccl b/schedule.ccl index 99684d3..971a67d 100644 --- a/schedule.ccl +++ b/schedule.ccl @@ -23,8 +23,18 @@ if (on_remaining_walltime > 0) if (termination_from_file) { + schedule ManualTermination_Init at WRAGH + { + LANG:C + } "Initialise manual termination" + schedule ManualTerminationFile at EVOL { LANG:C } "Check termination file" + + schedule ManualTermination_Cleanup at SHUTDOWN + { + LANG:C + } "Clean up termination file" } diff --git a/src/ManualTermination.c b/src/ManualTermination.c index 0dce2fe..e0e32c4 100644 --- a/src/ManualTermination.c +++ b/src/ManualTermination.c @@ -1,39 +1,12 @@ -#include #include #include -#include #include "cctk.h" #include "cctk_Arguments.h" #include "cctk_Parameters.h" #include "cctk_Termination.h" #include "cctk_Timers.h" -#include "ManualTermination.h" -enum{ BUFLEN = 128 }; - -/* On first call, pass parameter terminate_filename. - If it is null will construct file name in /tmp based on PBS_JOBID. - Subsequent calls ignore the argument, and return a static buffer. 
-*/ -const char * MT_get_terminate_filename( CCTK_STRING terminate_filename ) -{ - static char buf[BUFLEN]; - - if( strlen( buf ) != 0 ) - return buf; - - if( strlen( terminate_filename ) == 0 ) - { - const char * pbs_jobid = getenv("PBS_JOBID"); - snprintf( buf, BUFLEN, "/tmp/cactus_terminate.%s", pbs_jobid ); - } - else - { - snprintf( buf, BUFLEN, "%s", terminate_filename ); - } - return buf; -} int ManualTermination_StartTimer (CCTK_ARGUMENTS) { @@ -50,9 +23,9 @@ int ManualTermination_StartTimer (CCTK_ARGUMENTS) return (0); } /* Create timer */ - TimerIndex = CCTK_TimerCreate("WatchWalltime"); + TimerIndex = CCTK_TimerCreate ("WatchWalltime"); /* Start timer */ - ierr = CCTK_TimerStart("WatchWalltime"); + ierr = CCTK_TimerStart ("WatchWalltime"); *watchminutes = output_remtime_every_minutes*1.0e0; @@ -60,22 +33,6 @@ int ManualTermination_StartTimer (CCTK_ARGUMENTS) CCTK_VInfo (CCTK_THORNSTRING, "Reminding you every %d " "minutes about remaining walltime.", output_remtime_every_minutes); - - if( termination_from_file ) - { - FILE *termfile = fopen( MT_get_terminate_filename(termination_file), "w" ); - if( termfile != NULL ) - { - fprintf( termfile, "%d", 0 ); - fclose( termfile ); - } - else - { - CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination", - "Could not open termination file '%s'. 
Error: %d", - MT_get_terminate_filename(NULL), strerror(errno)); - } - } return (retval); } @@ -94,7 +51,7 @@ int ManualTermination_CheckWalltime (CCTK_ARGUMENTS) DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; - int retval,ierr; + int retval = 0,ierr; cTimerData *info; const cTimerVal *walltime; CCTK_REAL time; @@ -104,37 +61,38 @@ int ManualTermination_CheckWalltime (CCTK_ARGUMENTS) { return (0); } + if (on_remaining_walltime <= 0) + return retval; - info = CCTK_TimerCreateData(); - ierr = CCTK_Timer("WatchWalltime",info); - + info = CCTK_TimerCreateData (); + ierr = CCTK_Timer ("WatchWalltime",info); /* stop timer */ - ierr = CCTK_TimerStop("WatchWalltime"); + ierr = CCTK_TimerStop ("WatchWalltime"); /* get walltime */ - walltime = CCTK_GetClockValue("gettimeofday",info); - time = CCTK_TimerClockSeconds(walltime); - CCTK_TimerDestroyData(info); - /* Start timer */ - ierr = CCTK_TimerStart("WatchWalltime"); + walltime = CCTK_GetClockValue ("gettimeofday",info); + time = CCTK_TimerClockSeconds (walltime); + CCTK_TimerDestroyData (info); + /* start timer */ + ierr = CCTK_TimerStart ("WatchWalltime"); - if ( (time/60.0e0 > *watchminutes) && *watchminutes != 0) + if ((time/60.0e0 > *watchminutes) && *watchminutes != 0) { *watchminutes = (*watchminutes)+output_remtime_every_minutes*1.0e0; - CCTK_INFO ("***********************************************************"); + CCTK_INFO ("------------------------------------------------------"); CCTK_VInfo (CCTK_THORNSTRING, "Remaining wallclock time for your job " "is %1.2f minutes. :-)", (max_walltime*60.0-time/60.0)); - CCTK_INFO ("***********************************************************"); + CCTK_INFO ("------------------------------------------------------"); } if (time/60.0e0 >= (max_walltime*60.0e0 - on_remaining_walltime*1.0e0)) { + CCTK_INFO ("------------------------------------------------------"); CCTK_VInfo (CCTK_THORNSTRING, "Remaining wallclock time for your job " "is %1.2f minutes. 
Triggering termination ...", (max_walltime*60.0-time/60.0)); + CCTK_INFO ("------------------------------------------------------"); CCTK_TerminateNext (cctkGH); } - retval = 0; - - return (retval); + return retval; } diff --git a/src/ManualTermination.h b/src/ManualTermination.h deleted file mode 100644 index bca3f3c..0000000 --- a/src/ManualTermination.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef MANUALTERMINATION_H -#define MANUALTERMINATION_H - -#include "cctk.h" - -const char * MT_get_terminate_filename( CCTK_STRING ); - -#endif diff --git a/src/ManualTerminationFile.c b/src/ManualTerminationFile.c index 342ac3f..7e10a38 100644 --- a/src/ManualTerminationFile.c +++ b/src/ManualTerminationFile.c @@ -1,14 +1,68 @@ #include #include #include +#include #include "cctk.h" #include "cctk_Arguments.h" #include "cctk_Parameters.h" #include "cctk_Termination.h" -#include "cctk_Timers.h" -#include "ManualTermination.h" +enum{ BUFLEN = 128 }; +/* On first call, pass parameter terminate_filename. + If it is null will construct file name in /tmp based on PBS_JOBID. + Subsequent calls ignore the argument, and return a static buffer. 
+*/ +const char * MT_get_terminate_filename (CCTK_STRING terminate_filename) +{ + static char buf[BUFLEN]; + + if (strlen (buf) != 0) + return buf; + + if (strlen (terminate_filename) == 0) + { + const char * pbs_jobid = getenv ("MANUAL_TERMINATION_JOB_ID"); + if (pbs_jobid == NULL) + pbs_jobid = getenv ("PBS_JOBID"); + + if (pbs_jobid == NULL) + CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination", + "Could not find environment variable " + "'MANUAL_TERMINATION_JOB_ID' or 'PBS_JOBID'" + ); + else + snprintf (buf, BUFLEN, "/tmp/cactus_terminate.%s", pbs_jobid); + } + else + { + snprintf (buf, BUFLEN, "%s", terminate_filename); + } + return buf; +} + +int ManualTermination_Init (CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + if (termination_from_file) + { + FILE *termfile = fopen (MT_get_terminate_filename (termination_file), "w"); + if (termfile != NULL) + { + fprintf (termfile, "%d", 0); + fclose (termfile); + } + else + { + CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination", + "Could not open termination file '%s'. 
Error: %s", + MT_get_terminate_filename (NULL), strerror (errno)); /* strerror() returns a string, so it needs %s */ + } + } + return 0; +} int ManualTerminationFile (CCTK_ARGUMENTS) { @@ -20,29 +74,40 @@ int ManualTerminationFile (CCTK_ARGUMENTS) /* only one processor needs to query the elapsed runtime */ if (CCTK_MyProc (cctkGH) != 0) { - return (0); + return 0; } - if ( ((cctkGH->cctk_iteration-1) % check_file_every*1.0e0) != 0) + if (((cctkGH->cctk_iteration- 1) % check_file_every * 1.0e0) != 0) { - return(0); + return 0; } - terminationfile = fopen(MT_get_terminate_filename(NULL),"r"); + terminationfile = fopen (MT_get_terminate_filename (NULL), "r"); - if(terminationfile!=NULL) + if (terminationfile != NULL) { - terminate=0; - fscanf(terminationfile,"%d",&terminate); - fclose(terminationfile); + terminate = 0; + fscanf (terminationfile, "%d", &terminate); + fclose (terminationfile); - if (terminate==1) + if (terminate == 1) { + CCTK_INFO ("------------------------------------------------------"); CCTK_VInfo (CCTK_THORNSTRING, "OH MY GOD! Found termination signal " "in termination file! TERMINATION NOW!!!!"); + CCTK_INFO ("------------------------------------------------------"); CCTK_TerminateNext (cctkGH); } } - return (retval); + return retval; +} + +int ManualTermination_Cleanup (CCTK_ARGUMENTS) +{ + DECLARE_CCTK_PARAMETERS; + + if (termination_from_file && MT_get_terminate_filename (NULL)) + remove (MT_get_terminate_filename (NULL)); + return 0; } -- cgit v1.2.3