aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorswhite <swhite@e5a5a894-0e4f-0410-be11-d22c8b0a171a>2006-04-27 15:20:42 +0000
committerswhite <swhite@e5a5a894-0e4f-0410-be11-d22c8b0a171a>2006-04-27 15:20:42 +0000
commit50ac02734e80dc84326274e2a77bb03bf626e03a (patch)
treea9cfcd52021b1d3da8dcf7d4cf74307a4f39f4a7
parentd69c71d3983cee0cfd3793734a5c9fa8b1391b21 (diff)
Made it work.
git-svn-id: http://svn.aei.mpg.de/numrel/AEIThorns/ManualTermination/trunk@3 e5a5a894-0e4f-0410-be11-d22c8b0a171a
-rw-r--r--README26
-rw-r--r--doc/documentation.tex34
-rw-r--r--schedule.ccl10
-rw-r--r--src/ManualTermination.c80
-rw-r--r--src/ManualTermination.h8
-rw-r--r--src/ManualTerminationFile.c89
6 files changed, 153 insertions, 94 deletions
diff --git a/README b/README
index d299580..0a0ed5e 100644
--- a/README
+++ b/README
@@ -1,10 +1,30 @@
CVS info : $Header$
Cactus Code Thorn ManualTermination, based on TriggerTerminationManual
-Thorn Author(s) : Christian D. Ott <cott@aei.mpg.de>
-Thorn Maintainer(s) : Christian D. Ott <cott@aei.mpg.de>
+Thorn Author(s) : Christian D. Ott <cott@aei.mpg.de>,
+ Steve White <swhite@aei.mpg.de>
+Thorn Maintainer(s) : Steve White <swhite@aei.mpg.de>
--------------------------------------------------------------------------
Purpose of the thorn:
-Watch the gone by walltime. Trigger termination if only n minutes are left to some limit set by the user.
+Watch the elapsed walltime. Trigger termination (and checkpoint) if only n
+minutes are left to some limit set by the user. Alternatively, terminate if a
+1 appears in a specified file.
+
+Copyright (C) 2004 Christian Ott
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
diff --git a/doc/documentation.tex b/doc/documentation.tex
index 33668ca..4efbd95 100644
--- a/doc/documentation.tex
+++ b/doc/documentation.tex
@@ -49,18 +49,10 @@ The program must be set up for checkpointing. (It can be argued that
checkpointing functionality is common sense and good etiquette for
long-running programs in a multi-user environment.)
-For termination from a file, the PBS batch system is used.
-
\section{Setup}
\begin{Verbatim}[commandchars=\\\{\},frame=single]
-cmd="mpirun \textsl{command...}"
-/opt/NumRelScript/chain\_job "$0" "$cmd"
-\end{Verbatim}
-
-
-\begin{Verbatim}[commandchars=\\\{\},frame=single]
# # # # # # # # # # # # # # # Checkpointing / Recovery
ActiveThorns = "IOHDF5Util IOHDF5"
@@ -75,8 +67,11 @@ IOHDF5::checkpoint = "yes"
# # # # # # # # # # # # # # # Termination
ActiveThorns = "ManualTermination"
+ # termination by wall time
ManualTermination::on_remaining_walltime=1400 #minutes before termination
ManualTermination::max_walltime=12 # hours
+
+ # termination from a file
ManualTermination::termination_from_file=yes
ManualTermination::check_file_every=10 #evolution steps
ManualTermination::output_remtime_every_minutes=2 # how often to remind user
@@ -85,11 +80,30 @@ ManualTermination::output_remtime_every_minutes=2 # how often to remind user
\section{Use}
+The two modes, termination by wall time and termination from file, are
+meant to be independent and can be used together or separately.
+
+The default file checked is
+\texttt{/tmp/cactus\_terminate.\textit{job\_id}},
+where by default, \texttt{\textit{job\_id}} is taken from the \texttt{PBS\_JOBID}
+environment variable. If the environment variable
+\texttt{MANUAL\_TERMINATION\_JOB\_ID} is set, that will be used instead
+as the \texttt{\textit{job\_id}}.
+
+In this configuration, any user may terminate the run by putting a '1' into
+the specified file.
+
+The termination file is removed when the run shuts down.
+
+It should be possible to use thorn \textbf{ManualTermination} with thorn
+\textbf{JobChaining}. If a job is terminated by \textbf{ManualTermination},
+\textbf{JobChaining} will not attempt to re-queue the simulation.
+
\section{Licensing and Support}
-Thorn \textbf{JobChaining} is distributed under the GNU Lesser Public
+Thorn \textbf{ManualTermination} is distributed under the GNU Lesser General Public
License.
-For details please see the file \texttt{COPYING.LIB} in the top-level
+For details please see the file \texttt{README} in the top-level
directory of this thorn.
Please send any suggestions or comments to the maintainer of the thorn.
diff --git a/schedule.ccl b/schedule.ccl
index 99684d3..971a67d 100644
--- a/schedule.ccl
+++ b/schedule.ccl
@@ -23,8 +23,18 @@ if (on_remaining_walltime > 0)
if (termination_from_file)
{
+ schedule ManualTermination_Init at WRAGH
+ {
+ LANG:C
+ } "Initialise manual termination"
+
schedule ManualTerminationFile at EVOL
{
LANG:C
} "Check termination file"
+
+ schedule ManualTermination_Cleanup at SHUTDOWN
+ {
+ LANG:C
+ } "Clean up termination file"
}
diff --git a/src/ManualTermination.c b/src/ManualTermination.c
index 0dce2fe..e0e32c4 100644
--- a/src/ManualTermination.c
+++ b/src/ManualTermination.c
@@ -1,39 +1,12 @@
-#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <errno.h>
#include "cctk.h"
#include "cctk_Arguments.h"
#include "cctk_Parameters.h"
#include "cctk_Termination.h"
#include "cctk_Timers.h"
-#include "ManualTermination.h"
-enum{ BUFLEN = 128 };
-
-/* On first call, pass parameter terminate_filename.
- If it is null will construct file name in /tmp based on PBS_JOBID.
- Subsequent calls ignore the argument, and return a static buffer.
-*/
-const char * MT_get_terminate_filename( CCTK_STRING terminate_filename )
-{
- static char buf[BUFLEN];
-
- if( strlen( buf ) != 0 )
- return buf;
-
- if( strlen( terminate_filename ) == 0 )
- {
- const char * pbs_jobid = getenv("PBS_JOBID");
- snprintf( buf, BUFLEN, "/tmp/cactus_terminate.%s", pbs_jobid );
- }
- else
- {
- snprintf( buf, BUFLEN, "%s", terminate_filename );
- }
- return buf;
-}
int ManualTermination_StartTimer (CCTK_ARGUMENTS)
{
@@ -50,9 +23,9 @@ int ManualTermination_StartTimer (CCTK_ARGUMENTS)
return (0);
}
/* Create timer */
- TimerIndex = CCTK_TimerCreate("WatchWalltime");
+ TimerIndex = CCTK_TimerCreate ("WatchWalltime");
/* Start timer */
- ierr = CCTK_TimerStart("WatchWalltime");
+ ierr = CCTK_TimerStart ("WatchWalltime");
*watchminutes = output_remtime_every_minutes*1.0e0;
@@ -60,22 +33,6 @@ int ManualTermination_StartTimer (CCTK_ARGUMENTS)
CCTK_VInfo (CCTK_THORNSTRING, "Reminding you every %d "
"minutes about remaining walltime.",
output_remtime_every_minutes);
-
- if( termination_from_file )
- {
- FILE *termfile = fopen( MT_get_terminate_filename(termination_file), "w" );
- if( termfile != NULL )
- {
- fprintf( termfile, "%d", 0 );
- fclose( termfile );
- }
- else
- {
- CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination",
- "Could not open termination file '%s'. Error: %d",
- MT_get_terminate_filename(NULL), strerror(errno));
- }
- }
return (retval);
}
@@ -94,7 +51,7 @@ int ManualTermination_CheckWalltime (CCTK_ARGUMENTS)
DECLARE_CCTK_ARGUMENTS;
DECLARE_CCTK_PARAMETERS;
- int retval,ierr;
+ int retval = 0,ierr;
cTimerData *info;
const cTimerVal *walltime;
CCTK_REAL time;
@@ -104,37 +61,38 @@ int ManualTermination_CheckWalltime (CCTK_ARGUMENTS)
{
return (0);
}
+ if (on_remaining_walltime <= 0)
+ return retval;
- info = CCTK_TimerCreateData();
- ierr = CCTK_Timer("WatchWalltime",info);
-
+ info = CCTK_TimerCreateData ();
+ ierr = CCTK_Timer ("WatchWalltime",info);
/* stop timer */
- ierr = CCTK_TimerStop("WatchWalltime");
+ ierr = CCTK_TimerStop ("WatchWalltime");
/* get walltime */
- walltime = CCTK_GetClockValue("gettimeofday",info);
- time = CCTK_TimerClockSeconds(walltime);
- CCTK_TimerDestroyData(info);
- /* Start timer */
- ierr = CCTK_TimerStart("WatchWalltime");
+ walltime = CCTK_GetClockValue ("gettimeofday",info);
+ time = CCTK_TimerClockSeconds (walltime);
+ CCTK_TimerDestroyData (info);
+ /* start timer */
+ ierr = CCTK_TimerStart ("WatchWalltime");
- if ( (time/60.0e0 > *watchminutes) && *watchminutes != 0)
+ if ((time/60.0e0 > *watchminutes) && *watchminutes != 0)
{
*watchminutes = (*watchminutes)+output_remtime_every_minutes*1.0e0;
- CCTK_INFO ("***********************************************************");
+ CCTK_INFO ("------------------------------------------------------");
CCTK_VInfo (CCTK_THORNSTRING, "Remaining wallclock time for your job "
"is %1.2f minutes. :-)", (max_walltime*60.0-time/60.0));
- CCTK_INFO ("***********************************************************");
+ CCTK_INFO ("------------------------------------------------------");
}
if (time/60.0e0 >= (max_walltime*60.0e0 - on_remaining_walltime*1.0e0))
{
+ CCTK_INFO ("------------------------------------------------------");
CCTK_VInfo (CCTK_THORNSTRING, "Remaining wallclock time for your job "
"is %1.2f minutes. Triggering termination ...",
(max_walltime*60.0-time/60.0));
+ CCTK_INFO ("------------------------------------------------------");
CCTK_TerminateNext (cctkGH);
}
- retval = 0;
-
- return (retval);
+ return retval;
}
diff --git a/src/ManualTermination.h b/src/ManualTermination.h
deleted file mode 100644
index bca3f3c..0000000
--- a/src/ManualTermination.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef MANUALTERMINATION_H
-#define MANUALTERMINATION_H
-
-#include "cctk.h"
-
-const char * MT_get_terminate_filename( CCTK_STRING );
-
-#endif
diff --git a/src/ManualTerminationFile.c b/src/ManualTerminationFile.c
index 342ac3f..7e10a38 100644
--- a/src/ManualTerminationFile.c
+++ b/src/ManualTerminationFile.c
@@ -1,14 +1,68 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <errno.h>
#include "cctk.h"
#include "cctk_Arguments.h"
#include "cctk_Parameters.h"
#include "cctk_Termination.h"
-#include "cctk_Timers.h"
-#include "ManualTermination.h"
+enum{ BUFLEN = 128 };
+/* On first call, pass parameter terminate_filename.
+   If it is empty, builds a name in /tmp from MANUAL_TERMINATION_JOB_ID or PBS_JOBID.
+   Subsequent calls ignore the argument, and return a static buffer.
+*/
+const char * MT_get_terminate_filename (CCTK_STRING terminate_filename)
+{
+ static char buf[BUFLEN];
+
+ if (strlen (buf) != 0)
+ return buf;
+
+ if (strlen (terminate_filename) == 0)
+ {
+ const char * pbs_jobid = getenv ("MANUAL_TERMINATION_JOB_ID");
+ if (pbs_jobid == NULL)
+ pbs_jobid = getenv ("PBS_JOBID");
+
+ if (pbs_jobid == NULL)
+ CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination",
+ "Could not find environment variable "
+ "'MANUAL_TERMINATION_JOB_ID' or 'PBS_JOBID'"
+ );
+ else
+ snprintf (buf, BUFLEN, "/tmp/cactus_terminate.%s", pbs_jobid);
+ }
+ else
+ {
+ snprintf (buf, BUFLEN, "%s", terminate_filename);
+ }
+ return buf;
+}
+
+int ManualTermination_Init (CCTK_ARGUMENTS)
+{
+ DECLARE_CCTK_ARGUMENTS;
+ DECLARE_CCTK_PARAMETERS;
+
+ if (termination_from_file)
+ {
+ FILE *termfile = fopen (MT_get_terminate_filename (termination_file), "w");
+ if (termfile != NULL)
+ {
+ fprintf (termfile, "%d", 0);
+ fclose (termfile);
+ }
+ else
+ {
+ CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, "ManualTermination",
+ "Could not open termination file '%s'. Error: %d",
+ MT_get_terminate_filename (NULL), strerror (errno));
+ }
+ }
+ return 0;
+}
int ManualTerminationFile (CCTK_ARGUMENTS)
{
@@ -20,29 +74,40 @@ int ManualTerminationFile (CCTK_ARGUMENTS)
/* only one processor needs to query the elapsed runtime */
if (CCTK_MyProc (cctkGH) != 0)
{
- return (0);
+ return 0;
}
- if ( ((cctkGH->cctk_iteration-1) % check_file_every*1.0e0) != 0)
+ if (((cctkGH->cctk_iteration- 1) % check_file_every * 1.0e0) != 0)
{
- return(0);
+ return 0;
}
- terminationfile = fopen(MT_get_terminate_filename(NULL),"r");
+ terminationfile = fopen (MT_get_terminate_filename (NULL), "r");
- if(terminationfile!=NULL)
+ if (terminationfile != NULL)
{
- terminate=0;
- fscanf(terminationfile,"%d",&terminate);
- fclose(terminationfile);
+ terminate = 0;
+ fscanf (terminationfile, "%d", &terminate);
+ fclose (terminationfile);
- if (terminate==1)
+ if (terminate == 1)
{
+ CCTK_INFO ("------------------------------------------------------");
CCTK_VInfo (CCTK_THORNSTRING, "OH MY GOD! Found termination signal "
"in termination file! TERMINATION NOW!!!!");
+ CCTK_INFO ("------------------------------------------------------");
CCTK_TerminateNext (cctkGH);
}
}
- return (retval);
+ return retval;
+}
+
+int ManualTermination_Cleanup (CCTK_ARGUMENTS)
+{
+ DECLARE_CCTK_PARAMETERS;
+
+ if (termination_from_file && MT_get_terminate_filename (NULL))
+ remove (MT_get_terminate_filename (NULL));
+ return 0;
}