aboutsummaryrefslogtreecommitdiff
path: root/Carpet/LoopControl
diff options
context:
space:
mode:
authorErik Schnetter <schnetter@cct.lsu.edu>2009-09-21 11:28:26 -0500
committerBarry Wardell <barry.wardell@gmail.com>2011-12-14 16:45:09 +0000
commit29e373ad99f97175fd6443dd7e9307e10cc125f2 (patch)
tree1b424d49d8e2dd079b78821c5fa842e559c91732 /Carpet/LoopControl
parentf5b0376823ed3658847fdf2f3447c87fcdec15ff (diff)
LoopControl: Implement cache-collaborative multi-threading
Ignore-this: 5169757c7749834ae595d4d73b39220 Add a new, additional feature to LoopControl: different threads can work on small regions that are likely to use the same cache entries as other threads, trying to reduce cache pressure. This makes sense mostly when the regions are still expensive although they are small, e.g. for the BSSN RHS.
Diffstat (limited to 'Carpet/LoopControl')
-rw-r--r--Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par20
-rw-r--r--Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par20
-rw-r--r--Carpet/LoopControl/src/loopcontrol.c187
-rw-r--r--Carpet/LoopControl/src/loopcontrol.h32
-rw-r--r--Carpet/LoopControl/src/loopcontrol_fortran.h55
-rw-r--r--Carpet/LoopControl/src/loopcontrol_types.F906
6 files changed, 219 insertions, 101 deletions
diff --git a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par
index c7cac7fdd..93fb29489 100644
--- a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par
+++ b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par
@@ -1,5 +1,4 @@
-Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level"
-Cactus::cctk_timer_output = "full"
+Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level"
Cactus::cctk_itlast = @ITERATIONS@
@@ -11,7 +10,7 @@ ActiveThorns = "Fortran"
ActiveThorns = "LoopControl"
-#LoopControl::printstats = yes
+LoopControl::printstats = yes
LoopControl::legacy_init = no
LoopControl::use_random_restart_hill_climbing = yes
@@ -20,7 +19,7 @@ LoopControl::use_random_restart_hill_climbing = yes
ActiveThorns = "IOUtil"
-#IO::print_timing_info = yes
+IO::out_dir = $parfile
@@ -36,9 +35,7 @@ driver::ghost_size = 3
Carpet::init_fill_timelevels = yes
-CarpetLib::combine_recompose = yes
-
-#CarpetLib::print_timestats_every = 1
+CarpetLib::print_timestats_every = @ITERATIONS@
CarpetLib::print_memstats_every = @ITERATIONS@
@@ -128,5 +125,10 @@ IOBasic::outInfo_every = @ITERATIONS@
ActiveThorns = "TimerReport"
-TimerReport::output_all_timers = yes
-TimerReport::all_timers_clock = "cycle"
+TimerReport::output_all_timers = yes
+TimerReport::all_timers_clock = "cycle"
+TimerReport::out_every = @ITERATIONS@
+TimerReport::out_filename = "TimerReport"
+TimerReport::output_all_timers_together = yes
+TimerReport::output_all_timers_readable = yes
+TimerReport::n_top_timers = 20
diff --git a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par
index dc1bbbdcb..09be6185d 100644
--- a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par
+++ b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par
@@ -1,5 +1,4 @@
-Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level"
-Cactus::cctk_timer_output = "full"
+Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level"
Cactus::cctk_itlast = @ITERATIONS@
@@ -11,7 +10,7 @@ ActiveThorns = "Fortran"
ActiveThorns = "LoopControl"
-#LoopControl::printstats = yes
+LoopControl::printstats = yes
LoopControl::legacy_init = yes
LoopControl::use_random_restart_hill_climbing = no
@@ -20,7 +19,7 @@ LoopControl::use_random_restart_hill_climbing = no
ActiveThorns = "IOUtil"
-#IO::print_timing_info = yes
+IO::out_dir = $parfile
@@ -36,9 +35,9 @@ driver::ghost_size = 3
Carpet::init_fill_timelevels = yes
-CarpetLib::combine_recompose = yes
+CarpetLib::avoid_arraysize_bytes = 0
-#CarpetLib::print_timestats_every = 1
+CarpetLib::print_timestats_every = @ITERATIONS@
CarpetLib::print_memstats_every = @ITERATIONS@
@@ -128,5 +127,10 @@ IOBasic::outInfo_every = @ITERATIONS@
ActiveThorns = "TimerReport"
-TimerReport::output_all_timers = yes
-TimerReport::all_timers_clock = "cycle"
+TimerReport::output_all_timers = yes
+TimerReport::all_timers_clock = "cycle"
+TimerReport::out_every = @ITERATIONS@
+TimerReport::out_filename = "TimerReport"
+TimerReport::output_all_timers_together = yes
+TimerReport::output_all_timers_readable = yes
+TimerReport::n_top_timers = 20
diff --git a/Carpet/LoopControl/src/loopcontrol.c b/Carpet/LoopControl/src/loopcontrol.c
index afc19fec3..34bfe4d6b 100644
--- a/Carpet/LoopControl/src/loopcontrol.c
+++ b/Carpet/LoopControl/src/loopcontrol.c
@@ -63,11 +63,11 @@ lc_statmap_t * lc_statmap_list = NULL;
/* Find all possible thread topologies */
/* This finds all possible thread topologies which can be expressed as
- NIxNJxNK. More complex topologies, e.g. based on a recursive
- subdiviston, are not considered (and cannot be expressed in the
- data structures used in LoopControl). I think more complex
- topologies are not necessary, since the number of treads is usually
- quite small and contains many small factors in its prime
+ NIxNJxNK x NIIxNJJxNKK. More complex topologies, e.g. based on a
+ recursive subdiviston, are not considered (and cannot be expressed
+ with the data structures used in LoopControl). I expect that more
+ complex topologies are not necessary, since the number of treads is
+ usually quite small and contains many small factors in its prime
decomposition. */
static
void
@@ -77,17 +77,36 @@ find_thread_topologies (lc_topology_t * restrict const topologies,
int const nthreads)
{
* ntopologies = 0;
+
for (int nk=1; nk<=nthreads; ++nk) {
if (nthreads % nk == 0) {
for (int nj=1; nj<=nthreads/nk; ++nj) {
if (nthreads % (nj*nk) == 0) {
- int const ni = nthreads/(nj*nk);
- if (nthreads == ni*nj*nk) {
- assert (* ntopologies < maxntopologies);
- topologies[* ntopologies].nthreads[0] = ni;
- topologies[* ntopologies].nthreads[1] = nj;
- topologies[* ntopologies].nthreads[2] = nk;
- ++ * ntopologies;
+ for (int ni=1; ni<=nthreads/(nj*nk); ++ni) {
+ if (nthreads % (ni*nj*nk) == 0) {
+
+ int const nithreads = nthreads/(ni*nj*nk);
+ for (int nkk=1; nkk<=nithreads; ++nkk) {
+ if (nithreads % nkk == 0) {
+ for (int njj=1; njj<=nithreads/nkk; ++njj) {
+ if (nithreads % (njj*nkk) == 0) {
+ int const nii = nithreads/(njj*nkk);
+
+ assert (* ntopologies < maxntopologies);
+ topologies[* ntopologies].nthreads[0][0] = ni;
+ topologies[* ntopologies].nthreads[0][1] = nj;
+ topologies[* ntopologies].nthreads[0][2] = nk;
+ topologies[* ntopologies].nthreads[1][0] = nii;
+ topologies[* ntopologies].nthreads[1][1] = njj;
+ topologies[* ntopologies].nthreads[1][2] = nkk;
+ ++ * ntopologies;
+
+ }
+ }
+ }
+ }
+
+ }
}
}
}
@@ -222,20 +241,28 @@ lc_stattime_init (lc_stattime_t * restrict const lt,
lt->inthreads = -1;
lt->jnthreads = -1;
lt->knthreads = -1;
+ lt->inithreads = -1;
+ lt->jnithreads = -1;
+ lt->knithreads = -1;
} else {
assert (state->topology >= 0 && state->topology < ls->ntopologies);
- lt->inthreads = ls->topologies[lt->state.topology].nthreads[0];
- lt->jnthreads = ls->topologies[lt->state.topology].nthreads[1];
- lt->knthreads = ls->topologies[lt->state.topology].nthreads[2];
+ lt->inthreads = ls->topologies[lt->state.topology].nthreads[0][0];
+ lt->jnthreads = ls->topologies[lt->state.topology].nthreads[0][1];
+ lt->knthreads = ls->topologies[lt->state.topology].nthreads[0][2];
+ lt->inithreads = ls->topologies[lt->state.topology].nthreads[1][0];
+ lt->jnithreads = ls->topologies[lt->state.topology].nthreads[1][1];
+ lt->knithreads = ls->topologies[lt->state.topology].nthreads[1][2];
}
if (debug) {
- printf ("Thread topology #%d [%d,%d,%d]\n",
- lt->state.topology, lt->inthreads, lt->jnthreads, lt->knthreads);
+ printf ("Thread topology #%d [%d,%d,%d]x[%d,%d,%d]\n",
+ lt->state.topology,
+ lt->inthreads, lt->jnthreads, lt->knthreads,
+ lt->inithreads, lt->jnithreads, lt->knithreads);
}
/* Assert thread topology consistency */
@@ -243,7 +270,12 @@ lc_stattime_init (lc_stattime_t * restrict const lt,
assert (lt->inthreads >= 1);
assert (lt->jnthreads >= 1);
assert (lt->knthreads >= 1);
- assert (lt->inthreads * lt->jnthreads * lt->knthreads == ls->num_threads);
+ assert (lt->inithreads >= 1);
+ assert (lt->jnithreads >= 1);
+ assert (lt->knithreads >= 1);
+ assert (lt->inthreads * lt->jnthreads * lt->knthreads *
+ lt->inithreads * lt->jnithreads * lt->knithreads ==
+ ls->num_threads);
}
/*** Tilings ****************************************************************/
@@ -355,37 +387,56 @@ lc_statset_init (lc_statset_t * restrict const ls,
assert (ls);
assert (lm);
assert (num_threads >= 1);
+ int total_npoints = 1;
for (int d=0; d<3; ++d) {
assert (npoints[d] >= 0);
+ assert (npoints[d] < 1000000000);
+ assert (npoints[d] < 1000000000 / total_npoints);
+ total_npoints *= npoints[d];
}
/*** Threads ****************************************************************/
- ls->num_threads = num_threads;
+ static int saved_num_threads = -1;
+ static lc_topology_t * restrict saved_topologies;
+ static int saved_ntopologies;
- /* For up to 1024 threads, there are at most 270 possible
- topologies */
- int const maxntopologies = 1000;
- if (debug) {
- printf ("Running on %d threads\n", ls->num_threads);
- }
- ls->topologies = malloc (maxntopologies * sizeof * ls->topologies);
- find_thread_topologies
- (ls->topologies, maxntopologies, & ls->ntopologies, ls->num_threads);
-#if 0
- ls->topologies =
- realloc (ls->topologies, ls->ntopologies * sizeof * ls->topologies);
-#endif
- if (debug) {
- printf ("Found %d possible thread topologies\n", ls->ntopologies);
- for (int n = 0; n < ls->ntopologies; ++n) {
- printf (" %2d: %2d %2d %2d\n",
- n,
- ls->topologies[n].nthreads[0],
- ls->topologies[n].nthreads[1],
- ls->topologies[n].nthreads[2]);
+ if (saved_num_threads == -1) {
+ saved_num_threads = num_threads;
+
+ /* For up to 1024 threads, there are at most 611556 possible
+ topologies */
+ int const maxntopologies = 1000000;
+ if (debug) {
+ printf ("Running on %d threads\n", num_threads);
+ }
+
+ saved_topologies = malloc (maxntopologies * sizeof * saved_topologies);
+ find_thread_topologies
+ (saved_topologies, maxntopologies, & saved_ntopologies,
+ saved_num_threads);
+ saved_topologies =
+ realloc (saved_topologies, saved_ntopologies * sizeof * saved_topologies);
+
+ if (debug) {
+ printf ("Found %d possible thread topologies\n", saved_ntopologies);
+ for (int n = 0; n < saved_ntopologies; ++n) {
+ printf (" %2d: %2d %2d %2d %2d %2d %2d\n",
+ n,
+ saved_topologies[n].nthreads[0][0],
+ saved_topologies[n].nthreads[0][1],
+ saved_topologies[n].nthreads[0][2],
+ saved_topologies[n].nthreads[1][0],
+ saved_topologies[n].nthreads[1][1],
+ saved_topologies[n].nthreads[1][2]);
+ }
}
}
+
+ assert (saved_num_threads == num_threads);
+ ls->num_threads = saved_num_threads;
+ ls->topologies = saved_topologies;
+ ls->ntopologies = saved_ntopologies;
assert (ls->ntopologies > 0);
/*** Tilings ****************************************************************/
@@ -409,7 +460,9 @@ lc_statset_init (lc_statset_t * restrict const ls,
for (int n = 0; n < ls->ntopologies; ++n) {
int tiling;
for (tiling = 1; tiling < ls->ntilings[d]; ++tiling) {
- if (ls->tilings[d][tiling].npoints * ls->topologies[n].nthreads[d] >
+ if (ls->tilings[d][tiling].npoints *
+ ls->topologies[n].nthreads[0][d] *
+ ls->topologies[n].nthreads[1][d] >
ls->npoints[d])
{
break;
@@ -606,12 +659,12 @@ lc_control_init (lc_control_t * restrict const lc,
if (lc_inthreads == -1 || lc_jnthreads == -1 || lc_knthreads == -1) {
CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING,
- "Illegal thread topology [%d,%d,%d] specified",
+ "Illegal thread topology [%d,%d,%d]x[1,1,1] specified",
(int)lc_inthreads, (int)lc_jnthreads, (int)lc_knthreads);
}
if (lc_inthreads * lc_jnthreads * lc_knthreads != ls->num_threads) {
CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING,
- "Specified thread topology [%d,%d,%d] is not compatible with the number of threads %d",
+ "Specified thread topology [%d,%d,%d]x[1,1,1] is not compatible with the number of threads %d",
(int)lc_inthreads, (int)lc_jnthreads, (int)lc_knthreads,
ls->num_threads);
}
@@ -698,13 +751,21 @@ lc_control_init (lc_control_t * restrict const lc,
lt->inthreads = lc_inthreads;
lt->jnthreads = lc_jnthreads;
lt->knthreads = lc_knthreads;
+ lt->inithreads = 1;
+ lt->jnithreads = 1;
+ lt->knithreads = 1;
}
/* Assert thread topology consistency */
assert (lt->inthreads >= 1);
assert (lt->jnthreads >= 1);
assert (lt->knthreads >= 1);
- assert (lt->inthreads * lt->jnthreads * lt->knthreads == ls->num_threads);
+ assert (lt->inithreads >= 1);
+ assert (lt->jnithreads >= 1);
+ assert (lt->knithreads >= 1);
+ assert (lt->inthreads * lt->jnthreads * lt->knthreads *
+ lt->inithreads * lt->jnithreads * lt->knithreads ==
+ ls->num_threads);
/* Tilings */
@@ -738,7 +799,6 @@ lc_control_init (lc_control_t * restrict const lc,
/*** Threads ****************************************************************/
-
/* Thread loop settings */
lc->iiimin = imin;
lc->jjjmin = jmin;
@@ -762,7 +822,7 @@ lc_control_init (lc_control_t * restrict const lc,
int const ci = c % lt->inthreads; c /= lt->inthreads;
int const cj = c % lt->jnthreads; c /= lt->jnthreads;
int const ck = c % lt->knthreads; c /= lt->knthreads;
- assert (c == 0);
+ /* see below where c is continued to be used */
lc->iii = lc->iiimin + ci * lc->iiistep;
lc->jjj = lc->jjjmin + cj * lc->jjjstep;
lc->kkk = lc->kkkmin + ck * lc->kkkstep;
@@ -789,6 +849,29 @@ lc_control_init (lc_control_t * restrict const lc,
+ /*** Inner threads **********************************************************/
+
+ /* Inner loop thread parallelism */
+ /* (No thread parallelism yet) */
+ int const cii = c % lt->inithreads; c /= lt->inithreads;
+ int const cjj = c % lt->jnithreads; c /= lt->jnithreads;
+ int const ckk = c % lt->knithreads; c /= lt->knithreads;
+ assert (c == 0);
+ lc->iiii = cii;
+ lc->jjjj = cjj;
+ lc->kkkk = ckk;
+ lc->iiiistep = (lc->iistep + lt->inithreads - 1) / lt->inithreads;
+ lc->jjjjstep = (lc->jjstep + lt->jnithreads - 1) / lt->jnithreads;
+ lc->kkkkstep = (lc->kkstep + lt->knithreads - 1) / lt->knithreads;
+ lc->iiiimin = lc->iiii * lc->iiiistep;
+ lc->jjjjmin = lc->jjjj * lc->jjjjstep;
+ lc->kkkkmin = lc->kkkk * lc->kkkkstep;
+ lc->iiiimax = lc->iiiimin + lc->iiiistep;
+ lc->jjjjmax = lc->jjjjmin + lc->jjjjstep;
+ lc->kkkkmax = lc->kkkkmin + lc->kkkkstep;
+
+
+
/****************************************************************************/
/* Timer */
@@ -810,6 +893,10 @@ lc_control_finish (lc_control_t * restrict const lc)
ignore_iteration = ignore_initial_overhead && lt->time_count == 0.0;
}
+ /* Add a barrier to catch load imbalances */
+#pragma omp barrier
+ ;
+
/* Timer */
double const time_calc_end = omp_get_wtime();
double const time_calc_begin = lc->time_calc_begin;
@@ -849,7 +936,7 @@ lc_control_finish (lc_control_t * restrict const lc)
lt->last_updated = time_calc_end;
}
-#pragma omp barrier
+/* #pragma omp barrier */
{
DECLARE_CCTK_PARAMETERS;
@@ -910,9 +997,11 @@ lc_printstats (CCTK_ARGUMENTS)
int imin_calc = -1;
int ntimes = 0;
for (lc_stattime_t * lt = ls->stattime_list; lt; lt = lt->next) {
- printf (" stattime #%d topology=%d [%d,%d,%d] tiling=[%d,%d,%d]\n",
+ printf (" stattime #%d topology=%d [%d,%d,%d]x[%d,%d,%d] tiling=[%d,%d,%d]\n",
ntimes,
- lt->state.topology, lt->inthreads, lt->jnthreads, lt->knthreads,
+ lt->state.topology,
+ lt->inthreads, lt->jnthreads, lt->knthreads,
+ lt->inithreads, lt->jnithreads, lt->knithreads,
lt->inpoints, lt->jnpoints, lt->knpoints);
double const count = lt->time_count;
double const setup = lt->time_setup_sum / count;
diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h
index 7f882617c..0f3b6d80d 100644
--- a/Carpet/LoopControl/src/loopcontrol.h
+++ b/Carpet/LoopControl/src/loopcontrol.h
@@ -56,7 +56,7 @@ extern "C" {
/* A topology */
typedef struct lc_topology_t {
- int nthreads[3];
+ int nthreads[2][3]; /* [0:outer|1:inner][ijk] */
} lc_topology_t;
/* A tiling specification */
@@ -90,6 +90,7 @@ typedef struct lc_stattime_t {
lc_state_t state;
int inthreads, jnthreads, knthreads;
+ int inithreads, jnithreads, knithreads;
int inpoints, jnpoints, knpoints;
/* Data */
@@ -250,12 +251,18 @@ typedef struct lc_control_t {
/* Control settings for current thread (useful for debugging) */
int thread_num;
int iii, jjj, kkk;
+ int iiii, jjjj, kkkk;
/* Control settings for tiling loop */
int iimin, jjmin, kkmin;
int iimax, jjmax, kkmax;
int iistep, jjstep, kkstep;
+ /* Control settings for inner thread parallelism */
+ int iiiimin, jjjjmin, kkkkmin;
+ int iiiimax, jjjjmax, kkkkmax;
+ int iiiistep, jjjjstep, kkkkstep;
+
/* Timing statistics */
double time_setup_begin, time_calc_begin;
} lc_control_t;
@@ -319,39 +326,44 @@ lc_control_finish (lc_control_t * restrict lc);
lc_kk < lc_lc.kkmax; \
lc_kk += lc_lc.kkstep) \
{ \
- int const lc_kmax = lc_min (lc_kk + lc_lc.kkstep, lc_lc.kkmax); \
+ int const lc_kmin = lc_kk + lc_lc.kkkkmin; \
+ int const lc_kmax = \
+ lc_min (lc_kk + lc_lc.kkkkmax, lc_lc.kkmax); \
\
for (int lc_jj = lc_lc.jjmin; \
lc_jj < lc_lc.jjmax; \
lc_jj += lc_lc.jjstep) \
{ \
- int const lc_jmax = lc_min (lc_jj + lc_lc.jjstep, lc_lc.jjmax); \
+ int const lc_jmin = lc_jj + lc_lc.jjjjmin; \
+ int const lc_jmax = \
+ lc_min (lc_jj + lc_lc.jjjjmax, lc_lc.jjmax); \
\
for (int lc_ii = lc_lc.iimin; \
lc_ii < lc_lc.iimax; \
lc_ii += lc_lc.iistep) \
{ \
- int const lc_imax = lc_min (lc_ii + lc_lc.iistep, lc_lc.iimax); \
+ int const lc_imin = lc_ii + lc_lc.iiiimin; \
+ int const lc_imax = \
+ lc_min (lc_ii + lc_lc.iiiimax, lc_lc.iimax); \
\
/* Fine loop */ \
- for (int k = lc_kk; k < lc_kmax; ++k) { \
- for (int j = lc_jj; j < lc_jmax; ++j) { \
- int const lc_imin = lc_ii; \
+ for (int k = lc_kmin; k < lc_kmax; ++k) { \
+ for (int j = lc_jmin; j < lc_jmax; ++j) { \
LC_PRELOOP_STATEMENTS \
{ \
for (int i = lc_imin; i < lc_imax; ++i) {
#define LC_ENDLOOP3(name) \
- } \
} \
- LC_POSTLOOP_STATEMENTS \
} \
+ LC_POSTLOOP_STATEMENTS \
} \
} \
} \
} \
+ } \
lc_control_finish (& lc_lc); \
- } while (0)
+ } while (0)
/* Pre- and post loop statements are inserted around the innermost
loop, which is executed serially. By default these are empty. */
diff --git a/Carpet/LoopControl/src/loopcontrol_fortran.h b/Carpet/LoopControl/src/loopcontrol_fortran.h
index 7edf22d84..94020ac28 100644
--- a/Carpet/LoopControl/src/loopcontrol_fortran.h
+++ b/Carpet/LoopControl/src/loopcontrol_fortran.h
@@ -10,41 +10,46 @@ type (lc_statmap_t), save :: name/**/_lm &&\
logical, save :: name/**/_initialised = .false. &&\
type (lc_control_t) :: name/**/_lc &&\
integer :: name/**/_ii, name/**/_jj, name/**/_kk &&\
+integer :: name/**/_imin, name/**/_jmin, name/**/_kmin &&\
integer :: name/**/_imax, name/**/_jmax, name/**/_kmax &&\
integer :: i, j, k
-#define LC_PRIVATE3(name) \
-name/**/_lc, \
+#define LC_PRIVATE3(name) \
+name/**/_lc, \
+name/**/_imin, name/**/_jmin, name/**/_kmin, \
name/**/_imax, name/**/_jmax, name/**/_kmax
-#define LC_LOOP3(name, i,j,k, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) &&\
-if (.not. name/**/_initialised) then &&\
-!$omp single &&\
- call lc_statmap_init (name/**/_lm, "name") &&\
-!$omp end single &&\
-!$omp single &&\
- /* Set this flag only after initialising */ &&\
- name/**/_initialised = .true. &&\
-!$omp end single &&\
-end if &&\
+#define LC_LOOP3(name, i,j,k, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) &&\
+if (.not. name/**/_initialised) then &&\
+!$omp single &&\
+ call lc_statmap_init (name/**/_lm, "name") &&\
+!$omp end single &&\
+!$omp single &&\
+ /* Set this flag only after initialising */ &&\
+ name/**/_initialised = .true. &&\
+!$omp end single &&\
+end if &&\
call lc_control_init (name/**/_lc, name/**/_lm, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) &&\
- &&\
-/* Coarse loop */ &&\
-do name/**/_kk = name/**/_lc%kkmin + 1, name/**/_lc%kkmax, name/**/_lc%kkstep &&\
- name/**/_kmax = min (name/**/_kk - 1 + name/**/_lc%kkstep, name/**/_lc%kkmax) &&\
- do name/**/_jj = name/**/_lc%jjmin + 1, name/**/_lc%jjmax, name/**/_lc%jjstep &&\
- name/**/_jmax = min (name/**/_jj - 1 + name/**/_lc%jjstep, name/**/_lc%jjmax) &&\
- do name/**/_ii = name/**/_lc%iimin + 1, name/**/_lc%iimax, name/**/_lc%iistep &&\
- name/**/_imax = min (name/**/_ii - 1 + name/**/_lc%iistep, name/**/_lc%iimax) &&\
- &&\
- /* Fine loop */ &&\
- do k = name/**/_kk, name/**/_kmax &&\
- do j = name/**/_jj, name/**/_jmax &&\
- do i = name/**/_ii, name/**/_imax
+ &&\
+/* Coarse loop */ &&\
+do name/**/_kk = name/**/_lc%kkmin + 1, name/**/_lc%kkmax, name/**/_lc%kkstep &&\
+ name/**/_kmin = name/**/_kk + name/**/_lc%kkkkmax &&\
+ name/**/_kmax = min (name/**/_kmin - 1 + name/**/_lc%kkkkstep, name/**/_lc%kkmax) &&\
+ do name/**/_jj = name/**/_lc%jjmin + 1, name/**/_lc%jjmax, name/**/_lc%jjstep &&\
+ name/**/_jmin = name/**/_jj + name/**/_lc%jjjjmax &&\
+ name/**/_jmax = min (name/**/_jj - 1 + name/**/_lc%jjjjmax, name/**/_lc%jjmax) &&\
+ do name/**/_ii = name/**/_lc%iimin + 1, name/**/_lc%iimax, name/**/_lc%iistep &&\
+ name/**/_imin = name/**/_ii + name/**/_lc%iiiimax &&\
+ name/**/_imax = min (name/**/_ii - 1 + name/**/_lc%iiiimax, name/**/_lc%iimax) &&\
+ &&\
+ /* Fine loop */ &&\
+ do k = name/**/_kmin, name/**/_kmax &&\
+ do j = name/**/_jmin, name/**/_jmax &&\
+ do i = name/**/_imin, name/**/_imax
diff --git a/Carpet/LoopControl/src/loopcontrol_types.F90 b/Carpet/LoopControl/src/loopcontrol_types.F90
index 7fcd60964..843d6bdd0 100644
--- a/Carpet/LoopControl/src/loopcontrol_types.F90
+++ b/Carpet/LoopControl/src/loopcontrol_types.F90
@@ -37,12 +37,18 @@ module loopcontrol_types
! Control settings for current thread (useful for debugging)
integer thread_num
integer iii, jjj, kkk
+ integer iiii, jjjj, kkkk
! Control settings for tiling loop
integer iimin, jjmin, kkmin
integer iimax, jjmax, kkmax
integer iistep, jjstep, kkstep
+ ! Control settings for inner thread parallelism
+ integer iiiimin, jjjjmin, kkkkmin
+ integer iiiimax, jjjjmax, kkkkmax
+ integer iiiistep, jjjjstep, kkkkstep
+
! Timing statistics
double precision time_setup_begin, time_calc_begin
end type lc_control_t