diff options
author | Erik Schnetter <schnetter@cct.lsu.edu> | 2009-09-21 11:28:26 -0500 |
---|---|---|
committer | Barry Wardell <barry.wardell@gmail.com> | 2011-12-14 16:45:09 +0000 |
commit | 29e373ad99f97175fd6443dd7e9307e10cc125f2 (patch) | |
tree | 1b424d49d8e2dd079b78821c5fa842e559c91732 /Carpet/LoopControl | |
parent | f5b0376823ed3658847fdf2f3447c87fcdec15ff (diff) |
LoopControl: Implement cache-collaborative multi-threading
Ignore-this: 5169757c7749834ae595d4d73b39220
Add a new, additional feature to LoopControl: different threads can
work on small regions that are likely to use the same cache entries as
other threads, trying to reduce cache pressure. This makes sense
mostly when the regions are still expensive although they are small,
e.g. for the BSSN RHS.
Diffstat (limited to 'Carpet/LoopControl')
-rw-r--r-- | Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par | 20 | ||||
-rw-r--r-- | Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par | 20 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.c | 187 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.h | 32 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol_fortran.h | 55 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol_types.F90 | 6 |
6 files changed, 219 insertions, 101 deletions
diff --git a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par index c7cac7fdd..93fb29489 100644 --- a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par +++ b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.hill.par @@ -1,5 +1,4 @@ -Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level" -Cactus::cctk_timer_output = "full" +Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level" Cactus::cctk_itlast = @ITERATIONS@ @@ -11,7 +10,7 @@ ActiveThorns = "Fortran" ActiveThorns = "LoopControl" -#LoopControl::printstats = yes +LoopControl::printstats = yes LoopControl::legacy_init = no LoopControl::use_random_restart_hill_climbing = yes @@ -20,7 +19,7 @@ LoopControl::use_random_restart_hill_climbing = yes ActiveThorns = "IOUtil" -#IO::print_timing_info = yes +IO::out_dir = $parfile @@ -36,9 +35,7 @@ driver::ghost_size = 3 Carpet::init_fill_timelevels = yes -CarpetLib::combine_recompose = yes - -#CarpetLib::print_timestats_every = 1 +CarpetLib::print_timestats_every = @ITERATIONS@ CarpetLib::print_memstats_every = @ITERATIONS@ @@ -128,5 +125,10 @@ IOBasic::outInfo_every = @ITERATIONS@ ActiveThorns = "TimerReport" -TimerReport::output_all_timers = yes -TimerReport::all_timers_clock = "cycle" +TimerReport::output_all_timers = yes +TimerReport::all_timers_clock = "cycle" +TimerReport::out_every = @ITERATIONS@ +TimerReport::out_filename = "TimerReport" +TimerReport::output_all_timers_together = yes +TimerReport::output_all_timers_readable = yes +TimerReport::n_top_timers = 20 diff --git a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par index dc1bbbdcb..09be6185d 100644 --- a/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par +++ b/Carpet/LoopControl/par/Bench_McLachlan_Carpet_1lev.legacy.par @@ -1,5 +1,4 @@ -Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level" -Cactus::cctk_timer_output = "full" +Cactus::cctk_run_title = "Benchmark of McLachlan using Carpet with one refinement level" Cactus::cctk_itlast = @ITERATIONS@ @@ -11,7 +10,7 @@ ActiveThorns = "Fortran" ActiveThorns = "LoopControl" -#LoopControl::printstats = yes +LoopControl::printstats = yes LoopControl::legacy_init = yes LoopControl::use_random_restart_hill_climbing = no @@ -20,7 +19,7 @@ LoopControl::use_random_restart_hill_climbing = no ActiveThorns = "IOUtil" -#IO::print_timing_info = yes +IO::out_dir = $parfile @@ -36,9 +35,9 @@ driver::ghost_size = 3 Carpet::init_fill_timelevels = yes -CarpetLib::combine_recompose = yes +CarpetLib::avoid_arraysize_bytes = 0 -#CarpetLib::print_timestats_every = 1 +CarpetLib::print_timestats_every = @ITERATIONS@ CarpetLib::print_memstats_every = @ITERATIONS@ @@ -128,5 +127,10 @@ IOBasic::outInfo_every = @ITERATIONS@ ActiveThorns = "TimerReport" -TimerReport::output_all_timers = yes -TimerReport::all_timers_clock = "cycle" +TimerReport::output_all_timers = yes +TimerReport::all_timers_clock = "cycle" +TimerReport::out_every = @ITERATIONS@ +TimerReport::out_filename = "TimerReport" +TimerReport::output_all_timers_together = yes +TimerReport::output_all_timers_readable = yes +TimerReport::n_top_timers = 20 diff --git a/Carpet/LoopControl/src/loopcontrol.c b/Carpet/LoopControl/src/loopcontrol.c index afc19fec3..34bfe4d6b 100644 --- a/Carpet/LoopControl/src/loopcontrol.c +++ b/Carpet/LoopControl/src/loopcontrol.c @@ -63,11 +63,11 @@ lc_statmap_t * lc_statmap_list = NULL; /* Find all possible thread topologies */ /* This finds all possible thread topologies which can be expressed as - NIxNJxNK. More complex topologies, e.g. based on a recursive - subdiviston, are not considered (and cannot be expressed in the - data structures used in LoopControl). I think more complex - topologies are not necessary, since the number of treads is usually - quite small and contains many small factors in its prime + NIxNJxNK x NIIxNJJxNKK. More complex topologies, e.g. based on a + recursive subdiviston, are not considered (and cannot be expressed + with the data structures used in LoopControl). I expect that more + complex topologies are not necessary, since the number of treads is + usually quite small and contains many small factors in its prime decomposition. */ static void @@ -77,17 +77,36 @@ find_thread_topologies (lc_topology_t * restrict const topologies, int const nthreads) { * ntopologies = 0; + for (int nk=1; nk<=nthreads; ++nk) { if (nthreads % nk == 0) { for (int nj=1; nj<=nthreads/nk; ++nj) { if (nthreads % (nj*nk) == 0) { - int const ni = nthreads/(nj*nk); - if (nthreads == ni*nj*nk) { - assert (* ntopologies < maxntopologies); - topologies[* ntopologies].nthreads[0] = ni; - topologies[* ntopologies].nthreads[1] = nj; - topologies[* ntopologies].nthreads[2] = nk; - ++ * ntopologies; + for (int ni=1; ni<=nthreads/(nj*nk); ++ni) { + if (nthreads % (ni*nj*nk) == 0) { + + int const nithreads = nthreads/(ni*nj*nk); + for (int nkk=1; nkk<=nithreads; ++nkk) { + if (nithreads % nkk == 0) { + for (int njj=1; njj<=nithreads/nkk; ++njj) { + if (nithreads % (njj*nkk) == 0) { + int const nii = nithreads/(njj*nkk); + + assert (* ntopologies < maxntopologies); + topologies[* ntopologies].nthreads[0][0] = ni; + topologies[* ntopologies].nthreads[0][1] = nj; + topologies[* ntopologies].nthreads[0][2] = nk; + topologies[* ntopologies].nthreads[1][0] = nii; + topologies[* ntopologies].nthreads[1][1] = njj; + topologies[* ntopologies].nthreads[1][2] = nkk; + ++ * ntopologies; + + } + } + } + } + + } } } } @@ -222,20 +241,28 @@ lc_stattime_init (lc_stattime_t * restrict const lt, lt->inthreads = -1; lt->jnthreads = -1; lt->knthreads = -1; + lt->inithreads = -1; + lt->jnithreads = -1; + lt->knithreads = -1; } else { assert (state->topology >= 0 && state->topology < ls->ntopologies); - lt->inthreads = ls->topologies[lt->state.topology].nthreads[0]; - lt->jnthreads = ls->topologies[lt->state.topology].nthreads[1]; - lt->knthreads = ls->topologies[lt->state.topology].nthreads[2]; + lt->inthreads = ls->topologies[lt->state.topology].nthreads[0][0]; + lt->jnthreads = ls->topologies[lt->state.topology].nthreads[0][1]; + lt->knthreads = ls->topologies[lt->state.topology].nthreads[0][2]; + lt->inithreads = ls->topologies[lt->state.topology].nthreads[1][0]; + lt->jnithreads = ls->topologies[lt->state.topology].nthreads[1][1]; + lt->knithreads = ls->topologies[lt->state.topology].nthreads[1][2]; } if (debug) { - printf ("Thread topology #%d [%d,%d,%d]\n", - lt->state.topology, lt->inthreads, lt->jnthreads, lt->knthreads); + printf ("Thread topology #%d [%d,%d,%d]x[%d,%d,%d]\n", + lt->state.topology, + lt->inthreads, lt->jnthreads, lt->knthreads, + lt->inithreads, lt->jnithreads, lt->knithreads); } /* Assert thread topology consistency */ @@ -243,7 +270,12 @@ lc_stattime_init (lc_stattime_t * restrict const lt, assert (lt->inthreads >= 1); assert (lt->jnthreads >= 1); assert (lt->knthreads >= 1); - assert (lt->inthreads * lt->jnthreads * lt->knthreads == ls->num_threads); + assert (lt->inithreads >= 1); + assert (lt->jnithreads >= 1); + assert (lt->knithreads >= 1); + assert (lt->inthreads * lt->jnthreads * lt->knthreads * + lt->inithreads * lt->jnithreads * lt->knithreads == + ls->num_threads); } /*** Tilings ****************************************************************/ @@ -355,37 +387,56 @@ lc_statset_init (lc_statset_t * restrict const ls, assert (ls); assert (lm); assert (num_threads >= 1); + int total_npoints = 1; for (int d=0; d<3; ++d) { assert (npoints[d] >= 0); + assert (npoints[d] < 1000000000); + assert (npoints[d] < 1000000000 / total_npoints); + total_npoints *= npoints[d]; } /*** Threads ****************************************************************/ - ls->num_threads = num_threads; + static int saved_num_threads = -1; + static lc_topology_t * restrict saved_topologies; + static int saved_ntopologies; - /* For up to 1024 threads, there are at most 270 possible - topologies */ - int const maxntopologies = 1000; - if (debug) { - printf ("Running on %d threads\n", ls->num_threads); - } - ls->topologies = malloc (maxntopologies * sizeof * ls->topologies); - find_thread_topologies - (ls->topologies, maxntopologies, & ls->ntopologies, ls->num_threads); -#if 0 - ls->topologies = - realloc (ls->topologies, ls->ntopologies * sizeof * ls->topologies); -#endif - if (debug) { - printf ("Found %d possible thread topologies\n", ls->ntopologies); - for (int n = 0; n < ls->ntopologies; ++n) { - printf (" %2d: %2d %2d %2d\n", - n, - ls->topologies[n].nthreads[0], - ls->topologies[n].nthreads[1], - ls->topologies[n].nthreads[2]); + if (saved_num_threads == -1) { + saved_num_threads = num_threads; + + /* For up to 1024 threads, there are at most 611556 possible + topologies */ + int const maxntopologies = 1000000; + if (debug) { + printf ("Running on %d threads\n", num_threads); + } + + saved_topologies = malloc (maxntopologies * sizeof * saved_topologies); + find_thread_topologies + (saved_topologies, maxntopologies, & saved_ntopologies, + saved_num_threads); + saved_topologies = + realloc (saved_topologies, saved_ntopologies * sizeof * saved_topologies); + + if (debug) { + printf ("Found %d possible thread topologies\n", saved_ntopologies); + for (int n = 0; n < saved_ntopologies; ++n) { + printf (" %2d: %2d %2d %2d %2d %2d %2d\n", + n, + saved_topologies[n].nthreads[0][0], + saved_topologies[n].nthreads[0][1], + saved_topologies[n].nthreads[0][2], + saved_topologies[n].nthreads[1][0], + saved_topologies[n].nthreads[1][1], + saved_topologies[n].nthreads[1][2]); + } } } + + assert (saved_num_threads == num_threads); + ls->num_threads = saved_num_threads; + ls->topologies = saved_topologies; + ls->ntopologies = saved_ntopologies; assert (ls->ntopologies > 0); /*** Tilings ****************************************************************/ @@ -409,7 +460,9 @@ lc_statset_init (lc_statset_t * restrict const ls, for (int n = 0; n < ls->ntopologies; ++n) { int tiling; for (tiling = 1; tiling < ls->ntilings[d]; ++tiling) { - if (ls->tilings[d][tiling].npoints * ls->topologies[n].nthreads[d] > + if (ls->tilings[d][tiling].npoints * + ls->topologies[n].nthreads[0][d] * + ls->topologies[n].nthreads[1][d] > ls->npoints[d]) { break; @@ -606,12 +659,12 @@ lc_control_init (lc_control_t * restrict const lc, if (lc_inthreads == -1 || lc_jnthreads == -1 || lc_knthreads == -1) { CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, - "Illegal thread topology [%d,%d,%d] specified", + "Illegal thread topology [%d,%d,%d]x[1,1,1] specified", (int)lc_inthreads, (int)lc_jnthreads, (int)lc_knthreads); } if (lc_inthreads * lc_jnthreads * lc_knthreads != ls->num_threads) { CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, - "Specified thread topology [%d,%d,%d] is not compatible with the number of threads %d", + "Specified thread topology [%d,%d,%d]x[1,1,1] is not compatible with the number of threads %d", (int)lc_inthreads, (int)lc_jnthreads, (int)lc_knthreads, ls->num_threads); } @@ -698,13 +751,21 @@ lc_control_init (lc_control_t * restrict const lc, lt->inthreads = lc_inthreads; lt->jnthreads = lc_jnthreads; lt->knthreads = lc_knthreads; + lt->inithreads = 1; + lt->jnithreads = 1; + lt->knithreads = 1; } /* Assert thread topology consistency */ assert (lt->inthreads >= 1); assert (lt->jnthreads >= 1); assert (lt->knthreads >= 1); - assert (lt->inthreads * lt->jnthreads * lt->knthreads == ls->num_threads); + assert (lt->inithreads >= 1); + assert (lt->jnithreads >= 1); + assert (lt->knithreads >= 1); + assert (lt->inthreads * lt->jnthreads * lt->knthreads * + lt->inithreads * lt->jnithreads * lt->knithreads == + ls->num_threads); /* Tilings */ @@ -738,7 +799,6 @@ lc_control_init (lc_control_t * restrict const lc, /*** Threads ****************************************************************/ - /* Thread loop settings */ lc->iiimin = imin; lc->jjjmin = jmin; @@ -762,7 +822,7 @@ lc_control_init (lc_control_t * restrict const lc, int const ci = c % lt->inthreads; c /= lt->inthreads; int const cj = c % lt->jnthreads; c /= lt->jnthreads; int const ck = c % lt->knthreads; c /= lt->knthreads; - assert (c == 0); + /* see below where c is continued to be used */ lc->iii = lc->iiimin + ci * lc->iiistep; lc->jjj = lc->jjjmin + cj * lc->jjjstep; lc->kkk = lc->kkkmin + ck * lc->kkkstep; @@ -789,6 +849,29 @@ lc_control_init (lc_control_t * restrict const lc, + /*** Inner threads **********************************************************/ + + /* Inner loop thread parallelism */ + /* (No thread parallelism yet) */ + int const cii = c % lt->inithreads; c /= lt->inithreads; + int const cjj = c % lt->jnithreads; c /= lt->jnithreads; + int const ckk = c % lt->knithreads; c /= lt->knithreads; + assert (c == 0); + lc->iiii = cii; + lc->jjjj = cjj; + lc->kkkk = ckk; + lc->iiiistep = (lc->iistep + lt->inithreads - 1) / lt->inithreads; + lc->jjjjstep = (lc->jjstep + lt->jnithreads - 1) / lt->jnithreads; + lc->kkkkstep = (lc->kkstep + lt->knithreads - 1) / lt->knithreads; + lc->iiiimin = lc->iiii * lc->iiiistep; + lc->jjjjmin = lc->jjjj * lc->jjjjstep; + lc->kkkkmin = lc->kkkk * lc->kkkkstep; + lc->iiiimax = lc->iiiimin + lc->iiiistep; + lc->jjjjmax = lc->jjjjmin + lc->jjjjstep; + lc->kkkkmax = lc->kkkkmin + lc->kkkkstep; + + + /****************************************************************************/ /* Timer */ @@ -810,6 +893,10 @@ lc_control_finish (lc_control_t * restrict const lc) ignore_iteration = ignore_initial_overhead && lt->time_count == 0.0; } + /* Add a barrier to catch load imbalances */ +#pragma omp barrier + ; + /* Timer */ double const time_calc_end = omp_get_wtime(); double const time_calc_begin = lc->time_calc_begin; @@ -849,7 +936,7 @@ lc_control_finish (lc_control_t * restrict const lc) lt->last_updated = time_calc_end; } -#pragma omp barrier +/* #pragma omp barrier */ { DECLARE_CCTK_PARAMETERS; @@ -910,9 +997,11 @@ lc_printstats (CCTK_ARGUMENTS) int imin_calc = -1; int ntimes = 0; for (lc_stattime_t * lt = ls->stattime_list; lt; lt = lt->next) { - printf (" stattime #%d topology=%d [%d,%d,%d] tiling=[%d,%d,%d]\n", + printf (" stattime #%d topology=%d [%d,%d,%d]x[%d,%d,%d] tiling=[%d,%d,%d]\n", ntimes, - lt->state.topology, lt->inthreads, lt->jnthreads, lt->knthreads, + lt->state.topology, + lt->inthreads, lt->jnthreads, lt->knthreads, + lt->inithreads, lt->jnithreads, lt->knithreads, lt->inpoints, lt->jnpoints, lt->knpoints); double const count = lt->time_count; double const setup = lt->time_setup_sum / count; diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h index 7f882617c..0f3b6d80d 100644 --- a/Carpet/LoopControl/src/loopcontrol.h +++ b/Carpet/LoopControl/src/loopcontrol.h @@ -56,7 +56,7 @@ extern "C" { /* A topology */ typedef struct lc_topology_t { - int nthreads[3]; + int nthreads[2][3]; /* [0:outer|1:inner][ijk] */ } lc_topology_t; /* A tiling specification */ @@ -90,6 +90,7 @@ typedef struct lc_stattime_t { lc_state_t state; int inthreads, jnthreads, knthreads; + int inithreads, jnithreads, knithreads; int inpoints, jnpoints, knpoints; /* Data */ @@ -250,12 +251,18 @@ typedef struct lc_control_t { /* Control settings for current thread (useful for debugging) */ int thread_num; int iii, jjj, kkk; + int iiii, jjjj, kkkk; /* Control settings for tiling loop */ int iimin, jjmin, kkmin; int iimax, jjmax, kkmax; int iistep, jjstep, kkstep; + /* Control settings for inner thread parallelism */ + int iiiimin, jjjjmin, kkkkmin; + int iiiimax, jjjjmax, kkkkmax; + int iiiistep, jjjjstep, kkkkstep; + /* Timing statistics */ double time_setup_begin, time_calc_begin; } lc_control_t; @@ -319,39 +326,44 @@ lc_control_finish (lc_control_t * restrict lc); lc_kk < lc_lc.kkmax; \ lc_kk += lc_lc.kkstep) \ { \ - int const lc_kmax = lc_min (lc_kk + lc_lc.kkstep, lc_lc.kkmax); \ + int const lc_kmin = lc_kk + lc_lc.kkkkmin; \ + int const lc_kmax = \ + lc_min (lc_kk + lc_lc.kkkkmax, lc_lc.kkmax); \ \ for (int lc_jj = lc_lc.jjmin; \ lc_jj < lc_lc.jjmax; \ lc_jj += lc_lc.jjstep) \ { \ - int const lc_jmax = lc_min (lc_jj + lc_lc.jjstep, lc_lc.jjmax); \ + int const lc_jmin = lc_jj + lc_lc.jjjjmin; \ + int const lc_jmax = \ + lc_min (lc_jj + lc_lc.jjjjmax, lc_lc.jjmax); \ \ for (int lc_ii = lc_lc.iimin; \ lc_ii < lc_lc.iimax; \ lc_ii += lc_lc.iistep) \ { \ - int const lc_imax = lc_min (lc_ii + lc_lc.iistep, lc_lc.iimax); \ + int const lc_imin = lc_ii + lc_lc.iiiimin; \ + int const lc_imax = \ + lc_min (lc_ii + lc_lc.iiiimax, lc_lc.iimax); \ \ /* Fine loop */ \ - for (int k = lc_kk; k < lc_kmax; ++k) { \ - for (int j = lc_jj; j < lc_jmax; ++j) { \ - int const lc_imin = lc_ii; \ + for (int k = lc_kmin; k < lc_kmax; ++k) { \ + for (int j = lc_jmin; j < lc_jmax; ++j) { \ LC_PRELOOP_STATEMENTS \ { \ for (int i = lc_imin; i < lc_imax; ++i) { #define LC_ENDLOOP3(name) \ - } \ } \ - LC_POSTLOOP_STATEMENTS \ } \ + LC_POSTLOOP_STATEMENTS \ } \ } \ } \ } \ + } \ lc_control_finish (& lc_lc); \ - } while (0) + } while (0) /* Pre- and post loop statements are inserted around the innermost loop, which is executed serially. By default these are empty. */ diff --git a/Carpet/LoopControl/src/loopcontrol_fortran.h b/Carpet/LoopControl/src/loopcontrol_fortran.h index 7edf22d84..94020ac28 100644 --- a/Carpet/LoopControl/src/loopcontrol_fortran.h +++ b/Carpet/LoopControl/src/loopcontrol_fortran.h @@ -10,41 +10,46 @@ type (lc_statmap_t), save :: name/**/_lm &&\ logical, save :: name/**/_initialised = .false. &&\ type (lc_control_t) :: name/**/_lc &&\ integer :: name/**/_ii, name/**/_jj, name/**/_kk &&\ +integer :: name/**/_imin, name/**/_jmin, name/**/_kmin &&\ integer :: name/**/_imax, name/**/_jmax, name/**/_kmax &&\ integer :: i, j, k -#define LC_PRIVATE3(name) \ -name/**/_lc, \ +#define LC_PRIVATE3(name) \ +name/**/_lc, \ +name/**/_imin, name/**/_jmin, name/**/_kmin, \ name/**/_imax, name/**/_jmax, name/**/_kmax -#define LC_LOOP3(name, i,j,k, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) &&\ -if (.not. name/**/_initialised) then &&\ -!$omp single &&\ - call lc_statmap_init (name/**/_lm, "name") &&\ -!$omp end single &&\ -!$omp single &&\ - /* Set this flag only after initialising */ &&\ - name/**/_initialised = .true. &&\ -!$omp end single &&\ -end if &&\ +#define LC_LOOP3(name, i,j,k, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) &&\ +if (.not. name/**/_initialised) then &&\ +!$omp single &&\ + call lc_statmap_init (name/**/_lm, "name") &&\ +!$omp end single &&\ +!$omp single &&\ + /* Set this flag only after initialising */ &&\ + name/**/_initialised = .true. &&\ +!$omp end single &&\ +end if &&\ call lc_control_init (name/**/_lc, name/**/_lm, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) &&\ - &&\ -/* Coarse loop */ &&\ -do name/**/_kk = name/**/_lc%kkmin + 1, name/**/_lc%kkmax, name/**/_lc%kkstep &&\ - name/**/_kmax = min (name/**/_kk - 1 + name/**/_lc%kkstep, name/**/_lc%kkmax) &&\ - do name/**/_jj = name/**/_lc%jjmin + 1, name/**/_lc%jjmax, name/**/_lc%jjstep &&\ - name/**/_jmax = min (name/**/_jj - 1 + name/**/_lc%jjstep, name/**/_lc%jjmax) &&\ - do name/**/_ii = name/**/_lc%iimin + 1, name/**/_lc%iimax, name/**/_lc%iistep &&\ - name/**/_imax = min (name/**/_ii - 1 + name/**/_lc%iistep, name/**/_lc%iimax) &&\ - &&\ - /* Fine loop */ &&\ - do k = name/**/_kk, name/**/_kmax &&\ - do j = name/**/_jj, name/**/_jmax &&\ - do i = name/**/_ii, name/**/_imax + &&\ +/* Coarse loop */ &&\ +do name/**/_kk = name/**/_lc%kkmin + 1, name/**/_lc%kkmax, name/**/_lc%kkstep &&\ + name/**/_kmin = name/**/_kk + name/**/_lc%kkkkmax &&\ + name/**/_kmax = min (name/**/_kmin - 1 + name/**/_lc%kkkkstep, name/**/_lc%kkmax) &&\ + do name/**/_jj = name/**/_lc%jjmin + 1, name/**/_lc%jjmax, name/**/_lc%jjstep &&\ + name/**/_jmin = name/**/_jj + name/**/_lc%jjjjmax &&\ + name/**/_jmax = min (name/**/_jj - 1 + name/**/_lc%jjjjmax, name/**/_lc%jjmax) &&\ + do name/**/_ii = name/**/_lc%iimin + 1, name/**/_lc%iimax, name/**/_lc%iistep &&\ + name/**/_imin = name/**/_ii + name/**/_lc%iiiimax &&\ + name/**/_imax = min (name/**/_ii - 1 + name/**/_lc%iiiimax, name/**/_lc%iimax) &&\ + &&\ + /* Fine loop */ &&\ + do k = name/**/_kmin, name/**/_kmax &&\ + do j = name/**/_jmin, name/**/_jmax &&\ + do i = name/**/_imin, name/**/_imax diff --git a/Carpet/LoopControl/src/loopcontrol_types.F90 b/Carpet/LoopControl/src/loopcontrol_types.F90 index 7fcd60964..843d6bdd0 100644 --- a/Carpet/LoopControl/src/loopcontrol_types.F90 +++ b/Carpet/LoopControl/src/loopcontrol_types.F90 @@ -37,12 +37,18 @@ module loopcontrol_types ! Control settings for current thread (useful for debugging) integer thread_num integer iii, jjj, kkk + integer iiii, jjjj, kkkk ! Control settings for tiling loop integer iimin, jjmin, kkmin integer iimax, jjmax, kkmax integer iistep, jjstep, kkstep + ! Control settings for inner thread parallelism + integer iiiimin, jjjjmin, kkkkmin + integer iiiimax, jjjjmax, kkkkmax + integer iiiistep, jjjjstep, kkkkstep + ! Timing statistics double precision time_setup_begin, time_calc_begin end type lc_control_t |