From 911aeb937cd4334c4406e5d115ae9a1a01e0c4e4 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Tue, 5 Jul 2011 12:28:23 -0400 Subject: LoopControl: Correct OpenMP parallelisation. Add self-tests. Correct the OpenMP parallelisation: Previously, when the loop size is not a multiple of the block size assigned to each OpenMP thread, some grid points would be traversed multiple times. Add new parameter do_selftest that enables (somewhat expensive) self-tests to ensure in each loop that each grid point is traversed exactly once. --- Carpet/LoopControl/interface.ccl | 7 -- Carpet/LoopControl/param.ccl | 2 +- Carpet/LoopControl/schedule.ccl | 16 ---- Carpet/LoopControl/src/lc_selftest.c | 123 ------------------------- Carpet/LoopControl/src/loopcontrol.c | 132 ++++++++++++++++++++++++--- Carpet/LoopControl/src/loopcontrol.h | 31 ++++++- Carpet/LoopControl/src/loopcontrol_types.F90 | 3 + Carpet/LoopControl/src/make.code.defn | 2 +- 8 files changed, 151 insertions(+), 165 deletions(-) delete mode 100644 Carpet/LoopControl/src/lc_selftest.c diff --git a/Carpet/LoopControl/interface.ccl b/Carpet/LoopControl/interface.ccl index c0362e5fa..8769d737b 100644 --- a/Carpet/LoopControl/interface.ccl +++ b/Carpet/LoopControl/interface.ccl @@ -5,10 +5,3 @@ IMPLEMENTS: LoopControl INCLUDE HEADER: loopcontrol.h IN loopcontrol.h USES INCLUDE HEADER: vectors.h - - - -CCTK_REAL selftest TYPE=gf TAGS='prolongation="none"' -{ - var1 var2 -} "Self-test variables" diff --git a/Carpet/LoopControl/param.ccl b/Carpet/LoopControl/param.ccl index 005202538..f5a1fa05f 100644 --- a/Carpet/LoopControl/param.ccl +++ b/Carpet/LoopControl/param.ccl @@ -37,7 +37,7 @@ BOOLEAN check_type_sizes "Check that Fortran and C types match" { } "yes" -BOOLEAN do_selftest "Perform some self-tests" +BOOLEAN do_selftest "Perform (expensive) self-tests in every loop" { } "no" diff --git a/Carpet/LoopControl/schedule.ccl b/Carpet/LoopControl/schedule.ccl index c1b74863a..090da0f5a 100644 --- a/Carpet/LoopControl/schedule.ccl +++ b/Carpet/LoopControl/schedule.ccl @@ -8,22 +8,6 @@ if (check_type_sizes) { } "Check that sizes of control structures are identical in C and Fortran" } -if (do_selftest) { - STORAGE: selftest - - SCHEDULE lc_selftest AT basegrid - { - LANG: C - OPTIONS: global loop-local - } "Perform some self-checks" - - SCHEDULE lc_selftest AT postregrid - { - LANG: C - OPTIONS: global loop-local - } "Perform some self-checks" -} - if (printstats) { SCHEDULE lc_printstats_analysis AT analysis { diff --git a/Carpet/LoopControl/src/lc_selftest.c b/Carpet/LoopControl/src/lc_selftest.c deleted file mode 100644 index d80421af6..000000000 --- a/Carpet/LoopControl/src/lc_selftest.c +++ /dev/null @@ -1,123 +0,0 @@ -#include -#include - -#include - -#include - -void lc_selftest (CCTK_ARGUMENTS) -{ - DECLARE_CCTK_ARGUMENTS; - -#ifdef CCTK_REAL_VEC_SIZE - /* Vectorisation is enabled */ - int const vector_size = CCTK_REAL_VEC_SIZE; -#else - /* Vectorisation is disabled */ - int const vector_size = 1; -#endif - - /* Initialise (without using LoopControl) */ -#pragma omp parallel for - for (int k=0; kkkkstep = (lc->kkkmax - lc->kkkmin + lt->knthreads-1) / lt->knthreads; /* Find location of current thread */ - lc->thread_num = omp_get_thread_num(); - int c = lc->thread_num; - int const ci = c % lt->inthreads; c /= lt->inthreads; - int const cj = c % lt->jnthreads; c /= lt->jnthreads; - int const ck = c % lt->knthreads; c /= lt->knthreads; - /* see below where c is continued to be used */ + lc->thread_num = omp_get_thread_num(); + int c_outer = + lc->thread_num / (lt->inithreads * lt->jnithreads * lt->knithreads); + int const ci = c_outer % lt->inthreads; c_outer /= lt->inthreads; + int const cj = c_outer % lt->jnthreads; c_outer /= lt->jnthreads; + int const ck = c_outer % lt->knthreads; c_outer /= lt->knthreads; + assert (c_outer == 0); lc->iii = lc->iiimin + ci * lc->iiistep; lc->jjj = lc->jjjmin + cj * lc->jjjstep; lc->kkk = lc->kkkmin + ck * lc->kkkstep; @@ -928,11 +929,12 @@ lc_control_init (lc_control_t * restrict const lc, /*** Inner threads **********************************************************/ /* Inner loop thread parallelism */ - /* (No thread parallelism yet) */ - int const cii = c % lt->inithreads; c /= lt->inithreads; - int const cjj = c % lt->jnithreads; c /= lt->jnithreads; - int const ckk = c % lt->knithreads; c /= lt->knithreads; - assert (c == 0); + int c_inner = + lc->thread_num % (lt->inithreads * lt->jnithreads * lt->knithreads); + int const cii = c_inner % lt->inithreads; c_inner /= lt->inithreads; + int const cjj = c_inner % lt->jnithreads; c_inner /= lt->jnithreads; + int const ckk = c_inner % lt->knithreads; c_inner /= lt->knithreads; + assert (c_inner == 0); lc->iiii = cii; lc->jjjj = cjj; lc->kkkk = ckk; @@ -949,6 +951,23 @@ lc_control_init (lc_control_t * restrict const lc, + /****************************************************************************/ + + /* Self test */ + if (do_selftest) { + char * restrict mem; +#pragma omp single copyprivate (mem) + { + mem = + calloc (lc->ilsh * lc->jlsh * lc->klsh, sizeof * lc->selftest_count); + } + lc->selftest_count = mem; + } else { + lc->selftest_count = NULL; + } + + + /****************************************************************************/ /* Timer */ @@ -957,9 +976,27 @@ lc_control_init (lc_control_t * restrict const lc, +void +lc_control_selftest (lc_control_t * restrict const lc, + int const imin, int const imax, int const j, int const k) +{ + assert (imin >= 0 && imax <= lc->ilsh); + assert (j >= 0 && j < lc->jlsh); + assert (k >= 0 && k < lc->klsh); + for (int i = imin; i < imax; ++i) { + int const ind3d = i + lc->ilsh * (j + lc->jlsh * k); +#pragma omp atomic + ++ lc->selftest_count[ind3d]; + } +} + + + void lc_control_finish (lc_control_t * restrict const lc) { + DECLARE_CCTK_PARAMETERS; + lc_stattime_t * restrict const lt = lc->stattime; lc_statset_t * restrict const ls = lc->statset; @@ -967,7 +1004,6 @@ lc_control_finish (lc_control_t * restrict const lc) int first_iteration; #pragma omp single copyprivate (ignore_iteration) { - DECLARE_CCTK_PARAMETERS; ignore_iteration = ignore_initial_overhead && lt->time_count == 0.0; first_iteration = lt->time_count_init == 0.0; } @@ -1021,7 +1057,6 @@ lc_control_finish (lc_control_t * restrict const lc) /* #pragma omp barrier */ { - DECLARE_CCTK_PARAMETERS; if (use_simulated_annealing) { #pragma omp single { @@ -1035,6 +1070,77 @@ lc_control_finish (lc_control_t * restrict const lc) } } } + + /* Perform self-check */ + + if (do_selftest) { + /* Ensure all threads have finished the loop */ +#pragma omp barrier + ; + /* Assert that exactly the specified points have been set */ + static int failure = 0; +#pragma omp for reduction(+: failure) + for (int k=0; kklsh; ++k) { + for (int j=0; jjlsh; ++j) { + for (int i=0; iilsh; ++i) { + int const ind3d = i + lc->ilsh * (j + lc->jlsh * k); + char const inside = + (i >= lc->imin && i < lc->imax) && + (j >= lc->jmin && j < lc->jmax) && + (k >= lc->kmin && k < lc->kmax); + if (lc->selftest_count[ind3d] != inside) { + ++ failure; +#pragma omp critical + { + fprintf (stderr, " i=[%d,%d,%d] count=%d expected=%d\n", + i, j, k, + (int) lc->selftest_count[ind3d], (int) inside); + } + } + } + } + } + if (failure) { + for (int n=0; nimin, lc->jmin, lc->kmin); + fprintf (stderr, " imax=[%d,%d,%d]\n", lc->imax, lc->jmax, lc->kmax); + fprintf (stderr, " ilsh=[%d,%d,%d]\n", lc->ilsh, lc->jlsh, lc->klsh); + fprintf (stderr, " di=%d\n", lc->di); + fprintf (stderr, " Thread parallellism:\n"); + fprintf (stderr, " iiimin =[%d,%d,%d]\n", lc->iiimin, lc->jjjmin, lc->kkkmin); + fprintf (stderr, " iiimax =[%d,%d,%d]\n", lc->iiimax, lc->jjjmax, lc->kkkmax); + fprintf (stderr, " iiistep=[%d,%d,%d]\n", lc->iiistep, lc->jjjstep, lc->kkkstep); + fprintf (stderr, " Current thread:\n"); + fprintf (stderr, " thread_num=%d\n", lc->thread_num); + fprintf (stderr, " iii =[%d,%d,%d]\n", lc->iii, lc->jjj, lc->kkk); + fprintf (stderr, " iiii =[%d,%d,%d]\n", lc->iiii, lc->jjjj, lc->kkkk); + fprintf (stderr, " Tiling loop:\n"); + fprintf (stderr, " iimin =[%d,%d,%d]\n", lc->iimin, lc->jjmin, lc->kkmin); + fprintf (stderr, " iimax =[%d,%d,%d]\n", lc->iimax, lc->jjmax, lc->kkmax); + fprintf (stderr, " iistep=[%d,%d,%d]\n", lc->iistep, lc->jjstep, lc->kkstep); + fprintf (stderr, " Inner thread parallelism:\n"); + fprintf (stderr, " iiiimin =[%d,%d,%d]\n", lc->iiiimin, lc->jjjjmin, lc->kkkkmin); + fprintf (stderr, " iiiimax =[%d,%d,%d]\n", lc->iiiimax, lc->jjjjmax, lc->kkkkmax); + fprintf (stderr, " iiiistep=[%d,%d,%d]\n", lc->iiiistep, lc->jjjjstep, lc->kkkkstep); + } +#pragma omp barrier + ; + } +#pragma omp critical + { + CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, + "LoopControl loop \"%s\" self-test failed", + lc->statmap->name); + } + } +#pragma omp single nowait + { + free (lc->selftest_count); + } + } } diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h index 635d71128..59ddae4e9 100644 --- a/Carpet/LoopControl/src/loopcontrol.h +++ b/Carpet/LoopControl/src/loopcontrol.h @@ -216,33 +216,42 @@ typedef struct lc_control_t { lc_stattime_t * restrict stattime; /* Copy of arguments (useful for debugging) */ + /* Full domain */ int imin, jmin, kmin; int imax, jmax, kmax; int ilsh, jlsh, klsh; int di; /* Control settings for thread parallelism (useful for debugging) */ + /* Outer thread decomposition of full domain */ int iiimin, jjjmin, kkkmin; int iiimax, jjjmax, kkkmax; int iiistep, jjjstep, kkkstep; /* Control settings for current thread (useful for debugging) */ int thread_num; + /* Location of this thread in full domain */ int iii, jjj, kkk; + /* Index (not location!) of this thread in loop tile */ int iiii, jjjj, kkkk; /* Control settings for tiling loop */ + /* Loop tiling decomposition in this thread's domain */ int iimin, jjmin, kkmin; int iimax, jjmax, kkmax; int iistep, jjstep, kkstep; /* Control settings for inner thread parallelism */ + /* Inner thread decomposition, as offsets (!) to loop tiling */ int iiiimin, jjjjmin, kkkkmin; int iiiimax, jjjjmax, kkkkmax; int iiiistep, jjjjstep, kkkkstep; /* Timing statistics */ double time_setup_begin, time_calc_begin; + + /* Self check */ + char * restrict selftest_count; } lc_control_t; @@ -296,6 +305,10 @@ lc_control_init (lc_control_t * restrict lc, int ilsh, int jlsh, int klsh, int di); +void +lc_control_selftest (lc_control_t * restrict lc, + int imin, int imax, int j, int k); + void lc_control_finish (lc_control_t * restrict lc); @@ -321,6 +334,7 @@ lc_control_finish (lc_control_t * restrict lc); (imax_), (jmax_), (kmax_), \ (ilsh_), (jlsh_), (klsh_), \ lc_di); \ + int const lc_do_selftest = lc_lc.selftest_count != 0; \ \ /* Coarse loop */ \ for (int lc_kk = lc_lc.kkmin; \ @@ -328,31 +342,40 @@ lc_control_finish (lc_control_t * restrict lc); lc_kk += lc_lc.kkstep) \ { \ int const lc_kmin = lc_kk + lc_lc.kkkkmin; \ - int const lc_kmax = lc_min (lc_kk + lc_lc.kkkkmax, lc_lc.kkmax); \ + int const lc_kmax = \ + lc_min (lc_kk + lc_min (lc_lc.kkkkmax, lc_lc.kkstep), \ + lc_lc.kkmax); \ \ for (int lc_jj = lc_lc.jjmin; \ lc_jj < lc_lc.jjmax; \ lc_jj += lc_lc.jjstep) \ { \ int const lc_jmin = lc_jj + lc_lc.jjjjmin; \ - int const lc_jmax = lc_min (lc_jj + lc_lc.jjjjmax, lc_lc.jjmax); \ + int const lc_jmax = \ + lc_min (lc_jj + lc_min (lc_lc.jjjjmax, lc_lc.jjstep), \ + lc_lc.jjmax); \ \ for (int lc_ii = lc_lc.iimin; \ lc_ii < lc_lc.iimax; \ lc_ii += lc_lc.iistep) \ { \ int const lc_imin = lc_ii + lc_lc.iiiimin; \ - int const lc_imax = lc_min (lc_ii + lc_lc.iiiimax, lc_lc.iimax); \ + int const lc_imax = \ + lc_min (lc_ii + lc_min (lc_lc.iiiimax, lc_lc.iistep), \ + lc_lc.iimax); \ \ /* Fine loop */ \ for (int k = lc_kmin; k < lc_kmax; ++k) { \ for (int j = lc_jmin; j < lc_jmax; ++j) { \ LC_PRELOOP_STATEMENTS \ { \ + if (CCTK_BUILTIN_EXPECT(lc_do_selftest, 0)) { \ + lc_control_selftest (& lc_lc, lc_imin, lc_imax, j, k); \ + } \ int const lc_ipos = \ lc_imin + lc_lc.ilsh * (j + lc_lc.jlsh * k); \ int const lc_ioffset = (lc_ipos & - lc_di) - lc_ipos; \ - for (int i = lc_imin + lc_ioffset; i < lc_imax; i += lc_di) { + for (int i = lc_imin + lc_ioffset; i < lc_imax; i += lc_di) { #define LC_ENDLOOP3VEC(name) \ } \ diff --git a/Carpet/LoopControl/src/loopcontrol_types.F90 b/Carpet/LoopControl/src/loopcontrol_types.F90 index 76fcaf910..159f49e92 100644 --- a/Carpet/LoopControl/src/loopcontrol_types.F90 +++ b/Carpet/LoopControl/src/loopcontrol_types.F90 @@ -52,6 +52,9 @@ module loopcontrol_types ! Timing statistics double precision time_setup_begin, time_calc_begin + + ! Self check + CCTK_POINTER selftest_count end type lc_control_t end module loopcontrol_types diff --git a/Carpet/LoopControl/src/make.code.defn b/Carpet/LoopControl/src/make.code.defn index c821a8438..a0cd4b09b 100644 --- a/Carpet/LoopControl/src/make.code.defn +++ b/Carpet/LoopControl/src/make.code.defn @@ -1,7 +1,7 @@ # Main make.code.defn file for thorn LoopControl # Source files in this directory -SRCS = loopcontrol.c loopcontrol.F90 lc_get_type_sizes.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c lc_selftest.c +SRCS = loopcontrol.c loopcontrol.F90 lc_get_type_sizes.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c # Subdirectories containing source files SUBDIRS = -- cgit v1.2.3