diff options
-rw-r--r-- | Carpet/LoopControl/interface.ccl | 7 | ||||
-rw-r--r-- | Carpet/LoopControl/param.ccl | 2 | ||||
-rw-r--r-- | Carpet/LoopControl/schedule.ccl | 16 | ||||
-rw-r--r-- | Carpet/LoopControl/src/lc_selftest.c | 123 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.c | 132 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.h | 31 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol_types.F90 | 3 | ||||
-rw-r--r-- | Carpet/LoopControl/src/make.code.defn | 2 |
8 files changed, 151 insertions, 165 deletions
diff --git a/Carpet/LoopControl/interface.ccl b/Carpet/LoopControl/interface.ccl index c0362e5fa..8769d737b 100644 --- a/Carpet/LoopControl/interface.ccl +++ b/Carpet/LoopControl/interface.ccl @@ -5,10 +5,3 @@ IMPLEMENTS: LoopControl INCLUDE HEADER: loopcontrol.h IN loopcontrol.h USES INCLUDE HEADER: vectors.h - - - -CCTK_REAL selftest TYPE=gf TAGS='prolongation="none"' -{ - var1 var2 -} "Self-test variables" diff --git a/Carpet/LoopControl/param.ccl b/Carpet/LoopControl/param.ccl index 005202538..f5a1fa05f 100644 --- a/Carpet/LoopControl/param.ccl +++ b/Carpet/LoopControl/param.ccl @@ -37,7 +37,7 @@ BOOLEAN check_type_sizes "Check that Fortran and C types match" { } "yes" -BOOLEAN do_selftest "Perform some self-tests" +BOOLEAN do_selftest "Perform (expensive) self-tests in every loop" { } "no" diff --git a/Carpet/LoopControl/schedule.ccl b/Carpet/LoopControl/schedule.ccl index c1b74863a..090da0f5a 100644 --- a/Carpet/LoopControl/schedule.ccl +++ b/Carpet/LoopControl/schedule.ccl @@ -8,22 +8,6 @@ if (check_type_sizes) { } "Check that sizes of control structures are identical in C and Fortran" } -if (do_selftest) { - STORAGE: selftest - - SCHEDULE lc_selftest AT basegrid - { - LANG: C - OPTIONS: global loop-local - } "Perform some self-checks" - - SCHEDULE lc_selftest AT postregrid - { - LANG: C - OPTIONS: global loop-local - } "Perform some self-checks" -} - if (printstats) { SCHEDULE lc_printstats_analysis AT analysis { diff --git a/Carpet/LoopControl/src/lc_selftest.c b/Carpet/LoopControl/src/lc_selftest.c deleted file mode 100644 index d80421af6..000000000 --- a/Carpet/LoopControl/src/lc_selftest.c +++ /dev/null @@ -1,123 +0,0 @@ -#include <cctk.h> -#include <cctk_Arguments.h> - -#include <vectors.h> - -#include <loopcontrol.h> - -void lc_selftest (CCTK_ARGUMENTS) -{ - DECLARE_CCTK_ARGUMENTS; - -#ifdef CCTK_REAL_VEC_SIZE - /* Vectorisation is enabled */ - int const vector_size = CCTK_REAL_VEC_SIZE; -#else - /* Vectorisation is disabled */ - int const vector_size = 1; -#endif - - /* Initialise (without using LoopControl) */ -#pragma omp parallel for - for (int k=0; k<cctk_lsh[2]; ++k) { - for (int j=0; j<cctk_lsh[1]; ++j) { - for (int i=0; i<cctk_lsh[0]; ++i) { - int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k); - var1[ind3d] = 0; - var2[ind3d] = 0; - } - } - } - - /* Test 1: Loop over all points */ - { - int imin[3], imax[3]; - for (int d=0; d<3; ++d) { - imin[d] = 0; - imax[d] = cctk_lsh[d]; - } -#pragma omp parallel - LC_LOOP3VEC(lc_selftest1, - i,j,k, - imin[0],imin[1],imin[2], - imax[0],imax[1],imax[2], - cctk_lsh[0],cctk_lsh[1],cctk_lsh[2], - vector_size) - { - int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k); -#pragma omp atomic - ++ var1[ind3d]; - } LC_ENDLOOP3VEC(lc_selftest1); - } - - /* Test 2: Loop over interior points, then loop over boundary - points */ - { - int imin[3], imax[3]; - for (int d=0; d<3; ++d) { - imin[d] = cctk_nghostzones[d]; - imax[d] = cctk_lsh[d] - cctk_nghostzones[d]; - } -#pragma omp parallel - LC_LOOP3VEC(lc_selftest2a, - i,j,k, - imin[0],imin[1],imin[2], - imax[0],imax[1],imax[2], - cctk_lsh[0],cctk_lsh[1],cctk_lsh[2], - vector_size) - { - int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k); -#pragma omp atomic - ++ var2[ind3d]; - } LC_ENDLOOP3VEC(lc_selftest2a); - - for (int dir=0; dir<3; ++dir) { - for (int face=0; face<2; ++face) { - for (int d=0; d<dir; ++d) { - imin[d] = 0; - imax[d] = cctk_lsh[d]; - } - if (face==0) { - imin[dir] = 0; - imax[dir] = cctk_nghostzones[dir]; - } else { - imin[dir] = cctk_lsh[dir] - cctk_nghostzones[dir]; - imax[dir] = cctk_lsh[dir]; - } - for (int d=dir+1; d<3; ++d) { - imin[d] = cctk_nghostzones[d]; - imax[d] = cctk_lsh[d] - cctk_nghostzones[d]; - } -#pragma omp parallel - LC_LOOP3VEC(lc_selftest2b, - i,j,k, - imin[0],imin[1],imin[2], - imax[0],imax[1],imax[2], - cctk_lsh[0],cctk_lsh[1],cctk_lsh[2], - vector_size) - { - int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k); -#pragma omp atomic - ++ var2[ind3d]; - } LC_ENDLOOP3VEC(lc_selftest2b); - } - } - } - - /* Evaluate tests (without using LoopControl) */ - int failure = 0; -#pragma omp parallel for reduction(+: failure) - for (int k=0; k<cctk_lsh[2]; ++k) { - for (int j=0; j<cctk_lsh[1]; ++j) { - for (int i=0; i<cctk_lsh[0]; ++i) { - int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k); - failure += var1[ind3d] != 1; - failure += var2[ind3d] != 1; - } - } - } - - if (failure) { - CCTK_WARN (CCTK_WARN_ABORT, "LoopControl self-test failed"); - } -} diff --git a/Carpet/LoopControl/src/loopcontrol.c b/Carpet/LoopControl/src/loopcontrol.c index 83a09a313..b4e4bf328 100644 --- a/Carpet/LoopControl/src/loopcontrol.c +++ b/Carpet/LoopControl/src/loopcontrol.c @@ -898,12 +898,13 @@ lc_control_init (lc_control_t * restrict const lc, lc->kkkstep = (lc->kkkmax - lc->kkkmin + lt->knthreads-1) / lt->knthreads; /* Find location of current thread */ - lc->thread_num = omp_get_thread_num(); - int c = lc->thread_num; - int const ci = c % lt->inthreads; c /= lt->inthreads; - int const cj = c % lt->jnthreads; c /= lt->jnthreads; - int const ck = c % lt->knthreads; c /= lt->knthreads; - /* see below where c is continued to be used */ + lc->thread_num = omp_get_thread_num(); + int c_outer = + lc->thread_num / (lt->inithreads * lt->jnithreads * lt->knithreads); + int const ci = c_outer % lt->inthreads; c_outer /= lt->inthreads; + int const cj = c_outer % lt->jnthreads; c_outer /= lt->jnthreads; + int const ck = c_outer % lt->knthreads; c_outer /= lt->knthreads; + assert (c_outer == 0); lc->iii = lc->iiimin + ci * lc->iiistep; lc->jjj = lc->jjjmin + cj * lc->jjjstep; lc->kkk = lc->kkkmin + ck * lc->kkkstep; @@ -928,11 +929,12 @@ lc_control_init (lc_control_t * restrict const lc, /*** Inner threads **********************************************************/ /* Inner loop thread parallelism */ - /* (No thread parallelism yet) */ - int const cii = c % lt->inithreads; c /= lt->inithreads; - int const cjj = c % lt->jnithreads; c /= lt->jnithreads; - int const ckk = c % lt->knithreads; c /= lt->knithreads; - assert (c == 0); + int c_inner = + lc->thread_num % (lt->inithreads * lt->jnithreads * lt->knithreads); + int const cii = c_inner % lt->inithreads; c_inner /= lt->inithreads; + int const cjj = c_inner % lt->jnithreads; c_inner /= lt->jnithreads; + int const ckk = c_inner % lt->knithreads; c_inner /= lt->knithreads; + assert (c_inner == 0); lc->iiii = cii; lc->jjjj = cjj; lc->kkkk = ckk; @@ -951,6 +953,23 @@ lc_control_init (lc_control_t * restrict const lc, /****************************************************************************/ + /* Self test */ + if (do_selftest) { + char * restrict mem; +#pragma omp single copyprivate (mem) + { + mem = + calloc (lc->ilsh * lc->jlsh * lc->klsh, sizeof * lc->selftest_count); + } + lc->selftest_count = mem; + } else { + lc->selftest_count = NULL; + } + + + + /****************************************************************************/ + /* Timer */ lc->time_calc_begin = omp_get_wtime(); } @@ -958,8 +977,26 @@ lc_control_init (lc_control_t * restrict const lc, void +lc_control_selftest (lc_control_t * restrict const lc, + int const imin, int const imax, int const j, int const k) +{ + assert (imin >= 0 && imax <= lc->ilsh); + assert (j >= 0 && j < lc->jlsh); + assert (k >= 0 && k < lc->klsh); + for (int i = imin; i < imax; ++i) { + int const ind3d = i + lc->ilsh * (j + lc->jlsh * k); +#pragma omp atomic + ++ lc->selftest_count[ind3d]; + } +} + + + +void lc_control_finish (lc_control_t * restrict const lc) { + DECLARE_CCTK_PARAMETERS; + lc_stattime_t * restrict const lt = lc->stattime; lc_statset_t * restrict const ls = lc->statset; @@ -967,7 +1004,6 @@ lc_control_finish (lc_control_t * restrict const lc) int first_iteration; #pragma omp single copyprivate (ignore_iteration) { - DECLARE_CCTK_PARAMETERS; ignore_iteration = ignore_initial_overhead && lt->time_count == 0.0; first_iteration = lt->time_count_init == 0.0; } @@ -1021,7 +1057,6 @@ lc_control_finish (lc_control_t * restrict const lc) /* #pragma omp barrier */ { - DECLARE_CCTK_PARAMETERS; if (use_simulated_annealing) { #pragma omp single { @@ -1035,6 +1070,77 @@ lc_control_finish (lc_control_t * restrict const lc) } } } + + /* Perform self-check */ + + if (do_selftest) { + /* Ensure all threads have finished the loop */ +#pragma omp barrier + ; + /* Assert that exactly the specified points have been set */ + static int failure = 0; +#pragma omp for reduction(+: failure) + for (int k=0; k<lc->klsh; ++k) { + for (int j=0; j<lc->jlsh; ++j) { + for (int i=0; i<lc->ilsh; ++i) { + int const ind3d = i + lc->ilsh * (j + lc->jlsh * k); + char const inside = + (i >= lc->imin && i < lc->imax) && + (j >= lc->jmin && j < lc->jmax) && + (k >= lc->kmin && k < lc->kmax); + if (lc->selftest_count[ind3d] != inside) { + ++ failure; +#pragma omp critical + { + fprintf (stderr, " i=[%d,%d,%d] count=%d expected=%d\n", + i, j, k, + (int) lc->selftest_count[ind3d], (int) inside); + } + } + } + } + } + if (failure) { + for (int n=0; n<omp_get_num_threads(); ++n) { + if (n == omp_get_thread_num()) { + fprintf (stderr, "Thread: %d\n", n); + fprintf (stderr, " Arguments:\n"); + fprintf (stderr, " imin=[%d,%d,%d]\n", lc->imin, lc->jmin, lc->kmin); + fprintf (stderr, " imax=[%d,%d,%d]\n", lc->imax, lc->jmax, lc->kmax); + fprintf (stderr, " ilsh=[%d,%d,%d]\n", lc->ilsh, lc->jlsh, lc->klsh); + fprintf (stderr, " di=%d\n", lc->di); + fprintf (stderr, " Thread parallellism:\n"); + fprintf (stderr, " iiimin =[%d,%d,%d]\n", lc->iiimin, lc->jjjmin, lc->kkkmin); + fprintf (stderr, " iiimax =[%d,%d,%d]\n", lc->iiimax, lc->jjjmax, lc->kkkmax); + fprintf (stderr, " iiistep=[%d,%d,%d]\n", lc->iiistep, lc->jjjstep, lc->kkkstep); + fprintf (stderr, " Current thread:\n"); + fprintf (stderr, " thread_num=%d\n", lc->thread_num); + fprintf (stderr, " iii =[%d,%d,%d]\n", lc->iii, lc->jjj, lc->kkk); + fprintf (stderr, " iiii =[%d,%d,%d]\n", lc->iiii, lc->jjjj, lc->kkkk); + fprintf (stderr, " Tiling loop:\n"); + fprintf (stderr, " iimin =[%d,%d,%d]\n", lc->iimin, lc->jjmin, lc->kkmin); + fprintf (stderr, " iimax =[%d,%d,%d]\n", lc->iimax, lc->jjmax, lc->kkmax); + fprintf (stderr, " iistep=[%d,%d,%d]\n", lc->iistep, lc->jjstep, lc->kkstep); + fprintf (stderr, " Inner thread parallelism:\n"); + fprintf (stderr, " iiiimin =[%d,%d,%d]\n", lc->iiiimin, lc->jjjjmin, lc->kkkkmin); + fprintf (stderr, " iiiimax =[%d,%d,%d]\n", lc->iiiimax, lc->jjjjmax, lc->kkkkmax); + fprintf (stderr, " iiiistep=[%d,%d,%d]\n", lc->iiiistep, lc->jjjjstep, lc->kkkkstep); + } +#pragma omp barrier + ; + } +#pragma omp critical + { + CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, + "LoopControl loop \"%s\" self-test failed", + lc->statmap->name); + } + } +#pragma omp single nowait + { + free (lc->selftest_count); + } + } } diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h index 635d71128..59ddae4e9 100644 --- a/Carpet/LoopControl/src/loopcontrol.h +++ b/Carpet/LoopControl/src/loopcontrol.h @@ -216,33 +216,42 @@ typedef struct lc_control_t { lc_stattime_t * restrict stattime; /* Copy of arguments (useful for debugging) */ + /* Full domain */ int imin, jmin, kmin; int imax, jmax, kmax; int ilsh, jlsh, klsh; int di; /* Control settings for thread parallelism (useful for debugging) */ + /* Outer thread decomposition of full domain */ int iiimin, jjjmin, kkkmin; int iiimax, jjjmax, kkkmax; int iiistep, jjjstep, kkkstep; /* Control settings for current thread (useful for debugging) */ int thread_num; + /* Location of this thread in full domain */ int iii, jjj, kkk; + /* Index (not location!) of this thread in loop tile */ int iiii, jjjj, kkkk; /* Control settings for tiling loop */ + /* Loop tiling decomposition in this thread's domain */ int iimin, jjmin, kkmin; int iimax, jjmax, kkmax; int iistep, jjstep, kkstep; /* Control settings for inner thread parallelism */ + /* Inner thread decomposition, as offsets (!) to loop tiling */ int iiiimin, jjjjmin, kkkkmin; int iiiimax, jjjjmax, kkkkmax; int iiiistep, jjjjstep, kkkkstep; /* Timing statistics */ double time_setup_begin, time_calc_begin; + + /* Self check */ + char * restrict selftest_count; } lc_control_t; @@ -297,6 +306,10 @@ lc_control_init (lc_control_t * restrict lc, int di); void +lc_control_selftest (lc_control_t * restrict lc, + int imin, int imax, int j, int k); + +void lc_control_finish (lc_control_t * restrict lc); @@ -321,6 +334,7 @@ lc_control_finish (lc_control_t * restrict lc); (imax_), (jmax_), (kmax_), \ (ilsh_), (jlsh_), (klsh_), \ lc_di); \ + int const lc_do_selftest = lc_lc.selftest_count != 0; \ \ /* Coarse loop */ \ for (int lc_kk = lc_lc.kkmin; \ @@ -328,31 +342,40 @@ lc_control_finish (lc_control_t * restrict lc); lc_kk += lc_lc.kkstep) \ { \ int const lc_kmin = lc_kk + lc_lc.kkkkmin; \ - int const lc_kmax = lc_min (lc_kk + lc_lc.kkkkmax, lc_lc.kkmax); \ + int const lc_kmax = \ + lc_min (lc_kk + lc_min (lc_lc.kkkkmax, lc_lc.kkstep), \ + lc_lc.kkmax); \ \ for (int lc_jj = lc_lc.jjmin; \ lc_jj < lc_lc.jjmax; \ lc_jj += lc_lc.jjstep) \ { \ int const lc_jmin = lc_jj + lc_lc.jjjjmin; \ - int const lc_jmax = lc_min (lc_jj + lc_lc.jjjjmax, lc_lc.jjmax); \ + int const lc_jmax = \ + lc_min (lc_jj + lc_min (lc_lc.jjjjmax, lc_lc.jjstep), \ + lc_lc.jjmax); \ \ for (int lc_ii = lc_lc.iimin; \ lc_ii < lc_lc.iimax; \ lc_ii += lc_lc.iistep) \ { \ int const lc_imin = lc_ii + lc_lc.iiiimin; \ - int const lc_imax = lc_min (lc_ii + lc_lc.iiiimax, lc_lc.iimax); \ + int const lc_imax = \ + lc_min (lc_ii + lc_min (lc_lc.iiiimax, lc_lc.iistep), \ + lc_lc.iimax); \ \ /* Fine loop */ \ for (int k = lc_kmin; k < lc_kmax; ++k) { \ for (int j = lc_jmin; j < lc_jmax; ++j) { \ LC_PRELOOP_STATEMENTS \ { \ + if (CCTK_BUILTIN_EXPECT(lc_do_selftest, 0)) { \ + lc_control_selftest (& lc_lc, lc_imin, lc_imax, j, k); \ + } \ int const lc_ipos = \ lc_imin + lc_lc.ilsh * (j + lc_lc.jlsh * k); \ int const lc_ioffset = (lc_ipos & - lc_di) - lc_ipos; \ - for (int i = lc_imin + lc_ioffset; i < lc_imax; i += lc_di) { + for (int i = lc_imin + lc_ioffset; i < lc_imax; i += lc_di) { #define LC_ENDLOOP3VEC(name) \ } \ diff --git a/Carpet/LoopControl/src/loopcontrol_types.F90 b/Carpet/LoopControl/src/loopcontrol_types.F90 index 76fcaf910..159f49e92 100644 --- a/Carpet/LoopControl/src/loopcontrol_types.F90 +++ b/Carpet/LoopControl/src/loopcontrol_types.F90 @@ -52,6 +52,9 @@ module loopcontrol_types ! Timing statistics double precision time_setup_begin, time_calc_begin + + ! Self check + CCTK_POINTER selftest_count end type lc_control_t end module loopcontrol_types diff --git a/Carpet/LoopControl/src/make.code.defn b/Carpet/LoopControl/src/make.code.defn index c821a8438..a0cd4b09b 100644 --- a/Carpet/LoopControl/src/make.code.defn +++ b/Carpet/LoopControl/src/make.code.defn @@ -1,7 +1,7 @@ # Main make.code.defn file for thorn LoopControl # Source files in this directory -SRCS = loopcontrol.c loopcontrol.F90 lc_get_type_sizes.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c lc_selftest.c +SRCS = loopcontrol.c loopcontrol.F90 lc_get_type_sizes.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c # Subdirectories containing source files SUBDIRS = |