aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorErik Schnetter <schnetter@cct.lsu.edu>2011-07-05 12:28:23 -0400
committerBarry Wardell <barry.wardell@gmail.com>2011-12-14 19:54:46 +0000
commit911aeb937cd4334c4406e5d115ae9a1a01e0c4e4 (patch)
treec23ddd5bcf559d7ee37f102dd779b41b0e2b4f6c
parent5ce764af71f43b21ad94db223058d4ca5896c9fe (diff)
LoopControl: Correct OpenMP parallelisation. Add self-tests.
Correct the OpenMP parallelisation: Previously, when the loop size is not a multiple of the block size assigned to each OpenMP thread, some grid points would be traversed multiple times. Add new parameter do_selftest that enables (somewhat expensive) self-tests to ensure in each loop that each grid point is traversed exactly once.
-rw-r--r--Carpet/LoopControl/interface.ccl7
-rw-r--r--Carpet/LoopControl/param.ccl2
-rw-r--r--Carpet/LoopControl/schedule.ccl16
-rw-r--r--Carpet/LoopControl/src/lc_selftest.c123
-rw-r--r--Carpet/LoopControl/src/loopcontrol.c132
-rw-r--r--Carpet/LoopControl/src/loopcontrol.h31
-rw-r--r--Carpet/LoopControl/src/loopcontrol_types.F903
-rw-r--r--Carpet/LoopControl/src/make.code.defn2
8 files changed, 151 insertions, 165 deletions
diff --git a/Carpet/LoopControl/interface.ccl b/Carpet/LoopControl/interface.ccl
index c0362e5fa..8769d737b 100644
--- a/Carpet/LoopControl/interface.ccl
+++ b/Carpet/LoopControl/interface.ccl
@@ -5,10 +5,3 @@ IMPLEMENTS: LoopControl
INCLUDE HEADER: loopcontrol.h IN loopcontrol.h
USES INCLUDE HEADER: vectors.h
-
-
-
-CCTK_REAL selftest TYPE=gf TAGS='prolongation="none"'
-{
- var1 var2
-} "Self-test variables"
diff --git a/Carpet/LoopControl/param.ccl b/Carpet/LoopControl/param.ccl
index 005202538..f5a1fa05f 100644
--- a/Carpet/LoopControl/param.ccl
+++ b/Carpet/LoopControl/param.ccl
@@ -37,7 +37,7 @@ BOOLEAN check_type_sizes "Check that Fortran and C types match"
{
} "yes"
-BOOLEAN do_selftest "Perform some self-tests"
+BOOLEAN do_selftest "Perform (expensive) self-tests in every loop"
{
} "no"
diff --git a/Carpet/LoopControl/schedule.ccl b/Carpet/LoopControl/schedule.ccl
index c1b74863a..090da0f5a 100644
--- a/Carpet/LoopControl/schedule.ccl
+++ b/Carpet/LoopControl/schedule.ccl
@@ -8,22 +8,6 @@ if (check_type_sizes) {
} "Check that sizes of control structures are identical in C and Fortran"
}
-if (do_selftest) {
- STORAGE: selftest
-
- SCHEDULE lc_selftest AT basegrid
- {
- LANG: C
- OPTIONS: global loop-local
- } "Perform some self-checks"
-
- SCHEDULE lc_selftest AT postregrid
- {
- LANG: C
- OPTIONS: global loop-local
- } "Perform some self-checks"
-}
-
if (printstats) {
SCHEDULE lc_printstats_analysis AT analysis
{
diff --git a/Carpet/LoopControl/src/lc_selftest.c b/Carpet/LoopControl/src/lc_selftest.c
deleted file mode 100644
index d80421af6..000000000
--- a/Carpet/LoopControl/src/lc_selftest.c
+++ /dev/null
@@ -1,123 +0,0 @@
-#include <cctk.h>
-#include <cctk_Arguments.h>
-
-#include <vectors.h>
-
-#include <loopcontrol.h>
-
-void lc_selftest (CCTK_ARGUMENTS)
-{
- DECLARE_CCTK_ARGUMENTS;
-
-#ifdef CCTK_REAL_VEC_SIZE
- /* Vectorisation is enabled */
- int const vector_size = CCTK_REAL_VEC_SIZE;
-#else
- /* Vectorisation is disabled */
- int const vector_size = 1;
-#endif
-
- /* Initialise (without using LoopControl) */
-#pragma omp parallel for
- for (int k=0; k<cctk_lsh[2]; ++k) {
- for (int j=0; j<cctk_lsh[1]; ++j) {
- for (int i=0; i<cctk_lsh[0]; ++i) {
- int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k);
- var1[ind3d] = 0;
- var2[ind3d] = 0;
- }
- }
- }
-
- /* Test 1: Loop over all points */
- {
- int imin[3], imax[3];
- for (int d=0; d<3; ++d) {
- imin[d] = 0;
- imax[d] = cctk_lsh[d];
- }
-#pragma omp parallel
- LC_LOOP3VEC(lc_selftest1,
- i,j,k,
- imin[0],imin[1],imin[2],
- imax[0],imax[1],imax[2],
- cctk_lsh[0],cctk_lsh[1],cctk_lsh[2],
- vector_size)
- {
- int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k);
-#pragma omp atomic
- ++ var1[ind3d];
- } LC_ENDLOOP3VEC(lc_selftest1);
- }
-
- /* Test 2: Loop over interior points, then loop over boundary
- points */
- {
- int imin[3], imax[3];
- for (int d=0; d<3; ++d) {
- imin[d] = cctk_nghostzones[d];
- imax[d] = cctk_lsh[d] - cctk_nghostzones[d];
- }
-#pragma omp parallel
- LC_LOOP3VEC(lc_selftest2a,
- i,j,k,
- imin[0],imin[1],imin[2],
- imax[0],imax[1],imax[2],
- cctk_lsh[0],cctk_lsh[1],cctk_lsh[2],
- vector_size)
- {
- int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k);
-#pragma omp atomic
- ++ var2[ind3d];
- } LC_ENDLOOP3VEC(lc_selftest2a);
-
- for (int dir=0; dir<3; ++dir) {
- for (int face=0; face<2; ++face) {
- for (int d=0; d<dir; ++d) {
- imin[d] = 0;
- imax[d] = cctk_lsh[d];
- }
- if (face==0) {
- imin[dir] = 0;
- imax[dir] = cctk_nghostzones[dir];
- } else {
- imin[dir] = cctk_lsh[dir] - cctk_nghostzones[dir];
- imax[dir] = cctk_lsh[dir];
- }
- for (int d=dir+1; d<3; ++d) {
- imin[d] = cctk_nghostzones[d];
- imax[d] = cctk_lsh[d] - cctk_nghostzones[d];
- }
-#pragma omp parallel
- LC_LOOP3VEC(lc_selftest2b,
- i,j,k,
- imin[0],imin[1],imin[2],
- imax[0],imax[1],imax[2],
- cctk_lsh[0],cctk_lsh[1],cctk_lsh[2],
- vector_size)
- {
- int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k);
-#pragma omp atomic
- ++ var2[ind3d];
- } LC_ENDLOOP3VEC(lc_selftest2b);
- }
- }
- }
-
- /* Evaluate tests (without using LoopControl) */
- int failure = 0;
-#pragma omp parallel for reduction(+: failure)
- for (int k=0; k<cctk_lsh[2]; ++k) {
- for (int j=0; j<cctk_lsh[1]; ++j) {
- for (int i=0; i<cctk_lsh[0]; ++i) {
- int const ind3d = CCTK_GFINDEX3D(cctkGH, i,j,k);
- failure += var1[ind3d] != 1;
- failure += var2[ind3d] != 1;
- }
- }
- }
-
- if (failure) {
- CCTK_WARN (CCTK_WARN_ABORT, "LoopControl self-test failed");
- }
-}
diff --git a/Carpet/LoopControl/src/loopcontrol.c b/Carpet/LoopControl/src/loopcontrol.c
index 83a09a313..b4e4bf328 100644
--- a/Carpet/LoopControl/src/loopcontrol.c
+++ b/Carpet/LoopControl/src/loopcontrol.c
@@ -898,12 +898,13 @@ lc_control_init (lc_control_t * restrict const lc,
lc->kkkstep = (lc->kkkmax - lc->kkkmin + lt->knthreads-1) / lt->knthreads;
/* Find location of current thread */
- lc->thread_num = omp_get_thread_num();
- int c = lc->thread_num;
- int const ci = c % lt->inthreads; c /= lt->inthreads;
- int const cj = c % lt->jnthreads; c /= lt->jnthreads;
- int const ck = c % lt->knthreads; c /= lt->knthreads;
- /* see below where c is continued to be used */
+ lc->thread_num = omp_get_thread_num();
+ int c_outer =
+ lc->thread_num / (lt->inithreads * lt->jnithreads * lt->knithreads);
+ int const ci = c_outer % lt->inthreads; c_outer /= lt->inthreads;
+ int const cj = c_outer % lt->jnthreads; c_outer /= lt->jnthreads;
+ int const ck = c_outer % lt->knthreads; c_outer /= lt->knthreads;
+ assert (c_outer == 0);
lc->iii = lc->iiimin + ci * lc->iiistep;
lc->jjj = lc->jjjmin + cj * lc->jjjstep;
lc->kkk = lc->kkkmin + ck * lc->kkkstep;
@@ -928,11 +929,12 @@ lc_control_init (lc_control_t * restrict const lc,
/*** Inner threads **********************************************************/
/* Inner loop thread parallelism */
- /* (No thread parallelism yet) */
- int const cii = c % lt->inithreads; c /= lt->inithreads;
- int const cjj = c % lt->jnithreads; c /= lt->jnithreads;
- int const ckk = c % lt->knithreads; c /= lt->knithreads;
- assert (c == 0);
+ int c_inner =
+ lc->thread_num % (lt->inithreads * lt->jnithreads * lt->knithreads);
+ int const cii = c_inner % lt->inithreads; c_inner /= lt->inithreads;
+ int const cjj = c_inner % lt->jnithreads; c_inner /= lt->jnithreads;
+ int const ckk = c_inner % lt->knithreads; c_inner /= lt->knithreads;
+ assert (c_inner == 0);
lc->iiii = cii;
lc->jjjj = cjj;
lc->kkkk = ckk;
@@ -951,6 +953,23 @@ lc_control_init (lc_control_t * restrict const lc,
/****************************************************************************/
+ /* Self test */
+ if (do_selftest) {
+ char * restrict mem;
+#pragma omp single copyprivate (mem)
+ {
+ mem =
+ calloc (lc->ilsh * lc->jlsh * lc->klsh, sizeof * lc->selftest_count);
+ }
+ lc->selftest_count = mem;
+ } else {
+ lc->selftest_count = NULL;
+ }
+
+
+
+ /****************************************************************************/
+
/* Timer */
lc->time_calc_begin = omp_get_wtime();
}
@@ -958,8 +977,26 @@ lc_control_init (lc_control_t * restrict const lc,
void
+lc_control_selftest (lc_control_t * restrict const lc,
+ int const imin, int const imax, int const j, int const k)
+{
+ assert (imin >= 0 && imax <= lc->ilsh);
+ assert (j >= 0 && j < lc->jlsh);
+ assert (k >= 0 && k < lc->klsh);
+ for (int i = imin; i < imax; ++i) {
+ int const ind3d = i + lc->ilsh * (j + lc->jlsh * k);
+#pragma omp atomic
+ ++ lc->selftest_count[ind3d];
+ }
+}
+
+
+
+void
lc_control_finish (lc_control_t * restrict const lc)
{
+ DECLARE_CCTK_PARAMETERS;
+
lc_stattime_t * restrict const lt = lc->stattime;
lc_statset_t * restrict const ls = lc->statset;
@@ -967,7 +1004,6 @@ lc_control_finish (lc_control_t * restrict const lc)
int first_iteration;
#pragma omp single copyprivate (ignore_iteration)
{
- DECLARE_CCTK_PARAMETERS;
ignore_iteration = ignore_initial_overhead && lt->time_count == 0.0;
first_iteration = lt->time_count_init == 0.0;
}
@@ -1021,7 +1057,6 @@ lc_control_finish (lc_control_t * restrict const lc)
/* #pragma omp barrier */
{
- DECLARE_CCTK_PARAMETERS;
if (use_simulated_annealing) {
#pragma omp single
{
@@ -1035,6 +1070,77 @@ lc_control_finish (lc_control_t * restrict const lc)
}
}
}
+
+ /* Perform self-check */
+
+ if (do_selftest) {
+ /* Ensure all threads have finished the loop */
+#pragma omp barrier
+ ;
+ /* Assert that exactly the specified points have been set */
+ static int failure = 0;
+#pragma omp for reduction(+: failure)
+ for (int k=0; k<lc->klsh; ++k) {
+ for (int j=0; j<lc->jlsh; ++j) {
+ for (int i=0; i<lc->ilsh; ++i) {
+ int const ind3d = i + lc->ilsh * (j + lc->jlsh * k);
+ char const inside =
+ (i >= lc->imin && i < lc->imax) &&
+ (j >= lc->jmin && j < lc->jmax) &&
+ (k >= lc->kmin && k < lc->kmax);
+ if (lc->selftest_count[ind3d] != inside) {
+ ++ failure;
+#pragma omp critical
+ {
+ fprintf (stderr, " i=[%d,%d,%d] count=%d expected=%d\n",
+ i, j, k,
+ (int) lc->selftest_count[ind3d], (int) inside);
+ }
+ }
+ }
+ }
+ }
+ if (failure) {
+ for (int n=0; n<omp_get_num_threads(); ++n) {
+ if (n == omp_get_thread_num()) {
+ fprintf (stderr, "Thread: %d\n", n);
+ fprintf (stderr, " Arguments:\n");
+ fprintf (stderr, " imin=[%d,%d,%d]\n", lc->imin, lc->jmin, lc->kmin);
+ fprintf (stderr, " imax=[%d,%d,%d]\n", lc->imax, lc->jmax, lc->kmax);
+ fprintf (stderr, " ilsh=[%d,%d,%d]\n", lc->ilsh, lc->jlsh, lc->klsh);
+ fprintf (stderr, " di=%d\n", lc->di);
+ fprintf (stderr, " Thread parallellism:\n");
+ fprintf (stderr, " iiimin =[%d,%d,%d]\n", lc->iiimin, lc->jjjmin, lc->kkkmin);
+ fprintf (stderr, " iiimax =[%d,%d,%d]\n", lc->iiimax, lc->jjjmax, lc->kkkmax);
+ fprintf (stderr, " iiistep=[%d,%d,%d]\n", lc->iiistep, lc->jjjstep, lc->kkkstep);
+ fprintf (stderr, " Current thread:\n");
+ fprintf (stderr, " thread_num=%d\n", lc->thread_num);
+ fprintf (stderr, " iii =[%d,%d,%d]\n", lc->iii, lc->jjj, lc->kkk);
+ fprintf (stderr, " iiii =[%d,%d,%d]\n", lc->iiii, lc->jjjj, lc->kkkk);
+ fprintf (stderr, " Tiling loop:\n");
+ fprintf (stderr, " iimin =[%d,%d,%d]\n", lc->iimin, lc->jjmin, lc->kkmin);
+ fprintf (stderr, " iimax =[%d,%d,%d]\n", lc->iimax, lc->jjmax, lc->kkmax);
+ fprintf (stderr, " iistep=[%d,%d,%d]\n", lc->iistep, lc->jjstep, lc->kkstep);
+ fprintf (stderr, " Inner thread parallelism:\n");
+ fprintf (stderr, " iiiimin =[%d,%d,%d]\n", lc->iiiimin, lc->jjjjmin, lc->kkkkmin);
+ fprintf (stderr, " iiiimax =[%d,%d,%d]\n", lc->iiiimax, lc->jjjjmax, lc->kkkkmax);
+ fprintf (stderr, " iiiistep=[%d,%d,%d]\n", lc->iiiistep, lc->jjjjstep, lc->kkkkstep);
+ }
+#pragma omp barrier
+ ;
+ }
+#pragma omp critical
+ {
+ CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING,
+ "LoopControl loop \"%s\" self-test failed",
+ lc->statmap->name);
+ }
+ }
+#pragma omp single nowait
+ {
+ free (lc->selftest_count);
+ }
+ }
}
diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h
index 635d71128..59ddae4e9 100644
--- a/Carpet/LoopControl/src/loopcontrol.h
+++ b/Carpet/LoopControl/src/loopcontrol.h
@@ -216,33 +216,42 @@ typedef struct lc_control_t {
lc_stattime_t * restrict stattime;
/* Copy of arguments (useful for debugging) */
+ /* Full domain */
int imin, jmin, kmin;
int imax, jmax, kmax;
int ilsh, jlsh, klsh;
int di;
/* Control settings for thread parallelism (useful for debugging) */
+ /* Outer thread decomposition of full domain */
int iiimin, jjjmin, kkkmin;
int iiimax, jjjmax, kkkmax;
int iiistep, jjjstep, kkkstep;
/* Control settings for current thread (useful for debugging) */
int thread_num;
+ /* Location of this thread in full domain */
int iii, jjj, kkk;
+ /* Index (not location!) of this thread in loop tile */
int iiii, jjjj, kkkk;
/* Control settings for tiling loop */
+ /* Loop tiling decomposition in this thread's domain */
int iimin, jjmin, kkmin;
int iimax, jjmax, kkmax;
int iistep, jjstep, kkstep;
/* Control settings for inner thread parallelism */
+ /* Inner thread decomposition, as offsets (!) to loop tiling */
int iiiimin, jjjjmin, kkkkmin;
int iiiimax, jjjjmax, kkkkmax;
int iiiistep, jjjjstep, kkkkstep;
/* Timing statistics */
double time_setup_begin, time_calc_begin;
+
+ /* Self check */
+ char * restrict selftest_count;
} lc_control_t;
@@ -297,6 +306,10 @@ lc_control_init (lc_control_t * restrict lc,
int di);
void
+lc_control_selftest (lc_control_t * restrict lc,
+ int imin, int imax, int j, int k);
+
+void
lc_control_finish (lc_control_t * restrict lc);
@@ -321,6 +334,7 @@ lc_control_finish (lc_control_t * restrict lc);
(imax_), (jmax_), (kmax_), \
(ilsh_), (jlsh_), (klsh_), \
lc_di); \
+ int const lc_do_selftest = lc_lc.selftest_count != 0; \
\
/* Coarse loop */ \
for (int lc_kk = lc_lc.kkmin; \
@@ -328,31 +342,40 @@ lc_control_finish (lc_control_t * restrict lc);
lc_kk += lc_lc.kkstep) \
{ \
int const lc_kmin = lc_kk + lc_lc.kkkkmin; \
- int const lc_kmax = lc_min (lc_kk + lc_lc.kkkkmax, lc_lc.kkmax); \
+ int const lc_kmax = \
+ lc_min (lc_kk + lc_min (lc_lc.kkkkmax, lc_lc.kkstep), \
+ lc_lc.kkmax); \
\
for (int lc_jj = lc_lc.jjmin; \
lc_jj < lc_lc.jjmax; \
lc_jj += lc_lc.jjstep) \
{ \
int const lc_jmin = lc_jj + lc_lc.jjjjmin; \
- int const lc_jmax = lc_min (lc_jj + lc_lc.jjjjmax, lc_lc.jjmax); \
+ int const lc_jmax = \
+ lc_min (lc_jj + lc_min (lc_lc.jjjjmax, lc_lc.jjstep), \
+ lc_lc.jjmax); \
\
for (int lc_ii = lc_lc.iimin; \
lc_ii < lc_lc.iimax; \
lc_ii += lc_lc.iistep) \
{ \
int const lc_imin = lc_ii + lc_lc.iiiimin; \
- int const lc_imax = lc_min (lc_ii + lc_lc.iiiimax, lc_lc.iimax); \
+ int const lc_imax = \
+ lc_min (lc_ii + lc_min (lc_lc.iiiimax, lc_lc.iistep), \
+ lc_lc.iimax); \
\
/* Fine loop */ \
for (int k = lc_kmin; k < lc_kmax; ++k) { \
for (int j = lc_jmin; j < lc_jmax; ++j) { \
LC_PRELOOP_STATEMENTS \
{ \
+ if (CCTK_BUILTIN_EXPECT(lc_do_selftest, 0)) { \
+ lc_control_selftest (& lc_lc, lc_imin, lc_imax, j, k); \
+ } \
int const lc_ipos = \
lc_imin + lc_lc.ilsh * (j + lc_lc.jlsh * k); \
int const lc_ioffset = (lc_ipos & - lc_di) - lc_ipos; \
- for (int i = lc_imin + lc_ioffset; i < lc_imax; i += lc_di) {
+ for (int i = lc_imin + lc_ioffset; i < lc_imax; i += lc_di) {
#define LC_ENDLOOP3VEC(name) \
} \
diff --git a/Carpet/LoopControl/src/loopcontrol_types.F90 b/Carpet/LoopControl/src/loopcontrol_types.F90
index 76fcaf910..159f49e92 100644
--- a/Carpet/LoopControl/src/loopcontrol_types.F90
+++ b/Carpet/LoopControl/src/loopcontrol_types.F90
@@ -52,6 +52,9 @@ module loopcontrol_types
! Timing statistics
double precision time_setup_begin, time_calc_begin
+
+ ! Self check
+ CCTK_POINTER selftest_count
end type lc_control_t
end module loopcontrol_types
diff --git a/Carpet/LoopControl/src/make.code.defn b/Carpet/LoopControl/src/make.code.defn
index c821a8438..a0cd4b09b 100644
--- a/Carpet/LoopControl/src/make.code.defn
+++ b/Carpet/LoopControl/src/make.code.defn
@@ -1,7 +1,7 @@
# Main make.code.defn file for thorn LoopControl
# Source files in this directory
-SRCS = loopcontrol.c loopcontrol.F90 lc_get_type_sizes.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c lc_selftest.c
+SRCS = loopcontrol.c loopcontrol.F90 lc_get_type_sizes.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c
# Subdirectories containing source files
SUBDIRS =