diff options
author | Erik Schnetter <schnetter@gmail.com> | 2013-01-16 14:46:17 -0500 |
---|---|---|
committer | Erik Schnetter <schnetter@gmail.com> | 2013-01-16 14:46:17 -0500 |
commit | e0ddb73239c73c6de42a01204194173ce65ebff4 (patch) | |
tree | d0380bec049f7807836d68a0001f2ec492f62d0e /Carpet/LoopControl/src/loopcontrol.h | |
parent | 1cddd960d62da42ccd111022f1326740f688b48d (diff) |
LoopControl: Rewrite
Rewrite code in C++.
Remove dependency on GSL.
Modify the algorithm: it now traverses arrays bottom-up (by splitting the looping region into equal-sized blocks) instead of top-down (by splitting the region into a certain number of blocks).
Make multi-threading dynamic.
Support SMT (hyper-threading), i.e. threads that share the same cache.
Diffstat (limited to 'Carpet/LoopControl/src/loopcontrol.h')
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.h | 611 |
1 files changed, 253 insertions, 358 deletions
diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h index a6bb71be2..04e951c23 100644 --- a/Carpet/LoopControl/src/loopcontrol.h +++ b/Carpet/LoopControl/src/loopcontrol.h @@ -1,392 +1,287 @@ -#ifndef LC_LOOPCONTROL_H -#define LC_LOOPCONTROL_H +#ifndef LOOPCONTROL_H +#define LOOPCONTROL_H /* This file uses the namespace LC_* for macros and lc_* for C identifiers. */ -#include <cctk.h> - +#define LC_DIM 3 + #ifdef CCODE +#include <stddef.h> +#include <stdlib.h> + +#include <cctk.h> + #ifdef __cplusplus extern "C" { #endif - - - -/* A topology */ -typedef struct lc_topology_t { - int nthreads[2][3]; /* [0:outer|1:inner][ijk] */ -} lc_topology_t; - -/* A tiling specification */ -typedef struct lc_tiling_t { - int npoints; -} lc_tiling_t; - - - -typedef struct lc_state_t { - int topology; - int tiling[3]; -} lc_state_t; - - - -/* For simulated annealing */ -typedef struct lc_auto_state_t lc_auto_state_t; - -/* For hill climbing */ -typedef struct lc_hill_state_t lc_hill_state_t; - - - -/* Statistics for one control parameter set (thread topology and - tiling specification) of one user parameter set of one loop */ -typedef struct lc_stattime_t { - struct lc_stattime_t * next; - - /* Keys */ - - lc_state_t state; - int inthreads, jnthreads, knthreads; - int inithreads, jnithreads, knithreads; - int inpoints, jnpoints, knpoints; - - /* Data */ - - /* Statistics */ - /* number of calls and threads */ - double time_count, time_count_init; - /* time spent setting up loops */ - double time_setup_sum, time_setup_sum2; - /* time spent iterating */ - double time_calc_sum, time_calc_sum2; - double time_calc_init; /* time for first calculation */ - - /* wall time tag */ - double last_updated; -} lc_stattime_t; - - - -/* Statistics for one user parameter set (number of threads and number - of iterations) of one loop */ -typedef struct lc_statset_t { - struct lc_statset_t * next; - /* Keys */ - int num_threads; - int npoints[3]; /* [dim] */ - /* 
Data */ + static inline ptrdiff_t lc_min(ptrdiff_t const i, ptrdiff_t const j) + { + return i < j ? i : j; + } - /* Thread topologies */ - lc_topology_t * restrict topologies; /* [topology] */ - int ntopologies; + static inline ptrdiff_t lc_max(ptrdiff_t const i, ptrdiff_t const j) + { + return i > j ? i : j; + } - /* Tiling specifications */ - lc_tiling_t * restrict tilings[3]; /* [dim] */ - int ntilings[3]; /* [dim] */ - int * restrict topology_ntilings[3]; /* [dim][topology] */ - /* Simulated annealing state */ - lc_auto_state_t * auto_state; - /* Hill climbing state */ - lc_hill_state_t * hill_state; + struct lc_thread_info_t; + struct lc_fine_thread_comm_t; - lc_stattime_t * stattime_list; + struct lc_stats_t; - /* Statistics */ - /* number of calls and threads */ - double time_count, time_count_init; - /* time spent setting up loops */ - double time_setup_sum, time_setup_sum2; - /* time spent iterating */ - double time_calc_sum, time_calc_sum2; - double time_calc_init; /* time for first calculation */ -} lc_statset_t; - - - -/* Statistics for one loop (one source code location) */ -typedef struct lc_statmap_t { - struct lc_statmap_t * next; /* for linked list */ + typedef struct { + ptrdiff_t v[LC_DIM]; + } lc_vec_t; - /* Name */ - char const * restrict name; + typedef struct { + int v[LC_DIM]; + } lc_ivec_t; - lc_statset_t * statset_list; -} lc_statmap_t; - - - -/* Linked list of all loop statistics structures */ -extern lc_statmap_t * lc_statmap_list; - - - -static inline -int -lc_state_valid (lc_statset_t const * restrict const ls, - lc_state_t const * restrict const state) -{ - if (state->topology >= 0 && state->topology < ls->ntopologies) { - int const * restrict const ntilings = - ls->topology_ntilings[state->topology]; - return (state->tiling[0] >= 0 && state->tiling[0] < ntilings[0] && - state->tiling[1] >= 0 && state->tiling[1] < ntilings[1] && - state->tiling[2] >= 0 && state->tiling[2] < ntilings[2]); + typedef struct { + /* Traverse pos from min 
(inclusive) to max (exclusive) with a + stride of step. Equivalently, traverse idx from 0 (inclusive) + to count (exclusive). */ + lc_vec_t min, max, step, pos; + lc_ivec_t count, idx; + } lc_space_t; + + typedef struct { + lc_vec_t ash; + lc_space_t loop; + lc_space_t thread; + struct lc_thread_info_t* thread_info_ptr; /* shared between all + threads */ + int thread_done; + lc_space_t coarse; /* count, idx, pos are undefined */ + lc_space_t fine; /* count, idx, pos are undefined */ + lc_space_t fine_thread; /* min, max, pos are undefined */ + struct lc_fine_thread_comm_t* fine_thread_comm_ptr; /* shared + between SMT + threads */ + unsigned char* restrict selftest_array; + } lc_control_t; + + + + void lc_stats_init(struct lc_stats_t** stats, + char const* name, char const* file, int line); + void lc_control_init(lc_control_t* restrict control, + struct lc_stats_t *restrict stats, + ptrdiff_t imin, ptrdiff_t jmin, ptrdiff_t kmin, + ptrdiff_t imax, ptrdiff_t jmax, ptrdiff_t kmax, + ptrdiff_t iash, ptrdiff_t jash, ptrdiff_t kash, + ptrdiff_t di, ptrdiff_t dj, ptrdiff_t dk); + void lc_control_finish(lc_control_t* restrict control, + struct lc_stats_t *restrict stats); + + void lc_thread_init(lc_control_t* restrict control); + int lc_thread_done(lc_control_t const* restrict control); + void lc_thread_step(lc_control_t* restrict control); + + void lc_selftest_set(lc_control_t const* restrict control, + ptrdiff_t lmin, ptrdiff_t lmax, + ptrdiff_t imin, ptrdiff_t imax, ptrdiff_t di, + ptrdiff_t i, ptrdiff_t j, ptrdiff_t k); + + + +#define LC_COARSE_SETUP(D) \ + lc_control.coarse.min.v[D] = lc_control.thread.pos.v[D]; \ + lc_control.coarse.max.v[D] = \ + lc_min(lc_control.thread.max.v[D], \ + lc_control.coarse.min.v[D] + lc_control.thread.step.v[D]); \ + ptrdiff_t const lc_cmin##D = lc_control.coarse.min.v[D]; \ + ptrdiff_t const lc_cmax##D = lc_control.coarse.max.v[D]; \ + ptrdiff_t const lc_cstep##D = lc_control.coarse.step.v[D]; +#define LC_COARSE_LOOP(D) \ + for 
(ptrdiff_t lc_cpos##D = lc_cmin##D; \ + lc_cpos##D < lc_cmax##D; \ + lc_cpos##D += lc_cstep##D) + +#define LC_FINE_SETUP(D) \ + lc_control.fine.min.v[D] = lc_cpos##D; \ + lc_control.fine.max.v[D] = \ + lc_min(lc_control.coarse.max.v[D], \ + lc_control.fine.min.v[D] + lc_control.coarse.step.v[D]); \ + ptrdiff_t /*const*/ lc_fmin##D = lc_control.fine.min.v[D]; \ + ptrdiff_t /*const*/ lc_fmax##D = lc_control.fine.max.v[D]; \ + ptrdiff_t const lc_fstep##D = lc_control.fine.step.v[D]; \ + ptrdiff_t const lc_ftoff##D = \ + lc_control.fine_thread.idx.v[D] * lc_control.fine_thread.step.v[D]; +#define LC_FINE_LOOP(I, NI, D) \ + for (ptrdiff_t I = lc_fmin##D + lc_ftoff##D; \ + I < lc_fmax##D; \ + I += lc_fstep##D) \ + { \ + ptrdiff_t const NI CCTK_ATTRIBUTE_UNUSED = \ + CCTK_BUILTIN_EXPECT(lc_dir##D==0, 1) ? 0 : \ + lc_dir##D<0 ? I+1 : lc_control.loop.max.v[D]-I; + +#if VECTORISE_ALIGNED_ARRAYS + /* Arrays are aligned: fmin0 is the aligned loop boundary; keep it, + and set up imin to be the intended loop boundary */ +# define LC_ALIGN(i,j,k) \ + ptrdiff_t const lc_imin = lc_max(lc_control.loop.min.v[0], lc_fmin0); \ + ptrdiff_t const lc_imax = lc_fmax0; +#else + /* Arrays are not aligned: fine.min[0] and fine.max[0] are the + intended loop boundaries; override fmin0 and fmax0 to be aligned, + and set imin and imax; this may move the fine loop boundaries + except at outer boundaries to avoid partial stores */ +# define LC_ALIGN(i,j,k) \ + lc_fmin0 = lc_control.fine.min.v[0]; \ + lc_fmax0 = lc_control.fine.max.v[0]; \ + ptrdiff_t lc_imin = lc_fmin0; \ + ptrdiff_t lc_imax = lc_fmax0; \ + int const lc_fmin0_is_outer = lc_fmin0 == lc_control.loop.min.v[0]; \ + int const lc_fmax0_is_outer = lc_fmax0 == lc_control.loop.max.v[0]; \ + ptrdiff_t const lc_ipos = lc_fmin0 + lc_ash0 * (j + lc_ash1 * k); \ + ptrdiff_t const lc_ioffset = lc_ipos % lc_align0; \ + lc_fmin0 -= lc_ioffset; \ + if (!lc_fmax0_is_outer) lc_fmax0 -= lc_ioffset; \ + if (!lc_fmin0_is_outer) lc_imin = lc_fmin0; \ + 
if (!lc_fmax0_is_outer) lc_imax = lc_fmax0; +#endif + +#define LC_SELFTEST(i,j,k) \ + if (CCTK_BUILTIN_EXPECT(lc_control.selftest_array != NULL, 0)) { \ + lc_selftest_set(&lc_control, \ + lc_control.loop.min.v[0], lc_control.loop.max.v[0], \ + lc_imin, lc_imax, lc_align0, i, j, k); \ } - return 0; -} - -static inline -int -lc_state_equal (lc_state_t const * restrict const state1, - lc_state_t const * restrict const state2) -{ - return (state1->topology == state2->topology && - state1->tiling[0] == state2->tiling[0] && - state1->tiling[1] == state2->tiling[1] && - state1->tiling[2] == state2->tiling[2]); -} - - - -void -lc_stattime_init (lc_stattime_t * restrict const lt, - lc_statset_t * restrict const ls, - lc_state_t const * restrict const state); - -lc_stattime_t * -lc_stattime_find (lc_statset_t const * restrict const ls, - lc_state_t const * restrict const state) - CCTK_ATTRIBUTE_PURE; - -lc_stattime_t * -lc_stattime_find_create (lc_statset_t * restrict const ls, - lc_state_t const * restrict const state); - - - -/* TODO: introduce type for num_threads and npoints[3] */ -void -lc_statset_init (lc_statset_t * restrict const ls, - lc_statmap_t * restrict const lm, - int const num_threads, - int const npoints[3]); - -lc_statset_t * -lc_statset_find (lc_statmap_t const * restrict const lm, - int const num_threads, - int const npoints[3]) - CCTK_ATTRIBUTE_PURE; - -lc_statset_t * -lc_statset_find_create (lc_statmap_t * restrict const lm, - int const num_threads, - int const npoints[3]); - - - -typedef struct lc_control_t { - lc_statmap_t * restrict statmap; - lc_statset_t * restrict statset; - lc_stattime_t * restrict stattime; - - /* Copy of arguments (useful for debugging) */ - /* Full domain */ - int imin, jmin, kmin; - int imax, jmax, kmax; - int ilsh, jlsh, klsh; - int di; - - /* Control settings for thread parallelism (useful for debugging) */ - /* Outer thread decomposition of full domain */ - int iiimin, jjjmin, kkkmin; - int iiimax, jjjmax, kkkmax; - int 
iiistep, jjjstep, kkkstep; - - /* Control settings for current thread (useful for debugging) */ - int thread_num; - /* Location of this thread in full domain */ - int iii, jjj, kkk; - /* Index (not location!) of this thread in loop tile */ - int iiii, jjjj, kkkk; - - /* Control settings for tiling loop */ - /* Loop tiling decomposition in this thread's domain */ - int iimin, jjmin, kkmin; - int iimax, jjmax, kkmax; - int iistep, jjstep, kkstep; - - /* Control settings for inner thread parallelism */ - /* Inner thread decomposition, as offsets (!) to loop tiling */ - int iiiimin, jjjjmin, kkkkmin; - int iiiimax, jjjjmax, kkkkmax; - int iiiistep, jjjjstep, kkkkstep; - - /* Timing statistics */ - double time_setup_begin, time_calc_begin; - - /* Self check */ - char * restrict selftest_count; -} lc_control_t; - - - -static inline -int -lc_min (int const i, int const j) -{ - return i < j ? i : j; -} - -static inline -int -lc_max (int const i, int const j) -{ - return i > j ? i : j; -} - -/* Align by shifting to the right if necessary */ -static inline -int -lc_align (int const i, int const di) -{ - return (i + di - 1) / di * di; -} - - - -void -lc_statmap_init (int * restrict initialised, - lc_statmap_t * restrict ls, - char const * restrict name); - -void -lc_control_init (lc_control_t * restrict lc, - lc_statmap_t * restrict lm, - int imin, int jmin, int kmin, - int imax, int jmax, int kmax, - int ilsh, int jlsh, int klsh, - int di); - -void -lc_control_selftest (lc_control_t * restrict lc, - int imin, int imax, int j, int k); - -void -lc_control_finish (lc_control_t * restrict lc); - - - -#define LC_LOOP3(name, i,j,k, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh) \ - LC_LOOP3VEC(name, i,j,k, imin,jmin,kmin, imax,jmax,kmax, ilsh,jlsh,klsh, 1) + + + +#define LC_LOOP3STR_NORMAL(name, i,j,k, ni,nj,nk, \ + idir_, jdir_, kdir_, \ + imin_,jmin_,kmin_, \ + imax_,jmax_,kmax_, \ + iash_,jash_,kash_, \ + imin,imax, di_) \ + do { \ + typedef int lc_loop3vec_##name; \ + \ + 
ptrdiff_t const lc_dir0 CCTK_ATTRIBUTE_UNUSED = (idir_); \ + ptrdiff_t const lc_dir1 CCTK_ATTRIBUTE_UNUSED = (jdir_); \ + ptrdiff_t const lc_dir2 CCTK_ATTRIBUTE_UNUSED = (kdir_); \ + \ + ptrdiff_t const lc_ash0 CCTK_ATTRIBUTE_UNUSED = (iash_); \ + ptrdiff_t const lc_ash1 CCTK_ATTRIBUTE_UNUSED = (jash_); \ + ptrdiff_t const lc_ash2 CCTK_ATTRIBUTE_UNUSED = (kash_); \ + \ + ptrdiff_t const lc_align0 CCTK_ATTRIBUTE_UNUSED = (di_); \ + ptrdiff_t const lc_align1 CCTK_ATTRIBUTE_UNUSED = 1; \ + ptrdiff_t const lc_align2 CCTK_ATTRIBUTE_UNUSED = 1; \ + \ + static struct lc_stats_t* lc_stats = NULL; \ + lc_stats_init(&lc_stats, #name, __FILE__, __LINE__); \ + \ + lc_control_t lc_control; \ + lc_control_init(&lc_control, lc_stats, \ + (imin_), (jmin_), (kmin_), \ + (imax_), (jmax_), (kmax_), \ + lc_ash0, lc_ash1, lc_ash2, \ + lc_align0, lc_align1, lc_align2); \ + \ + /* Multithreading */ \ + for (lc_thread_init(&lc_control); \ + !lc_thread_done(&lc_control); \ + lc_thread_step(&lc_control)) \ + { \ + \ + /* Coarse loops */ \ + LC_COARSE_SETUP(2) \ + LC_COARSE_SETUP(1) \ + LC_COARSE_SETUP(0) \ + LC_COARSE_LOOP(2) { \ + LC_COARSE_LOOP(1) { \ + LC_COARSE_LOOP(0) { \ + \ + /* Fine loops */ \ + LC_FINE_SETUP(2) \ + LC_FINE_SETUP(1) \ + LC_FINE_SETUP(0) \ + LC_FINE_LOOP(k, nk, 2) { \ + LC_FINE_LOOP(j, nj, 1) { \ + LC_ALIGN(i,j,k) \ + LC_FINE_LOOP(i, ni, 0) { \ + LC_SELFTEST(i,j,k) \ + { + +#define LC_ENDLOOP3STR_NORMAL(name) \ + } /* body */ \ + }}}}}} /* fine */ \ + }}} /* coarse */ \ + } /* multithreading */ \ + lc_control_finish(&lc_control, lc_stats); \ + typedef lc_loop3vec_##name lc_ensure_proper_nesting; \ + } while(0) + + + +/* Definitions to ensure compatibility with earlier versions of + LoopControl */ +#define LC_LOOP3VEC(name, i,j,k, \ + imin_,jmin,kmin, imax_,jmax,kmax, \ + iash,jash,kash, imin,imax, di) \ + LC_LOOP3STR_NORMAL(name, i,j,k, lc_ni,lc_nj,lc_nk, \ + 0,0,0, \ + imin_,jmin,kmin, imax_,jmax,kmax, \ + iash,jash,kash, imin,imax, di) +#define LC_ENDLOOP3VEC(name) 
\ + LC_ENDLOOP3STR_NORMAL(name) + +#define LC_LOOP3(name, i,j,k, \ + imin,jmin,kmin, imax,jmax,kmax, \ + iash,jash,kash) \ + LC_LOOP3VEC(name, i,j,k, \ + imin,jmin,kmin, imax,jmax,kmax, \ + iash,jash,kash, lc_imin_,lc_imax_, 1) #define LC_ENDLOOP3(name) \ LC_ENDLOOP3VEC(name) - -#define LC_LOOP3VEC(name, i,j,k, imin_,jmin_,kmin_, imax_,jmax_,kmax_, ilsh_,jlsh_,klsh_, di_) \ - do { \ - typedef int lc_loop3vec_##name; \ - static int lc_initialised = 0; \ - static lc_statmap_t lc_lm; \ - if (! lc_initialised) { \ - lc_statmap_init (& lc_initialised, & lc_lm, #name); \ - } \ - int const lc_di = (di_); \ - lc_control_t lc_lc; \ - lc_control_init (& lc_lc, & lc_lm, \ - (imin_), (jmin_), (kmin_), \ - (imax_), (jmax_), (kmax_), \ - (ilsh_), (jlsh_), (klsh_), \ - lc_di); \ - int const lc_do_selftest = lc_lc.selftest_count != 0; \ - \ - /* Coarse loop */ \ - for (int lc_kk = lc_lc.kkmin; \ - lc_kk < lc_lc.kkmax; \ - lc_kk += lc_lc.kkstep) \ - { \ - int const lc_kmin = lc_kk + lc_lc.kkkkmin; \ - int const lc_kmax = \ - lc_min (lc_kk + lc_min (lc_lc.kkkkmax, lc_lc.kkstep), \ - lc_lc.kkmax); \ - \ - for (int lc_jj = lc_lc.jjmin; \ - lc_jj < lc_lc.jjmax; \ - lc_jj += lc_lc.jjstep) \ - { \ - int const lc_jmin = lc_jj + lc_lc.jjjjmin; \ - int const lc_jmax = \ - lc_min (lc_jj + lc_min (lc_lc.jjjjmax, lc_lc.jjstep), \ - lc_lc.jjmax); \ - \ - for (int lc_ii = lc_lc.iimin; \ - lc_ii < lc_lc.iimax; \ - lc_ii += lc_lc.iistep) \ - { \ - int const lc_imin = lc_ii + lc_lc.iiiimin; \ - int const lc_imax = \ - lc_min (lc_ii + lc_min (lc_lc.iiiimax, lc_lc.iistep), \ - lc_lc.iimax); \ - \ - /* Fine loop */ \ - for (int k = lc_kmin; k < lc_kmax; ++k) { \ - for (int j = lc_jmin; j < lc_jmax; ++j) { \ - LC_PRELOOP_STATEMENTS \ - { \ - if (CCTK_BUILTIN_EXPECT(lc_do_selftest, 0)) { \ - lc_control_selftest (& lc_lc, lc_imin, lc_imax, j, k); \ - } \ - int const lc_ipos = \ - lc_imin + lc_lc.ilsh * (j + lc_lc.jlsh * k); \ - int const lc_ioffset = (lc_ipos & - lc_di) - lc_ipos; \ - for (int i = 
lc_imin + lc_ioffset; i < lc_imax; i += lc_di) { - -#define LC_ENDLOOP3VEC(name) \ - } \ - } \ - LC_POSTLOOP_STATEMENTS \ - } \ - } \ - } \ - } \ - } \ - lc_control_finish (& lc_lc); \ - typedef lc_loop3vec_##name lc_ensure_proper_nesting; \ - } while (0) - -/* Pre- and post loop statements are inserted around the innermost - loop, which is executed serially. By default these are empty. */ -#define LC_PRELOOP_STATEMENTS -#define LC_POSTLOOP_STATEMENTS - - - + + + /* Replace CCTK_LOOP macros */ -#undef CCTK_LOOP3 -#undef CCTK_ENDLOOP3 -#define CCTK_LOOP3 LC_LOOP3 -#define CCTK_ENDLOOP3 LC_ENDLOOP3 - - - -#ifdef __cplusplus -} +#if !defined CCTK_LOOP3STR_NORMAL || !defined CCTK_ENDLOOP3STR_NORMAL +# error "internal error" #endif - +#undef CCTK_LOOP3STR_NORMAL +#undef CCTK_ENDLOOP3STR_NORMAL +#define CCTK_LOOP3STR_NORMAL(name, i,j,k, ni,nj,nk, \ + idir, jdir, kdir, \ + imin_,jmin,kmin, \ + imax_,jmax,kmax, \ + iash,jash,kash, \ + imin,imax, di) \ + LC_LOOP3STR_NORMAL(name, i,j,k, ni,nj,nk, \ + idir, jdir, kdir, \ + imin_,jmin,kmin, \ + imax_,jmax,kmax, \ + iash,jash,kash, \ + imin,imax, di) +#define CCTK_ENDLOOP3STR_NORMAL(name) \ + LC_ENDLOOP3STR_NORMAL(name) + + + +#ifdef __cplusplus + } #endif - +#endif /* #ifdef CCODE */ #ifdef FCODE # include "loopcontrol_fortran.h" #endif -#endif /* ifndef LC_LOOPCONTROL_H */ +#endif /* #ifndef LOOPCONTROL_H */ |