diff options
author | Erik Schnetter <schnetter@cct.lsu.edu> | 2007-10-10 13:39:00 +0000 |
---|---|---|
committer | Erik Schnetter <schnetter@cct.lsu.edu> | 2007-10-10 13:39:00 +0000 |
commit | 3215a191496ef0f9807d3b574151e91ebc27ca8a (patch) | |
tree | 8536bd7c3c3f0ff0a44d2a7b030b5881b0179b8e /Carpet | |
parent | e15f981b761acb114010667880ef59fd508ee41a (diff) |
LoopControl: Big update, introducing a hill climbing algorithm
darcs-hash:20071010133923-dae7b-a3406485d61ab795191655c89a16e2ae2c487978.gz
Diffstat (limited to 'Carpet')
-rw-r--r-- | Carpet/LoopControl/README | 72 | ||||
-rw-r--r-- | Carpet/LoopControl/param.ccl | 68 | ||||
-rw-r--r-- | Carpet/LoopControl/src/lc_auto.c | 35 | ||||
-rw-r--r-- | Carpet/LoopControl/src/lc_auto.h | 10 | ||||
-rw-r--r-- | Carpet/LoopControl/src/lc_hill.c | 324 | ||||
-rw-r--r-- | Carpet/LoopControl/src/lc_hill.h | 38 | ||||
-rw-r--r-- | Carpet/LoopControl/src/lc_siman.c | 6 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.c | 272 | ||||
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.h | 94 | ||||
-rw-r--r-- | Carpet/LoopControl/src/make.code.defn | 2 |
10 files changed, 805 insertions, 116 deletions
diff --git a/Carpet/LoopControl/README b/Carpet/LoopControl/README index 00a9f04d1..231398bb0 100644 --- a/Carpet/LoopControl/README +++ b/Carpet/LoopControl/README @@ -7,3 +7,75 @@ Purpose of the thorn: Iterate over multi-dimensional arrays in an efficient manner, using OpenMP (if available) and cach-aware loop tiling. + + + +Comments and Ideas: + +Simulated annealing: Does not seem to work reliably. Needs many +iterations, gets stuck in local minima. Does not handle topology +changes well. Doesn't take past performance into account. + + + +Rewrite in C++? This would simplify the data types. + + + +Try loop skewing (see John Shalf's paper). + + + +Assumption: Each iteration is cheap, so that gradual tuning works. + +Design choice: No explicit tuning, works as black box. + +Assumpton: There are so many configurations (which change at run time) +and so many parameter choices (which would need to be tested for each +configuration) that it is not feasible to test all possibilities. + + + + +Idea for a better algorithm: + +Combine several strategies: + +1.a. Robust default setting: + - Topology: split in z direction + - Tiling: [N,1,] (test this more) + +2.a. Local 1D tiling optimisation (based on line searches): + - Globally define some maximum step size + - Choose a direction (i, j, or k) + - Determine a search interval in that direction + - Use exponential search to find interval boundaries + - Use arithmetic search to find minimum + +2.b. Local full tiling optimisation: + - Perform a series of 1D optimisations, until a local minimum in + for all directions has been found + +2.c. Change topology: + - Choose a new, "nearby" tiling + - Start with a good tiling from the previous topology + - Then perform a full tiling optimisation + +2.d. Choose a random new tiling + - Then perform a full tiling optimisation + +2.e. Choose a random new topology + - Then choose a random new tiling + +3.a. Whenever a test is done, repeat it a few times to avoid random + events + - Maybe don't repeat it right away, but wait some time + +3.b. If a value is old, forget it with a certain probability + - Need to remember a "last updated" field with statistics + +4.a. Offer an option to sample a wide range of configurations to give + the user an overview + - Note that a full sampling is too expensive. + - Use a hierarchical strategy for this? + - Use random sampling for this? diff --git a/Carpet/LoopControl/param.ccl b/Carpet/LoopControl/param.ccl index 009ee76e5..f49e73b37 100644 --- a/Carpet/LoopControl/param.ccl +++ b/Carpet/LoopControl/param.ccl @@ -56,7 +56,7 @@ CCTK_INT lc_knpoints "Number of grid points in the k-direction" STEERABLE=recove -# Automatic +# Automatic: simple cycling BOOLEAN cycle_j_tilings "Cycle through all available tilings in the j-direction" STEERABLE=recover { @@ -64,41 +64,91 @@ BOOLEAN cycle_j_tilings "Cycle through all available tilings in the j-direction" -BOOLEAN use_simulated_annealing "Find a good loop configuration through simulated annealing" +# Automatic: simulated annealing + +BOOLEAN use_simulated_annealing "Find a good loop configuration through simulated annealing" STEERABLE=recover { } "no" -CCTK_INT siman_iters_fixed_T "" +CCTK_INT siman_iters_fixed_T "" STEERABLE=recover { 1:* :: "" } 1 -CCTK_REAL siman_probability_change_topology "" +CCTK_REAL siman_probability_change_topology "" STEERABLE=recover { 0:1 :: "" } 0.1 -CCTK_REAL siman_step_size "" +CCTK_REAL siman_step_size "" STEERABLE=recover { (1.0:* :: "" } 3.0 -CCTK_REAL siman_k "energy scale" +CCTK_REAL siman_k "energy scale" STEERABLE=recover { (0:* :: "" } 1.0e-9 -CCTK_REAL siman_T_initial "initial variability" +CCTK_REAL siman_T_initial "initial variability" STEERABLE=recover { (0:* :: "" } 1.0 -CCTK_REAL siman_mu_T "speed" +CCTK_REAL siman_mu_T "speed" STEERABLE=recover { (0:* :: "" } 1.005 -CCTK_REAL siman_T_min "stopping criterion" +CCTK_REAL siman_T_min "stopping criterion" STEERABLE=recover { (0:* :: "" } 0.01 + + + +# Automatic: random restart hill climbing + +BOOLEAN use_random_restart_hill_climbing "http://en.wikipedia.org/wiki/Hill_climbing http://en.wikipedia.org/wiki/Tabu_search" STEERABLE=always +{ +} "no" + +CCTK_REAL maximum_setup_overhead "Maximum allowable administrative overhead" STEERABLE=always +{ + 0.0:* :: "" +} 0.01 + +CCTK_REAL probability_small_jump "Probability for a small jump once a local minimum has been reached" STEERABLE=always +{ + 0.0:1.0 :: "" +} 0.1 + +CCTK_INT small_jump_distance "Maximum distance for a small jump" STEERABLE=always +{ + 0:* :: "" +} 3 + +CCTK_REAL probability_random_jump "Probability for a random jump once a local minimum has been reached" STEERABLE=always +{ + 0.0:1.0 :: "" +} 0.01 + +CCTK_INT max_jump_attempts "Maximum number of attempts to find a random unknown location" STEERABLE=always +{ + 0:* :: "" +} 10 + +CCTK_REAL immediate_overhead_threshold "The maximum overhead (ratio of current to best known time) allowed during an excursion" STEERABLE=always +{ + 0.0:* :: "" +} 1.0 + +CCTK_REAL delayed_overhead_threshold "The maximum overhead (ratio of current to best known time) allowed during an excursion" STEERABLE=always +{ + 0.0:* :: "" +} 0.1 + +CCTK_INT overhead_threshold_delay "Number of steps in an excursion before the delayed overhead criterion is applied" STEERABLE=always +{ + 0:* :: "" +} 20 diff --git a/Carpet/LoopControl/src/lc_auto.c b/Carpet/LoopControl/src/lc_auto.c index 73d9a43fc..96c6b39d7 100644 --- a/Carpet/LoopControl/src/lc_auto.c +++ b/Carpet/LoopControl/src/lc_auto.c @@ -32,7 +32,7 @@ void take_step (const gsl_rng *rng, void *xp_, double step_size) { DECLARE_CCTK_PARAMETERS; - lc_auto_position_t * restrict const xp = xp_; + lc_state_t * restrict const xp = xp_; if (gsl_rng_uniform (rng) < siman_probability_change_topology) { xp->topology = gsl_rng_uniform_int (rng, statset->ntopologies); } @@ -45,8 +45,8 @@ static double distance (void *xp_, void *yp_) { - lc_auto_position_t const * restrict const xp = xp_; - lc_auto_position_t const * restrict const yp = yp_; + lc_state_t const * restrict const xp = xp_; + lc_state_t const * restrict const yp = yp_; double dist = 10 * (xp->topology != yp->topology); for (int d=0; d<3; ++d) { dist += fabs (xp->tiling[d] - yp->tiling[d]); @@ -58,8 +58,8 @@ static void print_position (void *xp_) { - lc_auto_position_t const * restrict const xp = xp_; - printf (" #%2d/[%3d,%3d,%3d]", + lc_state_t const * restrict const xp = xp_; + printf (" %2d/{%3d,%3d,%3d}", xp->topology, xp->tiling[0], xp->tiling[1], xp->tiling[2]); } @@ -68,8 +68,7 @@ print_position (void *xp_) void lc_auto_init (lc_statset_t * restrict const ls, - int * restrict const topology, - int tiling[3]) + lc_state_t * restrict const state) { DECLARE_CCTK_PARAMETERS; assert (! cycle_j_tilings); /* don't mix strategies */ @@ -91,11 +90,8 @@ lc_auto_init (lc_statset_t * restrict const ls, gsl_rng_env_setup(); ls->auto_state->rng = gsl_rng_alloc (gsl_rng_default); - /* Set initial position */ - ls->auto_state->position.topology = * topology; - for (int d=0; d<3; ++d) { - ls->auto_state->position.tiling[d] = tiling[d]; - } + /* Set initial state */ + ls->auto_state->state = * state; /* Initialise simulated annealing state */ statset = ls; @@ -103,8 +99,8 @@ lc_auto_init (lc_statset_t * restrict const ls, lc_siman_solve (NULL, ls->auto_state->rng, NULL, 0.0, - take_step, distance, print_position, - sizeof (lc_auto_position_t), + take_step, distance, verbose ? print_position : NULL, + sizeof (lc_state_t), ls->auto_state->siman_params); } else { @@ -116,17 +112,14 @@ lc_auto_init (lc_statset_t * restrict const ls, ls->auto_state->siman_state = lc_siman_solve (ls->auto_state->siman_state, ls->auto_state->rng, - & ls->auto_state->position, ls->auto_state->time, - take_step, distance, print_position, - sizeof (lc_auto_position_t), + & ls->auto_state->state, ls->auto_state->time, + take_step, distance, verbose ? print_position : NULL, + sizeof (lc_state_t), ls->auto_state->siman_params); } /* Set thread topology and tiling specification */ - * topology = ls->auto_state->position.topology; - for (int d=0; d<3; ++d) { - tiling[d] = ls->auto_state->position.tiling[d]; - } + * state = ls->auto_state->state; } /* if not the first call */ } diff --git a/Carpet/LoopControl/src/lc_auto.h b/Carpet/LoopControl/src/lc_auto.h index a0c7b1798..a89db6aa8 100644 --- a/Carpet/LoopControl/src/lc_auto.h +++ b/Carpet/LoopControl/src/lc_auto.h @@ -8,18 +8,13 @@ -typedef struct lc_auto_position_t { - int topology; - int tiling[3]; -} lc_auto_position_t; - /* no typedef here; forward declcared in loopcontrol.h */ struct lc_auto_state_t { lc_siman_params_t siman_params; gsl_rng * rng; lc_siman_state_t * siman_state; - lc_auto_position_t position; + lc_state_t state; double time; }; @@ -27,8 +22,7 @@ struct lc_auto_state_t { void lc_auto_init (lc_statset_t * restrict const ls, - int * restrict const topology, - int tiling[3]); + lc_state_t * restrict const state); void lc_auto_finish (lc_statset_t * restrict const ls, diff --git a/Carpet/LoopControl/src/lc_hill.c b/Carpet/LoopControl/src/lc_hill.c new file mode 100644 index 000000000..f24b0efae --- /dev/null +++ b/Carpet/LoopControl/src/lc_hill.c @@ -0,0 +1,324 @@ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> + +#include <cctk.h> +#include <cctk_Parameters.h> + +#include "loopcontrol.h" + +#include "lc_hill.h" + + + +static inline +int +imin (int const a, int const b) +{ + return a < b ? a : b; +} + +static inline +int +imax (int const a, int const b) +{ + return a > b ? a : b; +} + +static +double +drand (void) +{ + return rand() / (RAND_MAX + 1.0); +} + +static +int +irand (int const imax) +{ + return rand() / (RAND_MAX + 1.0) * imax; +} + + + +static +double +time_for_stattime (lc_stattime_t const * restrict const lt) +{ + assert (lt); + return lt->time_calc_sum / lt->time_count; +} + +static +double +time_for_state (lc_statset_t const * restrict const ls, + lc_state_t const * restrict const state) +{ + lc_stattime_t const * restrict const lt = lc_stattime_find (ls, state); + assert (lt); + return time_for_stattime (lt); +} + + + +void +lc_hill_init (lc_statset_t * restrict const ls, + lc_state_t * const new_state) +{ + DECLARE_CCTK_PARAMETERS; + + lc_hill_state_t * restrict lh = ls->hill_state; + + /* Initialise state */ + if (! lh) { + if (verbose) { + CCTK_INFO ("Hill climbing: Initialising"); + } + ls->hill_state = malloc (sizeof * ls->hill_state); + lh = ls->hill_state; + lh->iteration = 0; + lh->have_best = 0; + lh->excursion_start = 0; + lh->have_previous = 0; + lh->state = * new_state; + return; + } + + /* If the overhead has become too large, do nothing. */ + if (ls->time_setup_sum > maximum_setup_overhead * ls->time_calc_sum) { + /* Stay at the old state. */ + * new_state = lh->state; + return; + } + + ++ lh->iteration; + + if (verbose) { + CCTK_VInfo (CCTK_THORNSTRING, + "Hill climbing: iter %d, state %2d/{%2d,%2d,%2d}, time %g", + lh->iteration, + lh->state.topology, + lh->state.tiling[0], + lh->state.tiling[1], + lh->state.tiling[2], + lh->time); + } + + /* Test whether we have a new best time */ + if (! lh->have_best || lh->time < lh->best_time) { + /* Remember this state */ + if (verbose) { + CCTK_INFO ("Hill climbing: This is a new best time"); + } + lh->best = lh->state; + lh->best_time = lh->time; + lh->have_best = 1; + lh->excursion_start = lh->iteration; + } else if (lh->have_best && lc_state_equal (& lh->state, & lh->best)) { + /* Update time for best state */ + if (verbose) { + CCTK_INFO ("Hill climbing: Updating best time"); + } + lh->best_time = lh->time; + } + + /* Compare the time for the current state with the time for the + previous state. If the previous state was better, backtrack. */ + if (lh->have_previous && lh->previous_time < lh->time) { + if (verbose) { + CCTK_INFO ("Hill climbing: Backtracking"); + } + lh->state = lh->previous; + lh->time = lh->previous_time; + lh->have_previous = 0; + } + + /* Give up if the current time is too bad */ + if (lh->have_best) { + int const immediate_overhead = + lh->time > lh->best_time * (1.0 + immediate_overhead_threshold); + int const delayed_overhead = + lh->iteration > lh->excursion_start + overhead_threshold_delay && + lh->time > lh->best_time * (1.0 + delayed_overhead_threshold); + if (immediate_overhead || delayed_overhead) { + if (verbose) { + CCTK_INFO ("Hill climbing: Reverting to best known state"); + } + lh->excursion_start = lh->iteration; + lh->state = lh->best; + lh->time = lh->best_time; + } + } + + /* Age the state */ + lh->previous = lh->state; + lh->previous_time = lh->time; + lh->have_previous = 1; + + search:; + + /* Look which neighbours exist. */ + typedef enum { nb_boundary, nb_missing, nb_exists } neighbour_t; + neighbour_t neighbours[3][2]; + lc_state_t nb_state[3][2]; + double nb_time[3][2]; + lc_state_t * nb_nonexist_state[6]; + int num_nonexist_states = 0; + lc_state_t * nb_minimum_time = NULL; + double minimum_time; + for (int d=0; d<3; ++d) { + for (int f=0; f<2; ++f) { + nb_state[d][f] = lh->state; + nb_state[d][f].tiling[d] += f ? + 1: -1; + int const ntilings = ls->topology_ntilings[d][nb_state[d][f].topology]; + if (nb_state[d][f].tiling[d] < 0 || + nb_state[d][f].tiling[d] >= ntilings) + { + neighbours[d][f] = nb_boundary; + } else { + lc_stattime_t const * restrict const nb_lt = + lc_stattime_find (ls, & nb_state[d][f]); + if (! nb_lt) { + neighbours[d][f] = nb_missing; + nb_nonexist_state[num_nonexist_states++] = & nb_state[d][f]; + } else { + neighbours[d][f] = nb_exists; + nb_time[d][f] = time_for_stattime (nb_lt); + if (! nb_minimum_time || nb_time[d][f] < minimum_time) { + nb_minimum_time = & nb_state[d][f]; + minimum_time = nb_time[d][f]; + } + } + } + } + } + + /* If not all neighbours exist, then choose a random neighbour and + move there. */ + if (num_nonexist_states > 0) { + if (verbose) { + CCTK_INFO ("Hill climbing: Examining a new state"); + } + int const choice = irand (num_nonexist_states); + lh->state = * nb_nonexist_state[choice]; + * new_state = lh->state; + return; + } + + /* All neighbours exist. Look whether we are in a local + minimum. */ + assert (nb_minimum_time); + if (minimum_time >= lh->time) { + /* We are in a local minimum. */ + if (verbose) { + CCTK_INFO ("Hill climbing: Local minimum reached"); + } + + /* Every so often take a small jump. */ + if (drand() < probability_small_jump) { + /* Be curious, go somewhere nearby. */ + if (verbose) { + CCTK_INFO ("Hill climbing: Making a small jump"); + } + for (int ntries = 0; ntries < max_jump_attempts; ++ ntries) { + lc_state_t try_state = lh->state; + if (drand() < 0.25) { + /* Change the topology, but not the tiling. */ + try_state.topology = irand (ls->ntopologies); + for (int d=0; d<3; ++d) { + if (try_state.tiling[d] >= + ls->topology_ntilings[d][try_state.topology]) + { + /* The tiling doesn't fit for this new topology; don't + choose this topology. */ + goto next_try; + } + } + } else { + /* Change the tiling a bit, but keep the topology */ + for (int d=0; d<3; ++d) { + int const i0 = + imax (try_state.tiling[d] - small_jump_distance, 0); + int const i1 = + imin (try_state.tiling[d] + small_jump_distance + 1, + ls->topology_ntilings[d][try_state.topology]); + try_state.tiling[d] = i0 + irand (i1 - i0); + } + } + if (! lc_stattime_find (ls, & try_state)) { + lh->state = try_state; + * new_state = lh->state; + return; + } + next_try:; + } + /* Don't jump after all. */ + } + + /* Every so often take a random jump. */ + if (drand() < probability_random_jump) { + /* Be adventurous, go somewhere unknown. */ + if (verbose) { + CCTK_INFO ("Hill climbing: Jumping randomly"); + } + for (int ntries = 0; ntries < max_jump_attempts; ++ ntries) { + lc_state_t try_state; + try_state.topology = irand (ls->ntopologies); + for (int d=0; d<3; ++d) { + try_state.tiling[d] = + irand (ls->topology_ntilings[d][try_state.topology]); + } + if (! lc_stattime_find (ls, & try_state)) { + /* The new state is hitherto unknown, use it. */ + lh->state = try_state; + lh->excursion_start = lh->iteration; + lh->have_previous = 0; /* disable backtracking */ + * new_state = lh->state; + return; + } + } + /* Don't jump after all. */ + } + + /* If the current state is not the best state, give up and go + back. */ + if (! lc_state_equal (& lh->state, & lh->best)) { + /* Revert to the best known state. */ + if (verbose) { + CCTK_INFO ("Hill climbing: Reverting to best known state"); + } + lh->state = lh->best; + lh->excursion_start = lh->iteration; + lh->have_previous = 0; + * new_state = lh->best; + return; + } + + /* Be content, do nothing. */ + if (verbose) { + CCTK_INFO ("Hill climbing: Resting"); + } + * new_state = lh->state; + return; + } + + /* One of the neighbours is better. Move to this neighbour, and + continue the search there. */ + if (verbose) { + CCTK_INFO ("Hill climbing: Found a better neighbour, going there"); + } + lh->state = * nb_minimum_time; + lh->time = minimum_time; + goto search; +} + + + +void +lc_hill_finish (lc_statset_t * restrict const ls, + lc_stattime_t const * restrict const lt) +{ + lc_hill_state_t * restrict const lh = ls->hill_state; + + lh->time = time_for_stattime (lt); +} diff --git a/Carpet/LoopControl/src/lc_hill.h b/Carpet/LoopControl/src/lc_hill.h new file mode 100644 index 000000000..7d418314b --- /dev/null +++ b/Carpet/LoopControl/src/lc_hill.h @@ -0,0 +1,38 @@ +#ifndef LC_HILL_H +#define LC_HILL_H + +#include <cctk.h> + +#include "loopcontrol.h" + + + +/* no typedef here; forward declcared in loopcontrol.h */ +struct lc_hill_state_t { + int iteration; + + lc_state_t best; + double best_time; + int have_best; + + int excursion_start; /* -1 if not on an excursion */ + + lc_state_t previous; + double previous_time; + int have_previous; + + lc_state_t state; + double time; +}; + + + +void +lc_hill_init (lc_statset_t * restrict const ls, + lc_state_t * restrict const state); + +void +lc_hill_finish (lc_statset_t * restrict const ls, + lc_stattime_t const * restrict const lt); + +#endif /* #ifndef LC_HILL_H */ diff --git a/Carpet/LoopControl/src/lc_siman.c b/Carpet/LoopControl/src/lc_siman.c index a6fa22bc7..d34d3da0a 100644 --- a/Carpet/LoopControl/src/lc_siman.c +++ b/Carpet/LoopControl/src/lc_siman.c @@ -28,6 +28,8 @@ #include <gsl/gsl_machine.h> #include <gsl/gsl_rng.h> +#include <cctk.h> + #include "lc_siman.h" static inline @@ -39,11 +41,11 @@ double safe_exp (double x) /* avoid underflow errors for large uphill steps */ /* this structure contains internal state information for lc_siman_solve */ -typedef enum { state_initial, state_first, state_looping } lc_state_t; +typedef enum { state_initial, state_first, state_looping } lc_siman_location_t; /* no typedef here; forward declcared in lc_siman.h */ struct lc_siman_state_t { - lc_state_t state; + lc_siman_location_t state; const gsl_rng *r; lc_siman_step_t take_step; lc_siman_metric_t distance; diff --git a/Carpet/LoopControl/src/loopcontrol.c b/Carpet/LoopControl/src/loopcontrol.c index 32661064b..72fd3e40e 100644 --- a/Carpet/LoopControl/src/loopcontrol.c +++ b/Carpet/LoopControl/src/loopcontrol.c @@ -12,10 +12,13 @@ # include <omp.h> #endif +#include <cctk.h> #include <cctk_Parameters.h> #include "loopcontrol.h" + #include "lc_auto.h" +#include "lc_hill.h" @@ -77,9 +80,9 @@ find_thread_topologies (lc_topology_t * restrict const topologies, int const ni = nthreads/(nj*nk); if (nthreads == ni*nj*nk) { assert (* ntopologies < maxntopologies); - topologies[* ntopologies].ni = ni; - topologies[* ntopologies].nj = nj; - topologies[* ntopologies].nk = nk; + topologies[* ntopologies].nthreads[0] = ni; + topologies[* ntopologies].nthreads[1] = nj; + topologies[* ntopologies].nthreads[2] = nk; ++ * ntopologies; } } @@ -101,6 +104,7 @@ find_thread_topologies (lc_topology_t * restrict const topologies, into two subdomains with [100+N, 100-N] points. This is avoided by covering all possible tiling specifications in exponentially growing step sizes. */ +#if 0 static int tiling_compare (const void * const a, const void * const b) { @@ -154,28 +158,60 @@ find_tiling_specifications (lc_tiling_t * restrict const tilings, /* Sort */ qsort (tilings, * ntilings, sizeof * tilings, tiling_compare); } +#endif + +static +void +find_tiling_specifications (lc_tiling_t * restrict const tilings, + const int maxntilings, + int * restrict const ntilings, + int const npoints) +{ + /* In order to reduce the number of possible tilings, require that + the step sizes differ by more than 10% */ + double const distance_factor = 1.1; + /* For N grid points and a minimum spacing factor F, there are at + most log(N) / log(F) possible tilings. There will be fewer, + since the actual spacings will be rounded up to integers. */ + + * ntilings = 0; + + int minnpoints = 0; + for (int n=1; n<npoints; ++n) { + if ((double) n > minnpoints * distance_factor) { + assert (* ntilings < maxntilings); + tilings[* ntilings].npoints = n; + minnpoints = n; + ++ * ntilings; + } + } + + assert (* ntilings < maxntilings); + tilings[* ntilings].npoints = npoints; + minnpoints = npoints; + ++ * ntilings; +} /* Initialise control parameter set statistics */ -static void lc_stattime_init (lc_stattime_t * restrict const lt, lc_statset_t * restrict const ls, - int const topology, - int const tiling[3]) + lc_state_t const * restrict const state) { DECLARE_CCTK_PARAMETERS; /* Check arguments */ assert (lt); assert (ls); + assert (state); /*** Topology ***************************************************************/ - lt->topology = topology; + lt->state.topology = state->topology; - if (topology == -1) { + if (state->topology == -1) { /* User-specified topology */ lt->inthreads = -1; @@ -184,21 +220,21 @@ lc_stattime_init (lc_stattime_t * restrict const lt, } else { - assert (topology >= 0 && topology < ls->ntopologies); + assert (state->topology >= 0 && state->topology < ls->ntopologies); - lt->inthreads = ls->topologies[lt->topology].ni; - lt->jnthreads = ls->topologies[lt->topology].nj; - lt->knthreads = ls->topologies[lt->topology].nk; + lt->inthreads = ls->topologies[lt->state.topology].nthreads[0]; + lt->jnthreads = ls->topologies[lt->state.topology].nthreads[1]; + lt->knthreads = ls->topologies[lt->state.topology].nthreads[2]; } if (debug) { printf ("Thread topology #%d [%d,%d,%d]\n", - lt->topology, lt->inthreads, lt->jnthreads, lt->knthreads); + lt->state.topology, lt->inthreads, lt->jnthreads, lt->knthreads); } /* Assert thread topology consistency */ - if (lt->topology != -1) { + if (lt->state.topology != -1) { assert (lt->inthreads >= 1); assert (lt->jnthreads >= 1); assert (lt->knthreads >= 1); @@ -208,20 +244,21 @@ lc_stattime_init (lc_stattime_t * restrict const lt, /*** Tilings ****************************************************************/ for (int d=0; d<3; ++d) { - lt->tiling[d] = tiling[d]; - if (tiling[d] != -1) { - assert (tiling[d] >= 0 && tiling[d] < ls->ntilings[d]); + lt->state.tiling[d] = state->tiling[d]; + if (state->tiling[d] != -1) { + assert (state->tiling[d] >= 0 && + state->tiling[d] < ls->topology_ntilings[d][lt->state.topology]); } } - if (tiling[0] != -1) { - lt->inpoints = ls->tilings[0][lt->tiling[0]].npoints; + if (state->tiling[0] != -1) { + lt->inpoints = ls->tilings[0][lt->state.tiling[0]].npoints; } - if (tiling[1] != -1) { - lt->jnpoints = ls->tilings[1][lt->tiling[1]].npoints; + if (state->tiling[1] != -1) { + lt->jnpoints = ls->tilings[1][lt->state.tiling[1]].npoints; } - if (tiling[2] != -1) { - lt->knpoints = ls->tilings[2][lt->tiling[2]].npoints; + if (state->tiling[2] != -1) { + lt->knpoints = ls->tilings[2][lt->state.tiling[2]].npoints; } if (debug) { @@ -230,13 +267,13 @@ lc_stattime_init (lc_stattime_t * restrict const lt, } /* Assert tiling specification consistency */ - if (tiling[0] != -1) { + if (state->tiling[0] != -1) { assert (lt->inpoints > 0); } - if (tiling[1] != -1) { + if (state->tiling[1] != -1) { assert (lt->jnpoints > 0); } - if (tiling[2] != -1) { + if (state->tiling[2] != -1) { assert (lt->knpoints > 0); } @@ -249,6 +286,10 @@ lc_stattime_init (lc_stattime_t * restrict const lt, lt->time_calc_sum = 0.0; lt->time_calc_sum2 = 0.0; + lt->last_updated = 0.0; /* never updated */ + + + /* Append to loop statistics list */ /* _Pragma ("omp critical") { */ lt->next = ls->stattime_list; @@ -256,38 +297,49 @@ lc_stattime_init (lc_stattime_t * restrict const lt, /* } */ } -static lc_stattime_t * -lc_stattime_find (lc_statset_t * restrict const ls, - int const topology, - int const tiling[3]) +lc_stattime_find (lc_statset_t const * restrict const ls, + lc_state_t const * restrict const state) { assert (ls); lc_stattime_t * lt; for (lt = ls->stattime_list; lt; lt = lt->next) { - if (lt->topology == topology && - lt->tiling[0] == tiling[0] && - lt->tiling[1] == tiling[1] && - lt->tiling[2] == tiling[2]) - { + if (lc_state_equal (& lt->state, state)) { + break; + } + } + + return lt; +} + +lc_stattime_t * +lc_stattime_find_create (lc_statset_t * restrict const ls, + lc_state_t const * restrict const state) +{ + assert (ls); + + lc_stattime_t * lt; + + for (lt = ls->stattime_list; lt; lt = lt->next) { + if (lc_state_equal (& lt->state, state)) { break; } } if (! lt) { lt = malloc (sizeof * lt); - lc_stattime_init (lt, ls, topology, tiling); + lc_stattime_init (lt, ls, state); } + assert (lt); return lt; } /* Initialise user parameter set statistics */ -static void lc_statset_init (lc_statset_t * restrict const ls, lc_statmap_t * restrict const lm, @@ -326,7 +378,9 @@ lc_statset_init (lc_statset_t * restrict const ls, for (int n = 0; n < ls->ntopologies; ++n) { printf (" %2d: %2d %2d %2d\n", n, - ls->topologies[n].ni, ls->topologies[n].nj, ls->topologies[n].nk); + ls->topologies[n].nthreads[0], + ls->topologies[n].nthreads[1], + ls->topologies[n].nthreads[2]); } } @@ -344,14 +398,28 @@ lc_statset_init (lc_statset_t * restrict const ls, printf ("Dimension %d: %d points\n", d, ls->npoints[d]); } ls->tilings[d] = malloc (maxntilings * sizeof * ls->tilings[d]); + int ntilings; find_tiling_specifications (ls->tilings[d], maxntilings, & ls->ntilings[d], ls->npoints[d]); #if 0 ls->tilings[d] = realloc (ls->tilings[d], ls->ntilings[d] * sizeof * ls->tilings[d]); #endif + ls->topology_ntilings[d] = + malloc (ls->ntopologies * sizeof * ls->topology_ntilings[d]); + for (int n = 0; n < ls->ntopologies; ++n) { + int tiling; + for (tiling = 0; tiling < ls->ntilings[d]; ++tiling) { + if (ls->tilings[d][tiling].npoints * ls->topologies[n].nthreads[d] > + ls->npoints[d]) + { + break; + } + } + ls->topology_ntilings[d][n] = tiling; + } if (debug) { - printf (" Found %d possible tilings\n", ls->ntilings[d]); + printf (" Found %d possible tilings\n", ntilings); printf (" "); for (int n = 0; n < ls->ntilings[d]; ++n) { printf (" %d", ls->tilings[d][n].npoints); @@ -365,11 +433,21 @@ lc_statset_init (lc_statset_t * restrict const ls, /* Simulated annealing state */ ls->auto_state = NULL; + /* Hill climbing state */ + ls->hill_state = NULL; + /* Initialise list */ ls->stattime_list = NULL; + /* Initialise statistics */ + ls->time_count = 0.0; + ls->time_setup_sum = 0.0; + ls->time_setup_sum2 = 0.0; + ls->time_calc_sum = 0.0; + ls->time_calc_sum2 = 0.0; + /* Append to loop statistics list */ /* _Pragma ("omp critical") { */ ls->next = lm->statset_list; @@ -377,9 +455,8 @@ lc_statset_init (lc_statset_t * restrict const ls, /* } */ } -static lc_statset_t * -lc_statset_find (lc_statmap_t * restrict const lm, +lc_statset_find (lc_statmap_t const * restrict const lm, int const num_threads, int const npoints[3]) { @@ -397,11 +474,34 @@ lc_statset_find (lc_statmap_t * restrict const lm, } } + return ls; +} + +lc_statset_t * +lc_statset_find_create (lc_statmap_t * restrict const lm, + int const num_threads, + int const npoints[3]) +{ + assert (lm); + + lc_statset_t * ls; + + for (ls = lm->statset_list; ls; ls = ls->next) { + if (ls->num_threads == num_threads && + ls->npoints[0] == npoints[0] && + ls->npoints[1] == npoints[1] && + ls->npoints[2] == npoints[2]) + { + break; + } + } + if (! ls) { ls = malloc (sizeof * ls); lc_statset_init (ls, lm, num_threads, npoints); } + assert (ls); return ls; } @@ -472,7 +572,7 @@ lc_control_init (lc_control_t * restrict const lc, npoints[1] = lc_max (jmax - jmin, 0); npoints[2] = lc_max (kmax - kmin, 0); - ls = lc_statset_find (lm, num_threads, npoints); + ls = lc_statset_find_create (lm, num_threads, npoints); } @@ -480,9 +580,9 @@ lc_control_init (lc_control_t * restrict const lc, lc_stattime_t * restrict lt; _Pragma ("omp single copyprivate (lt)") { - /* Select topology */ + lc_state_t state; - int topology; + /* Select topology */ if (lc_inthreads != -1 || lc_jnthreads != -1 || lc_knthreads != -1) { @@ -490,69 +590,73 @@ lc_control_init (lc_control_t * restrict const lc, if (lc_inthreads == -1 || lc_jnthreads == -1 || lc_knthreads == -1) { CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, - "Illegal thread topology [%d,%d,%d] specified\n", + "Illegal thread topology [%d,%d,%d] specified", (int)lc_inthreads, (int)lc_jnthreads, (int)lc_knthreads); } if (lc_inthreads * lc_jnthreads * lc_knthreads != ls->num_threads) { CCTK_VWarn (CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, - "Specified thread topology [%d,%d,%d] is not compatible with the number of threads %d\n", + "Specified thread topology [%d,%d,%d] is not compatible with the number of threads %d", (int)lc_inthreads, (int)lc_jnthreads, (int)lc_knthreads, ls->num_threads); } - topology = -1; + state.topology = -1; } else { /* Split in the k direction */ - topology = ls->ntopologies - 1; + state.topology = ls->ntopologies - 1; } /* Select tiling */ - int tiling[3]; - if (lc_inpoints != -1) { /* User-specified tiling */ - tiling[0] = -1; + state.tiling[0] = -1; } else { - tiling[0] = ls->ntilings[0] - 1; /* as many points as possible */ + /* as many points as possible */ + state.tiling[0] = ls->topology_ntilings[0][state.topology] - 1; } if (lc_jnpoints != -1) { /* User-specified tiling */ - tiling[1] = -1; + state.tiling[1] = -1; } else { if (cycle_j_tilings) { /* cycle through all tilings */ static int count = 0; - tiling[1] = (count ++) % ls->ntilings[1]; + state.tiling[1] = (count ++) % ls->topology_ntilings[1][state.topology]; } else { /* as few points as possible */ - tiling[1] = 0; + state.tiling[1] = 0; } } if (lc_knpoints != -1) { /* User-specified tiling */ - tiling[2] = -1; + state.tiling[2] = -1; } else { - tiling[2] = ls->ntilings[2] - 1; /* as many points as possible */ + /* as many points as possible */ + state.tiling[2] = ls->topology_ntilings[2][state.topology] - 1; } /* Use simulated annealing to find the best loop configuration */ if (use_simulated_annealing) { - lc_auto_init (ls, & topology, tiling); + lc_auto_init (ls, & state); + } + /* Use hill climbing to find the best loop configuration */ + if (use_random_restart_hill_climbing) { + lc_hill_init (ls, & state); } /* Find or create database entry */ - lt = lc_stattime_find (ls, topology, tiling); + lt = lc_stattime_find_create (ls, & state); /* Topology */ - if (topology == -1) { + if (state.topology == -1) { /* User-specified topology */ lt->inthreads = lc_inthreads; lt->jnthreads = lc_jnthreads; @@ -567,15 +671,15 @@ lc_control_init (lc_control_t * restrict const lc, /* Tilings */ - if (tiling[0] == -1) { + if (state.tiling[0] == -1) { /* User-specified tiling */ lt->inpoints = lc_inpoints; } - if (tiling[1] == -1) { + if (state.tiling[1] == -1) { /* User-specified tiling */ lt->jnpoints = lc_jnpoints; } - if (tiling[2] == -1) { + if (state.tiling[2] == -1) { /* User-specified tiling */ lt->knpoints = lc_knpoints; } @@ -663,6 +767,7 @@ lc_control_finish (lc_control_t * restrict const lc) /* Update statistics */ lc_stattime_t * restrict const lt = lc->stattime; + lc_statset_t * restrict const ls = lc->statset; _Pragma ("omp critical") { lt->time_count += 1.0; @@ -671,19 +776,35 @@ lc_control_finish (lc_control_t * restrict const lc) lt->time_calc_sum += time_calc_sum; lt->time_calc_sum2 += time_calc_sum2; + + ls->time_count += 1.0; + + ls->time_setup_sum += time_setup_sum; + ls->time_setup_sum2 += time_setup_sum2; + + ls->time_calc_sum += time_calc_sum; + ls->time_calc_sum2 += time_calc_sum2; } + _Pragma ("omp master") { + lt->last_updated = time_calc_end; + } + + _Pragma ("omp barrier"); + { DECLARE_CCTK_PARAMETERS; if (use_simulated_annealing) { _Pragma ("omp single") { - lc_statset_t * restrict const ls = lc->statset; lc_auto_finish (ls, lt); } } + if (use_random_restart_hill_climbing) { + _Pragma ("omp single") { + lc_hill_finish (ls, lt); + } + } } - - _Pragma ("omp barrier"); } @@ -722,6 +843,7 @@ lc_printstats (CCTK_ARGUMENTS) printf (" statset #%d nthreads=%d npoints=[%d,%d,%d]\n", nsets, ls->num_threads, ls->npoints[0], ls->npoints[1], ls->npoints[2]); + double sum_count = 0.0; double sum_setup = 0.0; double sum_calc = 0.0; double min_calc = DBL_MAX; @@ -730,20 +852,26 @@ lc_printstats (CCTK_ARGUMENTS) for (lc_stattime_t * lt = ls->stattime_list; lt; lt = lt->next) { printf (" stattime #%d topology=%d [%d,%d,%d] tiling=[%d,%d,%d]\n", ntimes, - lt->topology, lt->inthreads, lt->jnthreads, lt->knthreads, + lt->state.topology, lt->inthreads, lt->jnthreads, lt->knthreads, lt->inpoints, lt->jnpoints, lt->knpoints); + double const count = lt->time_count; + double const setup = lt->time_setup_sum / count; + double const calc = lt->time_calc_sum / count; printf (" count: %g setup: %g calc: %g\n", - lt->time_count, lt->time_setup_sum, lt->time_calc_sum); + count, setup, calc); + sum_count += lt->time_count; sum_setup += lt->time_setup_sum; sum_calc += lt->time_calc_sum; - if (lt->time_calc_sum < min_calc) { - min_calc = lt->time_calc_sum; + if (calc < min_calc) { + min_calc = calc; imin_calc = ntimes; } ++ ntimes; } - printf (" total setup: %g total calc: %g min calc: %g (#%d)\n", - sum_setup, sum_calc, min_calc, imin_calc); + printf (" total count: %g total setup: %g total calc: %g\n", + sum_count, sum_setup, sum_calc); + printf (" min calc: %g (#%d)\n", + min_calc, imin_calc); ++ nsets; } ++ nmaps; diff --git a/Carpet/LoopControl/src/loopcontrol.h b/Carpet/LoopControl/src/loopcontrol.h index 43ac5784b..5d640c3b6 100644 --- a/Carpet/LoopControl/src/loopcontrol.h +++ b/Carpet/LoopControl/src/loopcontrol.h @@ -36,7 +36,7 @@ /* A topology */ typedef struct lc_topology_t { - int ni, nj, nk; + int nthreads[3]; } lc_topology_t; /* A tiling specification */ @@ -46,9 +46,19 @@ typedef struct lc_tiling_t { +typedef struct lc_state_t { + int topology; + int tiling[3]; +} lc_state_t; + + + /* For simulated annealing */ typedef struct lc_auto_state_t lc_auto_state_t; +/* For hill climbing */ +typedef struct lc_hill_state_t lc_hill_state_t; + /* Statistics for one control parameter set (thread topology and @@ -58,9 +68,8 @@ typedef struct lc_stattime_t { /* Keys */ - int topology; + lc_state_t state; int inthreads, jnthreads, knthreads; - int tiling[3]; int inpoints, jnpoints, knpoints; /* Data */ @@ -69,8 +78,12 @@ typedef struct lc_stattime_t { double time_count; /* number of calls and threads */ double time_setup_sum, time_setup_sum2; /* time spent setting up loops */ double time_calc_sum, time_calc_sum2; /* time spent iterating */ + + double last_updated; /* wall time tag */ } lc_stattime_t; + + /* Statistics for one user parameter set (number of threads and number of iterations) of one loop */ typedef struct lc_statset_t { @@ -90,13 +103,24 @@ typedef struct lc_statset_t { /* Tiling specifications */ lc_tiling_t * restrict tilings[3]; int ntilings[3]; + int * restrict topology_ntilings[3]; /* [dim][topology] */ /* Simulated annealing state */ lc_auto_state_t * auto_state; + /* Hill climbing state */ + lc_hill_state_t * hill_state; + lc_stattime_t * stattime_list; + + /* Statistics */ + double time_count; /* number of calls and threads */ + double time_setup_sum, time_setup_sum2; /* time spent setting up loops */ + double time_calc_sum, time_calc_sum2; /* time spent iterating */ } lc_statset_t; + + /* Statistics for one loop (one source code location) */ typedef struct lc_statmap_t { struct lc_statmap_t * next; /* for linked list */ @@ -107,11 +131,75 @@ typedef struct lc_statmap_t { lc_statset_t * statset_list; } lc_statmap_t; + + /* Linked list of all loop statistics structures */ extern lc_statmap_t * lc_statmap_list; +static inline +int +lc_state_valid (lc_statset_t const * restrict const ls, + lc_state_t const * restrict const state) +{ + if (state->topology >= 0 && state->topology < ls->ntopologies) { + int const * restrict const ntilings = + ls->topology_ntilings[state->topology]; + return (state->tiling[0] >= 0 && state->tiling[0] < ntilings[0] && + state->tiling[1] >= 0 && state->tiling[1] < ntilings[1] && + state->tiling[2] >= 0 && state->tiling[2] < ntilings[2]); + } + return 0; +} + +static inline +int +lc_state_equal (lc_state_t const * restrict const state1, + lc_state_t const * restrict const state2) +{ + return (state1->topology == state2->topology && + state1->tiling[0] == state2->tiling[0] && + state1->tiling[1] == state2->tiling[1] && + state1->tiling[2] == state2->tiling[2]); +} + + + +void +lc_stattime_init (lc_stattime_t * restrict const lt, + lc_statset_t * restrict const ls, + lc_state_t const * restrict const state); + +lc_stattime_t * +lc_stattime_find (lc_statset_t const * restrict const ls, + lc_state_t const * restrict const state); + +lc_stattime_t * +lc_stattime_find_create (lc_statset_t * restrict const ls, + lc_state_t const * restrict const state); + + + +/* TODO: introduce type for num_threads and npoints[3] */ +void +lc_statset_init (lc_statset_t * restrict const ls, + lc_statmap_t * restrict const lm, + int const num_threads, + int const npoints[3]); + +lc_statset_t * +lc_statset_find (lc_statmap_t const * restrict const lm, + int const num_threads, + int const npoints[3]); + +lc_statset_t * +lc_statset_find_create (lc_statmap_t * restrict const lm, + int const num_threads, + int const npoints[3]); + + + typedef struct lc_control_t { lc_statmap_t * restrict statmap; lc_statset_t * restrict statset; diff --git a/Carpet/LoopControl/src/make.code.defn b/Carpet/LoopControl/src/make.code.defn index 808a7baae..426f7ee52 100644 --- a/Carpet/LoopControl/src/make.code.defn +++ b/Carpet/LoopControl/src/make.code.defn @@ -1,7 +1,7 @@ # Main make.code.defn file for thorn LoopControl # Source files in this directory -SRCS = lc_auto.c lc_siman.c loopcontrol.c loopcontrol.F90 loopcontrol_types.F90 +SRCS = loopcontrol.c loopcontrol.F90 loopcontrol_types.F90 lc_auto.c lc_siman.c lc_hill.c # Subdirectories containing source files SUBDIRS = |