diff options
author | Erik Schnetter <schnetter@gmail.com> | 2013-01-16 14:46:17 -0500 |
---|---|---|
committer | Erik Schnetter <schnetter@gmail.com> | 2013-01-16 14:46:17 -0500 |
commit | e0ddb73239c73c6de42a01204194173ce65ebff4 (patch) | |
tree | d0380bec049f7807836d68a0001f2ec492f62d0e /Carpet/LoopControl/src/loopcontrol.cc | |
parent | 1cddd960d62da42ccd111022f1326740f688b48d (diff) |
LoopControl: Rewrite
Rewrite code in C++.
Remove dependency on GSL.
Modify algorithm; now traverses arrays bottom-up (by splitting the looping region into equal-sized blocks) instead of top-down (splitting the region into a certain number of blocks)
Make multi-threading dynamic
Support SMT (hyper-threading), i.e. threads which share the same cache
Diffstat (limited to 'Carpet/LoopControl/src/loopcontrol.cc')
-rw-r--r-- | Carpet/LoopControl/src/loopcontrol.cc | 827 |
1 files changed, 827 insertions, 0 deletions
diff --git a/Carpet/LoopControl/src/loopcontrol.cc b/Carpet/LoopControl/src/loopcontrol.cc new file mode 100644 index 000000000..2c37a499c --- /dev/null +++ b/Carpet/LoopControl/src/loopcontrol.cc @@ -0,0 +1,827 @@ +#include "loopcontrol.h" + +#include <cctk.h> +#include <cctk_Arguments.h> +#include <cctk_Parameters.h> + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <limits> +#include <list> +#include <ostream> +#include <vector> + +#ifdef _OPENMP +# include <omp.h> +#else + +// Simple non-OpenMP implementations +static inline int omp_get_num_threads() { return 1; } +static inline int omp_get_thread_num() { return 0; } + +#endif + +#ifdef HAVE_CAPABILITY_CYCLECLOCK +// We have a fast, accurate clock + +# include <cycleclock.h> + +#else + +# ifdef _OPENMP +// We use the OpenMP clock + +typedef double ticks; +static inline ticks getticks() { return omp_get_wtime(); } +static inline double elapsed(ticks t1, ticks t0) { return t1-t0; } +static inline double seconds_per_tick() { return 1.0; } + +# else +// We use gettimeofday as fallback + +# include <sys/time.h> +typedef timeval ticks; +static inline ticks getticks() +{ + timeval tp; + gettimeofday(&tp, NULL); + return tp; +} +static inline double elapsed(ticks t1, ticks t0) +{ + return 1.0e+6 * (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec); +} +static inline double seconds_per_tick() { return 1.0e-6; } + +# endif +#endif + +using namespace std; + + + +struct lc_thread_info_t { + char padding1[128]; // pad to ensure cache lines are not shared + int idx; // linear index of next coarse thread block + char padding2[128]; +}; + +struct lc_fine_thread_comm_t { + char padding1[128]; // pad to ensure cache lines are not shared + int state; // waiting threads + int value; // broadcast value + char padding2[128]; +}; + + + +struct lc_stats_t { + char const* name; + char const* file; + int line; + int init_count; + double points, threads; + double count, sum, sum2, min, max; + ticks start_time; +}; + + + +extern "C" CCTK_FCALL +void CCTK_FNAME(lc_get_fortran_type_sizes) (ptrdiff_t* type_sizes); + + + +namespace { + + list<lc_stats_t*> all_stats; + + + + void check_fortran_type_sizes() + { + ptrdiff_t type_sizes[4]; + CCTK_FNAME(lc_get_fortran_type_sizes) (type_sizes); + assert(type_sizes[0] == sizeof(lc_vec_t)); + assert(type_sizes[1] == sizeof(lc_ivec_t)); + assert(type_sizes[2] == sizeof(lc_space_t)); + assert(type_sizes[3] == sizeof(lc_control_t)); + } + + + + template<typename T> + T divup(T const i, T const j) + { + assert(i >= 0 and j > 0); + return (i + j - 1) / j; + } + + template<typename T> + T alignup(T const i, T const j) + { + return divup(i, j) * j; + } + + template<typename T> + T divdown(T const i, T const j) + { + assert(i >= 0 and j > 0); + return i / j; + } + + template<typename T> + T aligndown(T const i, T const j) + { + return divdown(i, j) * j; + } + + + + ostream& operator<<(ostream& os, lc_vec_t const& x) + { + os << "["; + for (int d=0; d<LC_DIM; ++d) { + if (d>0) os << ","; + os << x.v[d]; + } + os << "]"; + return os; + } + + ostream& operator<<(ostream& os, lc_ivec_t const& x) + { + os << "["; + for (int d=0; d<LC_DIM; ++d) { + if (d>0) os << ","; + os << x.v[d]; + } + os << "]"; + return os; + } + + ostream& operator<<(ostream& os, lc_space_t const& s) + { + os << "{" + << "min:" << s.min << "," + << "max:" << s.max << "," + << "step:" << s.step << "," + << "pos:" << s.pos << "," + << "count:" << s.count << "," + << "idx:" << s.idx + << "}"; + return os; + } + + ostream& operator<<(ostream& os, lc_control_t const& c) + { + os << "lc_control{\n" + << " ash:" << c.ash << ",\n" + << " loop:" << c.loop << ",\n" + << " thread:" << c.thread << ",\n" + << " coarse:" << c.coarse << ",\n" + << " fine:" << c.fine << "\n" + << " fine_thread:" << c.fine_thread << "\n" + << "}\n"; + return os; + } + + + + ptrdiff_t prod(lc_vec_t const& x) + { + ptrdiff_t r = 1; + for (int d=0; d<LC_DIM; ++d) { + assert(x.v[d] >= 0); + r *= x.v[d]; + } + return r; + } + + ptrdiff_t ind(lc_vec_t const& shape, lc_vec_t const& pos) + { + ptrdiff_t r = 0; + ptrdiff_t f = 1; + for (int d=0; d<LC_DIM; ++d) { + assert(pos.v[d] >= 0 and pos.v[d] < shape.v[d]); + r += f * pos.v[d]; + assert(shape.v[d] >= 0); + f *= shape.v[d]; + } + return r; + } + + ptrdiff_t ind(lc_vec_t const& shape, + ptrdiff_t const i, ptrdiff_t const j, ptrdiff_t const k) + { + lc_vec_t const pos = {{ i, j, k }}; + return ind(shape, pos); + } + + + + void space_set_count(lc_space_t& space) + { + for (int d=0; d<LC_DIM; ++d) { + space.count.v[d] = + divup(space.max.v[d] - space.min.v[d], space.step.v[d]); + } + } + + void space_idx2pos(lc_space_t& space) + { + for (int d=0; d<LC_DIM; ++d) { + space.pos.v[d] = space.min.v[d] + space.idx.v[d] * space.step.v[d]; + } + } + + bool space_global2local(lc_space_t& space, int gidx) + { + assert(gidx >= 0); + for (int d=0; d<LC_DIM; ++d) { + if (space.count.v[d] > 0) { + space.idx.v[d] = gidx % space.count.v[d]; + gidx /= space.count.v[d]; + } else { + space.idx.v[d] = 0; + } + } + return gidx != 0; + } + + int space_local2global(lc_space_t const& space) + { + int gidx = 0; + int fact = 1; + for (int d=0; d<LC_DIM; ++d) { + assert(space.idx.v[d] >= 0 and space.idx.v[d] < space.count.v[d]); + gidx += fact * space.idx.v[d]; + fact *= space.count.v[d]; + } + return gidx; + } + + + + int get_num_fine_threads() + { + DECLARE_CCTK_PARAMETERS; + if (not use_smt_threads) return 1; + if (omp_get_num_threads() == 1) return 1; + static int num_smt_threads = -1; + if (CCTK_BUILTIN_EXPECT(num_smt_threads<0, false)) { +#pragma omp barrier + 0; // PGI compiler needs this +#pragma omp master + if (CCTK_IsFunctionAliased("GetNumSMTThreads")) { + num_smt_threads = GetNumSMTThreads(); + } else { + num_smt_threads = 1; + } +#pragma omp barrier + } + return num_smt_threads; + } + + int get_fine_thread_num() + { + DECLARE_CCTK_PARAMETERS; + if (not use_smt_threads) return 0; + if (omp_get_num_threads() == 1) return 0; + int const thread_num = omp_get_thread_num(); + int const num_fine_threads = get_num_fine_threads(); + return thread_num % num_fine_threads; + } + + int get_num_coarse_threads() + { + int const num_threads = omp_get_num_threads(); + int const num_fine_threads = get_num_fine_threads(); + assert(num_threads % num_fine_threads == 0); + return num_threads / num_fine_threads; + } + + int get_coarse_thread_num() + { + int const thread_num = omp_get_thread_num(); + int const num_fine_threads = get_num_fine_threads(); + return thread_num / num_fine_threads; + } + + // Wait until *ptr is different from old_value + void thread_wait(int const *const ptr, int const old_value) + { + while (*ptr == old_value) { +#pragma omp flush + 0; // PGI compiler needs this + } + } + + int fine_thread_broadcast(lc_fine_thread_comm_t* const comm, int value) + { + int const num_fine_threads = get_num_fine_threads(); + if (num_fine_threads == 1) return value; + assert(num_fine_threads < 8 * int(sizeof comm->state)); + int const fine_thread_num = get_fine_thread_num(); + int const master_mask = 1; + + // Assume comm->count == 0 initially + if (fine_thread_num == 0) { // if on master + + int const all_threads_mask = (1 << num_fine_threads) - 1; + if (comm->state != 0) { + // wait until everybody has acknowledged the previous value +#pragma omp flush + for (;;) { + int const state = comm->state; + if (comm->state == all_threads_mask) break; + thread_wait(&comm->state, state); + } + // mark the value as invalid + comm->state = 0; +#pragma omp flush + } + // publish value + comm->value = value; +#pragma omp flush + // mark the value as valid + comm->state = master_mask; +#pragma omp flush + + } else { // if not on master + + // wait until the value is valid, and it is a new value + int const thread_mask = 1 << fine_thread_num; +#pragma omp flush + for (;;) { + int const state = comm->state; + if ((comm->state & (master_mask | thread_mask)) == master_mask) break; + thread_wait(&comm->state, state); + } + // read value + value = comm->value; +#pragma omp flush + 0; // PGI compiler needs this + // acknowledge the value +#pragma omp atomic + comm->state |= thread_mask; +#pragma omp flush + + } // if not on master + + return value; + } + +} // namespace + + + +void lc_stats_init(lc_stats_t** const stats_ptr, + char const* const name, + char const* const file, + int const line) +{ + if (*stats_ptr) return; + + lc_stats_t* stats; +#pragma omp single copyprivate(stats) + { + stats = new lc_stats_t; + + stats->name = strdup(name); + stats->file = strdup(file); + stats->line = line; + stats->init_count = 0; + stats->points = 0.0; + stats->threads = 0.0; + stats->count = 0.0; + stats->sum = 0.0; + stats->sum2 = 0.0; + stats->min = numeric_limits<double>::max(); + stats->max = 0.0; + + all_stats.push_back(stats); + } +#pragma omp single + { + *stats_ptr = stats; + } +} + + + +void lc_control_init(lc_control_t* restrict const control, + lc_stats_t* const stats, + ptrdiff_t imin, ptrdiff_t jmin, ptrdiff_t kmin, + ptrdiff_t imax, ptrdiff_t jmax, ptrdiff_t kmax, + ptrdiff_t iash, ptrdiff_t jash, ptrdiff_t kash, + ptrdiff_t di, ptrdiff_t dj, ptrdiff_t dk) +{ + DECLARE_CCTK_PARAMETERS; + +#pragma omp barrier +#pragma omp master + { + stats->start_time = getticks(); + } + + // Ensure thread counts are consistent + assert(get_num_coarse_threads() * get_num_fine_threads() == + omp_get_num_threads()); + + // Get cache line size + static ptrdiff_t max_cache_linesize = -1; + if (CCTK_BUILTIN_EXPECT(max_cache_linesize<0, false)) { +#pragma omp barrier +#pragma omp master + if (CCTK_IsFunctionAliased("GetCacheInfo1")) { + int const num_levels = GetCacheInfo1(NULL, NULL, 0); + vector<int> linesizes(num_levels); + vector<int> strides (num_levels); + GetCacheInfo1(&linesizes[0], &strides[0], num_levels); + max_cache_linesize = 1; + for (int level=0; level<num_levels; ++level) { + max_cache_linesize = + max(max_cache_linesize, ptrdiff_t(linesizes[level])); + } + } else { + max_cache_linesize = 1; + } +#pragma omp barrier + } + + // Initialize everything with a large, bogus value + memset(control, 123, sizeof *control); + + ptrdiff_t tilesize_alignment = 1; + if (align_with_cachelines) { + tilesize_alignment = + divup(max_cache_linesize, ptrdiff_t(sizeof(CCTK_REAL))); + tilesize_alignment = alignup(tilesize_alignment, di); + } + + // Parameters (all in units of grid points) + // ptrdiff_t const smt_size[LC_DIM] = { smtsize_i, smtsize_j, smtsize_k }; + // TODO: put fine threads into i direction, so that they share cache + // lines + ptrdiff_t smt_size[LC_DIM] = { 1, 1, 1 }; + { + int const num_fine_threads = get_num_fine_threads(); + if (num_fine_threads <= loopsize_j) { + smt_size[1] = num_fine_threads; + } else if (num_fine_threads <= loopsize_k) { + smt_size[2] = num_fine_threads; + } else { + smt_size[0] = num_fine_threads; + } + } + ptrdiff_t const tile_size[LC_DIM] = { + alignup(ptrdiff_t(tilesize_i), tilesize_alignment), + tilesize_j, + tilesize_k, + }; + ptrdiff_t const loop_size[LC_DIM] = { loopsize_i, loopsize_j, loopsize_k }; + + // Arguments + ptrdiff_t const loop_min[LC_DIM] = { imin, jmin, kmin }; + ptrdiff_t const loop_max[LC_DIM] = { imax, jmax, kmax }; + ptrdiff_t const ash[LC_DIM] = { iash, jash, kash }; + ptrdiff_t const vect_size[LC_DIM] = { di, dj, dk }; + + // Copy ash arguments + for (int d=0; d<LC_DIM; ++d) { + control->ash.v[d] = ash[d]; + } + + // Set up multithreading state + lc_thread_info_t* thread_info_ptr; +#pragma omp single copyprivate(thread_info_ptr) + { + thread_info_ptr = new lc_thread_info_t; + } + control->thread_info_ptr = thread_info_ptr; + + { + lc_fine_thread_comm_t** fine_thread_comm_ptrs; +#pragma omp single copyprivate(fine_thread_comm_ptrs) + { + fine_thread_comm_ptrs = + new lc_fine_thread_comm_t*[get_num_coarse_threads()]; + } + if (get_fine_thread_num() == 0) { + lc_fine_thread_comm_t* fine_thread_comm_ptr; + fine_thread_comm_ptr = new lc_fine_thread_comm_t; + fine_thread_comm_ptr->state = 0; + fine_thread_comm_ptrs[get_coarse_thread_num()] = fine_thread_comm_ptr; + } +#pragma omp barrier + control->fine_thread_comm_ptr = + fine_thread_comm_ptrs[get_coarse_thread_num()]; +#pragma omp barrier +#pragma omp single nowait + { + delete[] fine_thread_comm_ptrs; + } + } + + // Set loop sizes + for(int d=0; d<LC_DIM; ++d) { + // Overall loop: as specified + control->loop.min.v[d] = loop_min[d]; + control->loop.max.v[d] = loop_max[d]; + // Thread loop +#if VECTORISE_ALIGNED_ARRAYS + // Move start to be aligned with vector size + control->thread.min.v[d] = + aligndown(control->loop.min.v[d], vect_size[d]); +#else + control->thread.min.v[d] = control->loop.min.v[d]; +#endif + control->thread.max.v[d] = loop_max[d]; + // Fine threads + control->fine_thread.count.v[d] = smt_size[d]; + } + { + int const fine_thread_num = get_fine_thread_num(); + bool const outside = + space_global2local(control->fine_thread, fine_thread_num); + assert(not outside); + } + + // Set loop step sizes + if (CCTK_EQUALS(initial_setup, "legacy")) { + // Like a non-LoopControl loop: no loop tiling (i.e. do not use + // coarse loops), parallelise only in k direction (i.e. assign + // equal k ranges to threads) + for(int d=0; d<LC_DIM; ++d) { + assert(smt_size[d] == 1); // TODO: implement this + control->fine_thread.step.v[d] = vect_size[d]; + control->fine.step.v[d] = vect_size[d]; + ptrdiff_t const npoints = control->loop.max.v[d] - control->loop.min.v[d]; + ptrdiff_t const nthreads = d!=LC_DIM-1 ? 1 : get_num_coarse_threads(); + control->coarse.step.v[d] = + alignup(divup(npoints, nthreads), control->fine.step.v[d]); + control->thread.step.v[d] = alignup(npoints, control->coarse.step.v[d]); + } + } else if (CCTK_EQUALS(initial_setup, "tiled")) { + // Basic LoopControl setup + for(int d=0; d<LC_DIM; ++d) { + control->fine_thread.step.v[d] = vect_size[d]; + control->fine.step.v[d] = + alignup(smt_size[d], control->fine_thread.step.v[d]); + control->coarse.step.v[d] = + alignup(tile_size[d], control->fine.step.v[d]); + control->thread.step.v[d] = + alignup(loop_size[d], control->coarse.step.v[d]); + } + } else { + CCTK_WARN(CCTK_WARN_ABORT, "internal error"); + } + + if (veryverbose) { +#pragma omp master + CCTK_VInfo(CCTK_THORNSTRING, + "Loop %s (%s:%d): imin=[%td,%td,%td] imax=[%td,%td,%td]\n" + " smt.step=[%td,%td,%td] fine.step=[%td,%td,%td] coarse.step=[%td,%td,%td] thread.step=[%td,%td,%td]", + stats->name, stats->file, stats->line, + control->loop.min.v[0], control->loop.min.v[1], control->loop.min.v[2], + control->loop.max.v[0], control->loop.max.v[1], control->loop.max.v[2], + control->fine_thread.step.v[0], control->fine_thread.step.v[1], control->fine_thread.step.v[2], + control->fine.step.v[0], control->fine.step.v[1], control->fine.step.v[2], + control->coarse.step.v[0], control->coarse.step.v[1], control->coarse.step.v[2], + control->thread.step.v[0], control->thread.step.v[1], control->thread.step.v[2]); + } + + // Initialise selftest + if (selftest) { + unsigned char* restrict selftest_array; +#pragma omp single copyprivate(selftest_array) + { + ptrdiff_t const npoints = prod(control->ash); + selftest_array = new unsigned char[npoints]; + memset(selftest_array, 0, npoints * sizeof *selftest_array); + } + control->selftest_array = selftest_array; + } else { + control->selftest_array = NULL; + } +} + +void lc_control_finish(lc_control_t* restrict const control, + lc_stats_t* restrict const stats) +{ + DECLARE_CCTK_PARAMETERS; + +#pragma omp barrier + + // Finish selftest + if (selftest) { + assert(control->selftest_array); +#pragma omp barrier +#pragma omp single nowait + { + ptrdiff_t nfailed = 0; + for (ptrdiff_t k=0; k<control->ash.v[2]; ++k) { + for (ptrdiff_t j=0; j<control->ash.v[1]; ++j) { + for (ptrdiff_t i=0; i<control->ash.v[0]; ++i) { + bool const inside = + i >= control->loop.min.v[0] and i < control->loop.max.v[0] and + j >= control->loop.min.v[1] and j < control->loop.max.v[1] and + k >= control->loop.min.v[2] and k < control->loop.max.v[2]; + ptrdiff_t const ipos = ind(control->ash, i,j,k); + nfailed += control->selftest_array[ipos] != inside; + } + } + } + if (nfailed > 0) { + CCTK_VWarn(CCTK_WARN_ABORT, __LINE__, __FILE__, CCTK_THORNSTRING, + "LoopControl self-test failed"); + } + delete[] control->selftest_array; + } + control->selftest_array = NULL; + } + + // Collect statistics +#pragma omp barrier +#pragma omp master + { + ticks const end_time = getticks(); + double const elapsed_time = + seconds_per_tick() * elapsed(end_time, stats->start_time); + ptrdiff_t npoints = 1; + for (int d=0; d<LC_DIM; ++d) { + npoints *= control->loop.max.v[d] - control->loop.min.v[d]; + } + if (stats->init_count < 1) { + // Skip the first iteration + ++stats->init_count; + if (veryverbose) { + double const time_point = + elapsed_time * omp_get_num_threads() / npoints; + CCTK_VInfo(CCTK_THORNSTRING, + "Loop %s: time=%g, time/point=%g s", + stats->name, elapsed_time, time_point); + } + } else { + stats->points += double(npoints); + stats->threads += double(omp_get_num_threads()); + stats->count += 1.0; + stats->sum += elapsed_time; + stats->sum2 += pow(elapsed_time, 2.0); + stats->min = fmin(stats->min, elapsed_time); + stats->max = fmax(stats->max, elapsed_time); + if (veryverbose) { + double const avg_thread = stats->sum / stats->count; + double const avg_point = + stats->sum * stats->threads / (stats->count * stats->points); + CCTK_VInfo(CCTK_THORNSTRING, + "Loop %s: count=%g, avg/thread=%g s, avg/point=%g s", + stats->name, stats->count, avg_thread, avg_point); + } + } + } + + // Tear down multithreading state +#pragma omp single nowait + { + delete control->thread_info_ptr; + } + control->thread_info_ptr = NULL; + if (get_fine_thread_num() == 0) { + delete control->fine_thread_comm_ptr; + } + control->fine_thread_comm_ptr = NULL; +} + + + +void lc_thread_init(lc_control_t* restrict const control) +{ + space_set_count(control->thread); +#pragma omp single + { + control->thread_info_ptr->idx = get_num_coarse_threads(); + } + control->thread_done = + space_global2local(control->thread, get_coarse_thread_num()); + space_idx2pos(control->thread); +} + +int lc_thread_done(lc_control_t const* restrict const control) +{ + return control->thread_done; +} + +void lc_thread_step(lc_control_t* restrict const control) +{ + // Get next thread block + int new_global_idx; + if (get_fine_thread_num() == 0) { +#pragma omp critical(LoopControl_lc_thread_step) + { + new_global_idx = control->thread_info_ptr->idx++; + } + } + new_global_idx = + fine_thread_broadcast(control->fine_thread_comm_ptr, new_global_idx); + control->thread_done = space_global2local(control->thread, new_global_idx); + space_idx2pos(control->thread); +} + + + +void lc_selftest_set(lc_control_t const* restrict control, + ptrdiff_t const lmin, ptrdiff_t const lmax, + ptrdiff_t const imin, ptrdiff_t const imax, + ptrdiff_t const di, + ptrdiff_t const i0, ptrdiff_t const j, ptrdiff_t const k) +{ + DECLARE_CCTK_PARAMETERS; + assert(selftest); + assert(imin>=0 and imin<imax and imax<=control->ash.v[0]); + assert(di>0); + assert(j>=0 and j<control->ash.v[1]); + assert(k>=0 and k<control->ash.v[2]); + assert(i0+di-1>=control->loop.min.v[0] and i0<control->loop.max.v[0]); + if (imin>lmin) { + ptrdiff_t const ipos_imin = ind(control->ash, imin,j,k); + assert(ipos_imin % di == 0); + } + if (imax<lmax) { + ptrdiff_t const ipos_imax = ind(control->ash, imax,j,k); + assert(ipos_imax % di == 0); + } + assert(j>=control->loop.min.v[1] and j<control->loop.max.v[1]); + assert(k>=control->loop.min.v[2] and k<control->loop.max.v[2]); + for (ptrdiff_t i=i0; i<i0+di; ++i) { + if (i>=imin and i<imax) { + assert(i>=0 and i<control->ash.v[0]); + assert(i>=control->loop.min.v[0] and i<control->loop.max.v[0]); + ptrdiff_t const ipos = ind(control->ash, i,j,k); + unsigned char& elt = control->selftest_array[ipos]; +#pragma omp atomic + ++elt; + if (elt!=1) { +#pragma omp critical + { + fprintf(stderr, + "thread=%d/%d fine_thread=%d/%d ijk=[%td,%td,%td]\n", + get_coarse_thread_num(), get_num_coarse_threads(), + get_fine_thread_num(), get_num_fine_threads(), + i,j,k); + assert(elt==1); + } + } + } + } +} + + + +int lc_setup(void) +{ + check_fortran_type_sizes(); + return 0; +} + + + +void lc_statistics(CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + CCTK_INFO("LoopControl statistics:"); + for (list<lc_stats_t*>::const_iterator + istats = all_stats.begin(); istats != all_stats.end(); ++istats) + { + lc_stats_t const* restrict const stats = *istats; + if (stats->count == 0.0) { + printf(" Loop %s (%s:%d):\n", + stats->name, stats->file, stats->line); + } else { + double const avg_thread = stats->sum / stats->count; + double const avg_point = + stats->sum * stats->threads / (stats->count * stats->points); + printf(" Loop %s (%s:%d): count=%g, avg/thread=%g s, avg/point=%g s\n", + stats->name, stats->file, stats->line, + stats->count, avg_thread, avg_point); + } + } +} + +void lc_statistics_maybe(CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + if (not verbose) return; + + lc_statistics(CCTK_PASS_CTOC); +} |