about summary refs log tree commit diff
diff options
context:
space:
mode:
author Erik Schnetter <schnetter@gmail.com> 2013-01-16 14:43:47 -0500
committer Erik Schnetter <schnetter@gmail.com> 2013-01-16 14:43:47 -0500
commit 1cddd960d62da42ccd111022f1326740f688b48d (patch)
tree 5c0c591e8fca50868ae5f80b703c4422eef3df30
parent f07af2a351c2096f84487b13c114a1925755eafb (diff)
CarpetLib: Rewrite array padding
Obtain cache information from thorn hwloc. Align allocated memory manually if operator new returns unaligned memory.
-rw-r--r-- Carpet/CarpetLib/interface.ccl 7
-rw-r--r-- Carpet/CarpetLib/param.ccl 95
-rw-r--r-- Carpet/CarpetLib/src/cacheinfo.cc 188
-rw-r--r-- Carpet/CarpetLib/src/cacheinfo.hh 95
-rw-r--r-- Carpet/CarpetLib/src/defs.hh 34
-rw-r--r-- Carpet/CarpetLib/src/mem.cc 60
-rw-r--r-- Carpet/CarpetLib/src/memstat.cc 4
7 files changed, 173 insertions, 310 deletions
diff --git a/Carpet/CarpetLib/interface.ccl b/Carpet/CarpetLib/interface.ccl
index 0a2086c4f..d64b2c782 100644
--- a/Carpet/CarpetLib/interface.ccl
+++ b/Carpet/CarpetLib/interface.ccl
@@ -77,3 +77,10 @@ CCTK_INT FUNCTION GetBoundarySpecification \
CCTK_INT OUT ARRAY is_staggered, \
CCTK_INT OUT ARRAY shiftout)
USES FUNCTION GetBoundarySpecification
+
+
+
+CCTK_INT FUNCTION GetCacheInfo1(CCTK_INT ARRAY OUT linesizes, \
+ CCTK_INT ARRAY OUT strides, \
+ CCTK_INT IN max_num_cache_levels)
+USES FUNCTION GetCacheInfo1
diff --git a/Carpet/CarpetLib/param.ccl b/Carpet/CarpetLib/param.ccl
index c13d54ba1..6adccca98 100644
--- a/Carpet/CarpetLib/param.ccl
+++ b/Carpet/CarpetLib/param.ccl
@@ -211,100 +211,9 @@ BOOLEAN use_mpi_ssend "Use MPI_Ssend instead of MPI_Isend" STEERABLE=always
-# Memory and cache information -- this is machine dependent and should
-# be determined at run time or set via simfactory
-
-CCTK_INT vector_size "vector size" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-
-CCTK_INT D1size "level 1 data cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT D1linesize "level 1 data cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT D1assoc "level 1 data cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-
-CCTK_INT L2size "level 2 unified cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT L2linesize "level 2 unified cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT L2assoc "level 2 unified cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-
-CCTK_INT L3size "level 3 unified cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT L3linesize "level 3 unified cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT L3assoc "level 3 unified cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-
-CCTK_INT TLB_D1entries "level 1 TLB cache" STEERABLE=recover
+BOOLEAN pad_to_cachelines "Pad arrays to the cache line size (only when VECTORISE_ALIGNED_ARRAYS is set)" STEERABLE=recover
{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT TLB_D1pagesize "level 1 TLB cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT TLB_D1assoc "level 1 TLB cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-
-CCTK_INT TLB_L2entries "level 2 TLB cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT TLB_L2pagesize "level 2 TLB cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-CCTK_INT TLB_L2assoc "level 2 TLB cache" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
-
-CCTK_INT pagesize "size of a memory page" STEERABLE=recover
-{
- 0 :: "unknown"
- 1:* :: ""
-} 0
+} "yes"
diff --git a/Carpet/CarpetLib/src/cacheinfo.cc b/Carpet/CarpetLib/src/cacheinfo.cc
index 897cb533e..76640d7c2 100644
--- a/Carpet/CarpetLib/src/cacheinfo.cc
+++ b/Carpet/CarpetLib/src/cacheinfo.cc
@@ -8,146 +8,130 @@
template<int D>
vect<int,D>
-pad_shape (bbox<int,D> const& extent)
+pad_shape(bbox<int,D> const& extent)  // bbox overload: pads the extent's shape divided by its stride
{
- assert (all (extent.shape() >= 0));
+ assert(all(extent.shape() >= 0));  // precondition: no negative shape component in any direction
return pad_shape(extent.shape() / extent.stride());  // presumably resolves to the vect<int,D> overload, which does the padding
}
+namespace {  // file-local cache description; at namespace scope, so one copy serves every pad_shape instantiation
+ struct cache_info_t {  // one entry per cache level, filled by pad_shape from GetCacheInfo1
+ int linesize;  // pad_shape divides this by sizeof(CCTK_REAL), so presumably a size in bytes
+ int stride;  // handled like linesize; pad_shape only avoids multiples of it when it is > 0
+ };
+ bool have_cache_info = false;  // set to true by pad_shape once its one-time query has run
+ vector<cache_info_t> cache_info;  // stays empty when GetCacheInfo1 is not aliased
+}
+
template<int D>
vect<int,D>
-pad_shape (vect<int,D> const& shape)
+pad_shape(vect<int,D> const& shape)
{
DECLARE_CCTK_PARAMETERS;
- assert (all(shape>=0));
+ assert(all(shape>=0));
- static bool have_cacheinfo = false;
- static vector<cacheinfo_t> cacheinfo;
- if (not have_cacheinfo) {
- // Ignore L1 caches that are probably too small to be useful (e.g.
- // on Intel or AMD processors)
- // TODO: make this a parameter
- if (D1size >= 128*1024) {
- cacheinfo.push_back(cacheinfo_t(D1size, D1linesize, D1assoc));
- }
-#if 0
- // TODO: this is too simplistic:
- // Add page size as a cache
- if (pagesize>0) {
- cacheinfo.push_back(cacheinfo_t(pagesize));
- }
-#endif
- if (L2size>0) {
- cacheinfo.push_back(cacheinfo_t(L2size, L2linesize, L2assoc));
- }
- if (L3size>0) {
- cacheinfo.push_back(cacheinfo_t(L3size, L3linesize, L3assoc));
- }
- if (TLB_D1entries>0) {
- ptrdiff_t const TLB_D1size = TLB_D1entries * TLB_D1pagesize * TLB_D1assoc;
- cacheinfo.push_back(cacheinfo_t(TLB_D1size, TLB_D1pagesize, TLB_D1assoc));
- }
- if (TLB_L2entries>0) {
- ptrdiff_t const TLB_L2size = TLB_L2entries * TLB_L2pagesize * TLB_L2assoc;
- cacheinfo.push_back(cacheinfo_t(TLB_L2size, TLB_L2pagesize, TLB_L2assoc));
- }
-
- // TODO: sort caches by their sizes
- for (size_t n=0; n<cacheinfo.size(); ++n) {
- cacheinfo_t const& ci = cacheinfo.at(n);
- if (n>0) {
- // Ensure that the cache size is larger than the next lower
- // cache size
- assert (ci.size() > cacheinfo.at(n-1).size());
- // Ensure that the cache line size is evenly divided by the
- // next lower cache line size
- assert (ci.linesize() % cacheinfo.at(n-1).linesize() == 0);
- assert (ci.stride() > cacheinfo.at(n-1).stride());
+ // Don't pad empty arrays; we don't want to handle all the special
+ // cases for this below
+ if (any(shape==0)) return shape;
+
+ if (CCTK_BUILTIN_EXPECT(not have_cache_info, false)) {
+#pragma omp barrier
+#pragma omp master
+ {
+ if (CCTK_IsFunctionAliased("GetCacheInfo1")) {
+ int const num_levels = GetCacheInfo1(NULL, NULL, 0);
+ vector<int> linesizes(num_levels);
+ vector<int> strides (num_levels);
+ GetCacheInfo1(&linesizes[0], &strides[0], num_levels);
+ cache_info.resize(num_levels);
+ for (int level=0; level<num_levels; ++level) {
+ cache_info[level].linesize = linesizes[level];
+ cache_info[level].stride = strides [level];
+ }
}
- } // for cacheinfo
-
- have_cacheinfo = true;
- } // if not have_cacheinfo
+ have_cache_info = true;
+ }
+#pragma omp barrier
+ }
vect<int,D> padded_shape;
int accumulated_npoints = 1;
for (int d=0; d<D; ++d) {
int npoints = shape[d];
- if (d == 0) {
#if VECTORISE && VECTORISE_ALIGNED_ARRAYS
+ if (d == 0) {
// Pad array to a multiple of the vector size. Note that this is
// a hard requirement, so that we can emit aligned load/store
// operations.
- npoints = align_up (npoints, CCTK_REAL_VEC_SIZE);
-#endif
- if (vector_size > 0) {
- npoints = align_up (npoints, vector_size);
- }
+ npoints = align_up(npoints, CCTK_REAL_VEC_SIZE);
}
- for (size_t n=0; n<cacheinfo.size(); ++n) {
- cacheinfo_t const& ci = cacheinfo.at(n);
-
- // Pad array in this direction to a multiple of this cache line
- // size
- assert (ci.linesize() % sizeof(CCTK_REAL) == 0);
- int const linesize = ci.linesize() / sizeof(CCTK_REAL);
- assert (is_power_of_2(linesize));
- if (npoints * accumulated_npoints >= linesize) {
- // The extent is at least one cache line long: round up to the
- // next full cache line
- npoints = align_up (npoints, linesize);
- } else {
-#if 0
- // The extent is less than one cache line long: Ensure that
- // the array size divides the cache line size evenly by
- // rounding to the next power of 2
- // NOTE: This is disabled, since this would align everything
- // to powers of 2.
- npoints = next_power_of_2(npoints);
+ if (pad_to_cachelines) {
+ for (size_t cache_level=0; cache_level<cache_info.size(); ++cache_level) {
+ // Pad array in this direction to a multiple of this cache
+ // line size
+ int const cache_linesize = cache_info[cache_level].linesize;
+ int const cache_stride = cache_info[cache_level].stride;
+
+ assert(cache_linesize % sizeof(CCTK_REAL) == 0);
+ int const linesize = cache_linesize / sizeof(CCTK_REAL);
+ assert(is_power_of_2(linesize));
+ if (npoints * accumulated_npoints < linesize) {
+ // The extent is less than one cache line long: Ensure that
+ // the array size divides the cache line size evenly by
+ // rounding to the next power of 2
+ npoints = next_power_of_2(npoints);
+ } else {
+ // The extent is at least one cache line long: round up to
+ // the next full cache line
+ int total_npoints = npoints * accumulated_npoints;
+ total_npoints = align_up(total_npoints, linesize);
+ assert(total_npoints % accumulated_npoints == 0);
+ npoints = total_npoints / accumulated_npoints;
+ }
+
+ // Avoid multiples of the cache stride
+ if (cache_stride > 0) {
+ assert(cache_stride % sizeof(CCTK_REAL) == 0);
+ int const stride = cache_stride / sizeof(CCTK_REAL);
+ if (npoints * accumulated_npoints % stride == 0) {
+ assert(stride > linesize);
+ int total_npoints = npoints * accumulated_npoints;
+ total_npoints += max(linesize, accumulated_npoints);
+ assert(total_npoints % accumulated_npoints == 0);
+ npoints = total_npoints / accumulated_npoints;
+ }
+ }
+ } // for cache_level
+ } // if pad_to_cachelines
#endif
- }
-
- // Avoid multiples of the cache stride
- assert (ci.stride() % sizeof(CCTK_REAL) == 0);
- int const stride = ci.stride() / sizeof(CCTK_REAL);
- if (npoints * accumulated_npoints % stride == 0) {
- assert (linesize < stride);
- npoints += linesize;
- }
-
- } // for cacheinfo
padded_shape[d] = npoints;
accumulated_npoints *= npoints;
}
- assert (prod (padded_shape) == accumulated_npoints);
+ assert(prod(padded_shape) == accumulated_npoints);
// self-check
for (int d=0; d<D; ++d) {
- assert (padded_shape[d] >= shape[d]);
+ assert(padded_shape[d] >= shape[d]);
#if VECTORISE && VECTORISE_ALIGNED_ARRAYS
if (d == 0) {
- assert (padded_shape[d] % CCTK_REAL_VEC_SIZE == 0);
+ assert(padded_shape[d] % CCTK_REAL_VEC_SIZE == 0);
}
#endif
- if (vector_size > 0) {
- if (d == 0) {
- assert (padded_shape[d] % vector_size == 0);
- }
- }
-
- // TODO: add self-checks for the other requirements as well
}
+ // Safety check
+ assert(prod(padded_shape) <= 2 * prod(shape) + 1000);
+
if (verbose) {
ostringstream buf;
buf << "padding " << shape << " to " << padded_shape;
- CCTK_INFO (buf.str().c_str());
+ CCTK_INFO(buf.str().c_str());
}
return padded_shape;
@@ -155,8 +139,8 @@ pad_shape (vect<int,D> const& shape)
-template vect<int,3> pad_shape (bbox<int,3> const& extent);
-template vect<int,3> pad_shape (vect<int,3> const& shape);
+template vect<int,3> pad_shape(bbox<int,3> const& extent);  // explicit instantiation: D=3, bbox overload
+template vect<int,3> pad_shape(vect<int,3> const& shape);  // explicit instantiation: D=3, vect overload
-template vect<int,4> pad_shape (bbox<int,4> const& extent);
-template vect<int,4> pad_shape (vect<int,4> const& shape);
+template vect<int,4> pad_shape(bbox<int,4> const& extent);  // explicit instantiation: D=4, bbox overload
+template vect<int,4> pad_shape(vect<int,4> const& shape);  // explicit instantiation: D=4, vect overload
diff --git a/Carpet/CarpetLib/src/cacheinfo.hh b/Carpet/CarpetLib/src/cacheinfo.hh
index 14b464a86..f72bec144 100644
--- a/Carpet/CarpetLib/src/cacheinfo.hh
+++ b/Carpet/CarpetLib/src/cacheinfo.hh
@@ -11,42 +11,6 @@
-static ptrdiff_t div_down (ptrdiff_t const x, ptrdiff_t const align)
- CCTK_ATTRIBUTE_UNUSED;
-static ptrdiff_t div_down (ptrdiff_t const x, ptrdiff_t const align)
-{
- assert (x >= 0);
- assert (align > 0);
- return x / align;
-}
-
-static ptrdiff_t div_up (ptrdiff_t const x, ptrdiff_t const align)
- CCTK_ATTRIBUTE_UNUSED;
-static ptrdiff_t div_up (ptrdiff_t const x, ptrdiff_t const align)
-{
- assert (x >= 0);
- assert (align > 0);
- return (x + align - 1) / align;
-}
-
-static ptrdiff_t align_down (ptrdiff_t const x, ptrdiff_t const align)
- CCTK_ATTRIBUTE_UNUSED;
-static ptrdiff_t align_down (ptrdiff_t const x, ptrdiff_t const align)
-{
- assert (x >= 0);
- assert (align > 0);
- return div_down(x, align) * align;
-}
-
-static ptrdiff_t align_up (ptrdiff_t const x, ptrdiff_t const align)
- CCTK_ATTRIBUTE_UNUSED;
-static ptrdiff_t align_up (ptrdiff_t const x, ptrdiff_t const align)
-{
- assert (x >= 0);
- assert (align > 0);
- return div_up(x, align) * align;
-}
-
static ptrdiff_t next_power_of_2 (ptrdiff_t const x)
CCTK_ATTRIBUTE_UNUSED;
static ptrdiff_t next_power_of_2 (ptrdiff_t const x)
@@ -90,65 +54,6 @@ static ptrdiff_t lcm (ptrdiff_t const x, ptrdiff_t const y)
-class cacheinfo_t {
- ptrdiff_t m_size; // bytes
- ptrdiff_t m_linesize; // bytes (pagesize for TLBs)
- ptrdiff_t m_associativity;
-public:
- bool invariant () const
- {
- return
- is_power_of_2(m_size) and
- is_power_of_2(m_linesize) and
- is_power_of_2(m_associativity) and
- m_size % (m_linesize * m_associativity) == 0;
- }
- cacheinfo_t (ptrdiff_t const a_size,
- ptrdiff_t const a_linesize,
- ptrdiff_t const a_associativity)
- : m_size (a_size),
- m_linesize (a_linesize),
- m_associativity (a_associativity)
- {
- assert (invariant());
- }
- cacheinfo_t (ptrdiff_t const a_linesize)
- : m_size (previous_power_of_2(numeric_limits<ptrdiff_t>::max())),
- m_linesize (a_linesize),
- m_associativity (1)
- {
- assert (invariant());
- }
- // size in bytes
- ptrdiff_t size() const
- {
- return m_size;
- }
- // line size in bytes
- ptrdiff_t linesize() const
- {
- return m_linesize;
- }
- // associativity
- ptrdiff_t associativity() const
- {
- return m_associativity;
- }
- // number of cache elements
- ptrdiff_t num_elements() const
- {
- return size() / (linesize() * associativity());
- }
- // stride (between main memory locations that use the same cache
- // element) in bytes
- ptrdiff_t stride() const
- {
- return num_elements() * linesize();
- }
-};
-
-
-
// These routines are apparently not pure -- don't know why
template<int D>
vect<int,D> pad_shape (bbox<int,D> const& extent) /*CCTK_ATTRIBUTE_PURE*/;
diff --git a/Carpet/CarpetLib/src/defs.hh b/Carpet/CarpetLib/src/defs.hh
index f41b19d86..02e36330e 100644
--- a/Carpet/CarpetLib/src/defs.hh
+++ b/Carpet/CarpetLib/src/defs.hh
@@ -113,6 +113,40 @@ enum centering { error_centered, vertex_centered, cell_centered };
+template<typename T>  // div_down: quotient of x by align
+inline T div_down(T const x, T const align)
+{
+ assert(x >= 0);  // negative numerators are rejected
+ assert(align > 0);  // divisor must be positive
+ return x / align;  // for integer T this truncates, which rounds down because x is non-negative
+}
+
+template<typename T>  // div_up: quotient of x by align, rounded up when T is an integer type
+inline T div_up(T const x, T const align)
+{
+ assert(x >= 0);  // negative numerators are rejected
+ assert(align > 0);  // divisor must be positive
+ return (x + align - 1) / align;  // NOTE(review): x + align - 1 can overflow when x is within align-1 of T's maximum
+}
+
+template<typename T>  // align_down: x rounded down to a multiple of align (integer T)
+inline T align_down(T const x, T const align)
+{
+ assert(x >= 0);  // same preconditions as div_down, checked here as well
+ assert(align > 0);
+ return div_down(x, align) * align;  // whole multiples of align that fit into x, scaled back up
+}
+
+template<typename T>  // align_up: x rounded up to a multiple of align (integer T)
+inline T align_up(T const x, T const align)
+{
+ assert(x >= 0);  // NOTE(review): always true for unsigned T; mem.cc calls this with size_t arguments
+ assert(align > 0);
+ return div_up(x, align) * align;  // number of align-sized units needed to cover x, scaled back up
+}
+
+
+
// Useful helper: returns the square of x, computed as x*x
template<class T>
inline T square (const T x) { return x*x; }
diff --git a/Carpet/CarpetLib/src/mem.cc b/Carpet/CarpetLib/src/mem.cc
index ab4bb1222..89d8fb211 100644
--- a/Carpet/CarpetLib/src/mem.cc
+++ b/Carpet/CarpetLib/src/mem.cc
@@ -39,6 +39,37 @@ double gmem::max_allocated_objects = 0;
+namespace {  // file-local helpers for the aligned allocation in the mem constructor
+ size_t get_max_cache_linesize()  // largest line size reported by GetCacheInfo1, or 1 if that function is not aliased; computed once
+ {
+ static size_t max_cache_linesize = 0;  // 0 means "not determined yet"
+ if (CCTK_BUILTIN_EXPECT(max_cache_linesize==0, false)) {  // NOTE(review): both barriers sit inside this branch, so all threads of an enclosing OpenMP team must enter it together -- confirm callers guarantee this
+#pragma omp barrier
+#pragma omp master
+ {
+ max_cache_linesize = 1;  // fallback used when no cache information is available
+ if (CCTK_IsFunctionAliased("GetCacheInfo1")) {
+ int const num_levels = GetCacheInfo1(NULL, NULL, 0);  // query with no buffers; the result is used as the number of cache levels
+ vector<int> linesizes(num_levels);
+ vector<int> strides (num_levels);  // filled by the call below but not used in this function
+ GetCacheInfo1(&linesizes[0], &strides[0], num_levels);  // NOTE(review): &v[0] is taken on an empty vector if num_levels == 0 -- confirm the provider never reports zero levels
+ for (int level=0; level<num_levels; ++level) {
+ max_cache_linesize =
+ max(max_cache_linesize, size_t(linesizes[level]));  // keep the running maximum over all levels
+ }
+ }
+ }
+#pragma omp barrier
+ }
+ assert(max_cache_linesize>0);  // holds after the block above, which assigns at least 1
+ return max_cache_linesize;
+ }
+
+ bool need_alignment = false;  // the mem constructor sets this once operator new returns a misaligned pointer, then aligns manually from there on
+}
+
+
+
// TODO: Make this a plain class instead of a template
template<typename T>
@@ -71,14 +102,17 @@ mem (size_t const vectorlength, size_t const nelems,
}
try {
// TODO: use posix_memalign instead, if available
- size_t const alignment = CCTK_REAL_VEC_SIZE * sizeof(T);
+ size_t const max_cache_linesize = get_max_cache_linesize();
+ size_t const vector_size = CCTK_REAL_VEC_SIZE * sizeof(T);
+ size_t const alignment = align_up(max_cache_linesize, vector_size);
+ // Safety check
+ assert(alignment <= 1024);
// Assume optimistically that operator new returns well-aligned
// pointers
- static bool need_alignment = false;
if (not need_alignment) {
// Operator new works fine; just call it
storage_base_ = new T [vectorlength * nelems];
- need_alignment = size_t (storage_base_) & (alignment-1);
+ need_alignment = size_t(storage_base_) & (alignment-1);
if (need_alignment) {
// This pointer is no good; try again with manual alignment
delete [] storage_base_;
@@ -89,23 +123,13 @@ mem (size_t const vectorlength, size_t const nelems,
} else {
allocate_with_alignment:
// Operator new needs manual alignment
- size_t const max_padding = CCTK_REAL_VEC_SIZE - 1;
+ size_t const max_padding = alignment / sizeof(T) - 1;
storage_base_ = new T [vectorlength * nelems + max_padding];
- storage_ = (T*) (size_t (storage_base_ + max_padding) & -alignment);
-#warning "TODO"
- if (not (storage_ >= storage_base_ and
- storage_ <= storage_base_ + max_padding)) {
- cerr << "alignment=" << alignment << "\n"
- << "max_padding=" << max_padding << "\n"
- << "vectorlength=" << vectorlength << "\n"
- << "nelems=" << nelems << "\n"
- << "storage_base_=" << storage_base_ << "\n"
- << "storage_=" << storage_ << "\n";
- }
- assert(storage_ >= storage_base_ and
- storage_ <= storage_base_ + max_padding);
+ storage_ = (T*) (size_t(storage_base_ + max_padding) & ~(alignment-1));
+ assert(size_t(storage_) >= size_t(storage_base_ ) and
+ size_t(storage_) <= size_t(storage_base_ + max_padding));
}
- assert (not (size_t (storage_) & (alignment-1)));
+ assert(not (size_t(storage_) & (alignment-1)));
owns_storage_ = true;
} catch (...) {
T Tdummy;
diff --git a/Carpet/CarpetLib/src/memstat.cc b/Carpet/CarpetLib/src/memstat.cc
index 25a19d95c..714278890 100644
--- a/Carpet/CarpetLib/src/memstat.cc
+++ b/Carpet/CarpetLib/src/memstat.cc
@@ -80,10 +80,10 @@ void CarpetLib_printmemstats (CCTK_ARGUMENTS)
#endif
cout << "Memory statistics from CarpetLib:" << eol
- << " Current number of objects: " << gmem::total_allocated_objects << eol
+ << " Current number of objects: " << size_t(gmem::total_allocated_objects) << eol
<< " Current allocated memory: "
<< setprecision(3) << gmem::total_allocated_bytes / gmem::MEGA << " MB" << eol
- << " Maximum number of objects: " << gmem::max_allocated_objects << eol
+ << " Maximum number of objects: " << size_t(gmem::max_allocated_objects) << eol
<< " Maximum allocated memory: "
<< setprecision(3) << gmem::max_allocated_bytes / gmem::MEGA << " MB" << eol
<< " Current administrative memory: "