1 files changed, 776 insertions, 0 deletions
diff --git a/Carpet/Carpet/src/LoadBalanceReal/carpet_boxtypes.F90 b/Carpet/Carpet/src/LoadBalanceReal/carpet_boxtypes.F90
new file mode 100644
index 000000000..a03d312c6
--- /dev/null
+++ b/Carpet/Carpet/src/LoadBalanceReal/carpet_boxtypes.F90
@@ -0,0 +1,776 @@
+#include "cctk.h"
+
+module carpet_boxtypes
+
+  use carpet_region
+  implicit none
+
+  integer :: ghostsize
+  real(wp) :: alpha
+  logical :: limit_size
+  integer :: granularity
+  integer :: granularity_boundary
+  integer :: procid
+
+  contains
+
+!   The basic calculation of the cost in each of the 3 directions.
+    subroutine cost_box ( reg, cost )
+      type(bbox), intent(in) :: reg
+      integer, dimension(3), intent(out) :: cost
+      integer :: i
+    
+      do i = 1, 3
+        cost(i) = (reg%dim(i)%upper -  reg%dim(i)%lower)/reg%dim(i)%stride + 1
+      end do
+    end subroutine cost_box
+
+
+!   Find a sorted list of the best options for splitting npoints in 2 chunks
+!   among nprocs processors. The fractional remainder is passed in through 
+!   fract. The maximum allowed number of options to return is nmax, and the
+!   actual number of returned options is nact. The result is returned in
+!   np (number of procs for the first chunk) and nc (a measure of the
+!   imabalance of the split.
+    subroutine choosenprocs ( npoints, nprocs, fract, nmax, nact, np, nc )
+      integer, intent(in) :: npoints, nmax
+      real(wp), intent(in) :: nprocs, fract
+      integer, intent(out) :: nact
+      integer, dimension(:), intent(out) :: np
+      real(wp), dimension(:), intent(out) :: nc
+      integer :: idealcost
+      integer :: quot, i, j, round, ind
+      real(wp) :: c1, c2, currentimbal
+      logical :: exchange
+
+!     The idealcost should be npoints/nprocs but I rescale with nprocs for
+!     The purpose of avoiding some problems with floating point comparisons.
+      idealcost = npoints
+
+!     Initialize the cost to the maximum possible floating point number.
+      nc = minval( empty )
+
+!     initialize the actual returned number of options to 0.
+      nact = 0
+
+!     Initialize an integer index to zero. Used to keep track where the current
+!     option should be inserted in the result vector.
+      ind = 1
+
+!     Calculate the starting number of processors to start the search with
+!     (quot) as the largest integer smaller than nprocs.
+      quot = floor(nprocs)
+!     If the fractional part we have to leave unsplit is larger than 1
+!     decrement quot by 1.
+      if ( fract > 1.0_wp ) quot = quot-1
+!     If nprocs has zero fractional part, we can start at quot/2 due to
+!     symmetry.
+      if ( .not. ( nprocs>quot ) ) then
+        quot = quot / 2
+      end if
+
+!     Try all options until reaching a single processor.
+      options: do i=quot, 1, -1
+
+!       Calculate the number of points that will yield the smallest imbalance.
+        round = nint(real(npoints,wp)/nprocs*i)
+
+!       Calculate the imbalance for the current choice of processors.
+!       This has been scaled by nprocs (like idealcost has been above).
+        c1 = abs ( (round*nprocs)/i - idealcost )
+        c2 = abs ( (npoints-round)/(nprocs-i)*nprocs - idealcost )
+        currentimbal = max ( c1, c2 )
+
+!       Initialize the boolean exchange that keeps track of whether the current
+!       choice needs to replace an entry in the return vectors.
+        exchange = .false.
+
+!       Start from the current number of entries in the return vector and check
+!       if the current choice is better than any of them. If so indicate that
+!       an exchange has to happen and record the index.
+        do j = nact, 1, -1
+          if ( currentimbal < nc(j) ) then
+            exchange = .true.
+            ind = j
+          end if
+        end do
+
+!       If an exchange has to happen, shift all elements with indices higher
+!       than or equal to ind to the right.
+        if (exchange) then
+          np(ind:nmax) = eoshift(np(ind:nmax),-1)
+          nc(ind:nmax) = eoshift(nc(ind:nmax),-1)
+        end if
+
+!       If we don't need to do an exchange, and if nact<nmax add the current
+!       case after the previous entries.
+        if (nact<nmax .and. .not.exchange) then
+          nact = nact+1
+          np(nact) = i
+          nc(nact) = currentimbal
+        end if
+
+!       If the return vectors are empty or an exhange has to be done, insert
+!       the current choice at the appropriate position (ind) and increment the
+!       number of elements in the return vector (capped at nmax)
+        if (nact == 0 .or. exchange) then
+          nact = min(nact+1,nmax)
+          np(ind) = i
+          nc(ind) = currentimbal
+        end if
+      end do options
+
+!     This is commented out for now. Used to return only the elements that are
+!     as good as the best case.
+!      nbad = nact + 1
+!      do i = 2, nact
+!        if ( abs(nc(i)-nc(i-1))>1.0d-10 ) nbad = i
+!      end do
+!      nact = nbad - 1
+    end subroutine choosenprocs
+
+
+!   Calculate a measure of the load imbalance of a super region.
+!   The measure is a weighted sum taking into account the imbalance in the
+!   number of points (computation) and the imbalance in the number of ghost
+!   zones (communication). Ghost zones are added with a multiplication factor
+!   of alpha.
+    subroutine calc_imbalance ( sregion, imbalance )
+!     The intent has been removed to make it compile with gfortran 4.1.
+!      type(superregion2), pointer, intent(in) :: sregion
+      type(superregion2), pointer :: sregion
+      real(wp), intent(out) :: imbalance
+      integer, dimension(3) :: cost
+      real(wp) :: maxcost, maxgcost, n
+      real(wp) :: idealcost, idealgcost
+
+!     Initialize counter and accumulation variables
+      n = 0.0_wp
+      maxcost = 0.0_wp
+      idealcost = 0.0_wp
+      maxgcost = 0.0_wp
+      idealgcost = 0.0_wp
+
+!     Traverse the tree structure and add values for all leaves.
+      call traverse_tree ( sregion )
+
+!     Calculate the ideal cost
+      idealcost = idealcost/n
+      idealgcost = idealgcost/n
+
+!     Calculate the imbalance
+      imbalance = ( maxcost/idealcost - 1.0_wp ) &
+                  + alpha * ( maxgcost/idealgcost - 1.0_wp )
+
+      contains
+
+!       Routine to traverse the tree and accumulate the cost of the leaves.
+        recursive subroutine traverse_tree ( sreg )
+!         The intent has been removed to make it compile with gfortran 4.1.
+!          type(superregion2), pointer, intent(in) :: sreg
+          type(superregion2), pointer :: sreg
+          integer :: ich, nch, tcost, gcost
+
+!         If the region has children recursively traverse the children.
+          if ( associated(sreg%children) ) then
+            nch = size(sreg%children)
+            do ich = 1, nch
+              call traverse_tree ( sreg%children(ich)%point )
+            end do
+
+!         Otherwise it's a leaf and we accumulate the cost and update the
+!         maximum cost.
+          else
+            call cost_box ( sreg%extent, cost )
+            tcost = product ( cost, mask = cost /= 0 )
+            gcost = product ( cost+2*ghostsize, mask = cost /= 0 ) - tcost
+            tcost = tcost
+            gcost = gcost
+            maxcost = max ( maxcost, tcost / sreg%frac )
+            maxgcost = max ( maxgcost, gcost / sreg%frac )
+            idealcost = idealcost + tcost
+            idealgcost = idealgcost + gcost
+            n = n + sreg%frac
+          end if
+
+        end subroutine traverse_tree
+
+    end subroutine calc_imbalance
+
+
+!   Assign processor numbers to the leaves in a tree starting from start_proc.
+    subroutine AssignProcs ( minproc, maxproc, frac1, frac2, sregion )
+      integer, intent(in) :: minproc, maxproc
+      real(wp ), intent(in) :: frac1, frac2
+      logical :: lower_is_outer, upper_is_outer
+      integer :: boundarysize, minsize
+!     The intent has been removed to make it compile with gfortran 4.1.
+!      type(superregion2), pointer, intent(inout) :: sregion
+      type(superregion2), pointer :: sregion
+      character :: msg*200
+
+!     Initialize.
+      if ( frac1 == 0 ) then
+        procid = minproc
+      else
+        procid = minproc + 1
+      end if
+
+!     Traverse the tree.
+      call traverse_treep ( sregion )
+      
+      contains
+
+!       Routine to traverse the tree and assign processor ids to the leaves.
+        recursive subroutine traverse_treep ( sreg )
+!          The intent has been removed to make it compile with gfortran 4.1.
+!          type(superregion2), pointer, intent(inout) :: sreg
+          type(superregion2), pointer :: sreg
+          integer :: ich, nch, maxcost, np
+          integer, dimension(3) :: cost
+          integer, dimension(1) :: mydim
+
+!         If the region has children recursively traverse the children.
+          if ( associated(sreg%children) ) then
+            nch = size(sreg%children)
+            do ich = 1, nch
+              call traverse_treep ( sreg%children(ich)%point )
+            end do
+
+!         Otherwise it's a leaf and we assign the processor id and increment it.
+          else
+            if ( sreg%frac == 1.0_wp ) then
+              sreg%processor = procid
+              procid = procid + 1
+            else
+              call cost_box ( sreg%extent, cost )
+              maxcost = maxval(cost)
+              mydim = maxloc (cost)
+              lower_is_outer = sreg%outer_boundaries%obound(mydim(1),1)/=0
+              upper_is_outer = sreg%outer_boundaries%obound(mydim(1),2)/=0
+              if (frac1 == 0.0_wp) then
+                np = 0
+              else if (frac2 == 0.0_wp) then
+                np = maxcost
+              else
+                np = nint(frac1/(frac1+frac2)*maxcost)
+              end if
+              if (limit_size) then
+                boundarysize = ghostsize
+                minsize = boundarysize+ghostsize
+                if (lower_is_outer .and. np>0 .and. np<minsize) then
+                  if (np<minsize/2) then
+                    np = 0
+                  else
+                    np = minsize
+                  end if
+                end if
+                if (upper_is_outer .and. &
+                    np<maxcost .and. np>maxcost-minsize) then
+                  if (np>maxcost-minsize/2) then
+                    np = maxcost
+                  else
+                    np = maxcost-minsize
+                  end if
+                end if
+              end if
+              if ((lower_is_outer .and. np>0 .and. np<minsize) .or. &
+                  (upper_is_outer .and. &
+                   np<maxcost .and. np>maxcost-minsize)) then
+                write (msg, '("Not enough grid points: np=",i4," gs=",i4, "maxcost=",i4)') &
+                       np, boundarysize, maxcost
+                call CCTK_WARN(CCTK_WARN_ABORT, msg)
+              end if
+              if (np>0 .and. np<maxcost) then
+                call split_sregion ( sreg, mydim(1), (/ np /) )
+                sreg%processor = -1
+                sreg%children(1)%point%processor = minproc
+                sreg%children(1)%point%frac = frac1
+                sreg%children(2)%point%processor = maxproc
+                sreg%children(2)%point%frac = frac2
+              else if (np==0) then
+                sreg%processor = maxproc
+              else if (np==maxcost) then
+                sreg%processor = minproc
+              end if
+              
+            end if
+          end if
+
+        end subroutine traverse_treep
+
+      end subroutine AssignProcs
+
+
+!   Routine to sum up the load of an array of super regions. Return the result
+!   in the pre-allocated array of length nprocs. The array should be
+!   initialized to zero.
+    subroutine calc_proc_load ( sregions, proc_load )
+      type(ptr), dimension(:), intent(in) :: sregions
+      integer, dimension(:), intent(inout) :: proc_load
+      integer :: i
+
+!     For each of the super regions in the array.
+      do i = 1, size(sregions)
+
+!       Traverse the tree and 
+        call traverse_treel ( sregions(i)%point )
+      end do
+
+      contains
+
+!       Routine to traverse the tree and add the load of each leaf to the
+!       assigned processor.
+        recursive subroutine traverse_treel ( sreg )
+!         The intent has been removed to make it compile with gfortran 4.1.
+!          type(superregion2), pointer, intent(in) :: sreg
+          type(superregion2), pointer :: sreg
+          integer, dimension(3) :: cost
+          integer :: ich, nch
+
+!         If the region has children recursively traverse the children.
+          if ( associated(sreg%children) ) then
+            nch = size(sreg%children)
+            do ich = 1, nch
+              call traverse_treel ( sreg%children(ich)%point )
+            end do
+
+!         Otherwise it's a leaf and we calculate the cost and accumulate it
+!         into the correct spot in the proc_load array.
+
+          else
+            call cost_box ( sreg%extent, cost )
+            proc_load(sreg%processor+1) = proc_load(sreg%processor+1) + &
+                                          product ( cost )
+          end if
+
+        end subroutine traverse_treel
+
+    end subroutine calc_proc_load
+
+
+!   Split a superregion (newreg) recursively into smaller boxes while trying
+!   to minimize the load imbalance. Here we allow for fractional processor
+!   numbers, nprocs. The fraction that should be left unsplit is given by
+!   fract (0.0<=fract<2). The inputs maxchoices and maxfactor are used to
+!   determine how many options to examine at a given recursion level. The
+!   variable clevel should be initialized to 0 and is used to keep track of
+!   the recursion level. The number of options testes is counter and returned
+!   in ncount.
+    recursive subroutine SplitRegionsRecursive ( nprocs, fract, maxchoices, &
+                                                 maxfactor, clevel, &
+                                                 newreg, ncount )
+      real(wp), intent(in) :: nprocs, fract
+      integer, intent(in) :: maxchoices, maxfactor
+      integer, intent(inout) :: clevel
+!     The intent has been removed to make it compile with gfortran 4.1.
+!      type(superregion2), pointer, intent(inout) :: newreg
+      type(superregion2), pointer :: newreg
+      integer, intent(inout) :: ncount
+      type(bbox) :: reg
+      integer, dimension(3) :: cost
+      integer :: maxcost, ngood, i, np1, lo1, up1, str
+      real(wp) :: p1, p2
+      integer, dimension(1) :: mydim
+      integer, dimension(maxchoices) :: npr
+      real(wp), dimension(maxchoices) :: ncr
+      type(ptr), dimension(maxchoices) :: newregarr
+      real(wp), dimension(maxchoices) :: imbalancearr
+      logical :: lower_is_outer, upper_is_outer
+      integer :: boundarysize, minsize
+      type(bbox) :: reg1, reg2
+      integer, dimension(1) :: minimbalanceloc
+      integer :: nlocal
+      character :: msg*200
+
+!     Extract the bounding box information
+      reg = newreg%extent
+
+!     Make sure the array with the test super regions are nullified.
+      do i = 1, maxchoices
+        nullify ( newregarr(i)%point )
+      end do
+
+!     Initialize the imbalance array to max floating point value.
+      imbalancearr(:) = minval(empty)
+
+!     Increment the recursion level variable.
+      clevel = clevel + 1
+
+!     Calculate the number of choices to examine at the current recursion
+!     level based on the value of maxchoices and maxfactor.
+      nlocal = max ( maxchoices - ( clevel - 1)  / maxfactor, 1 )
+
+!     If we are called with nprocs==max(1,fract) mark the region to be a leaf
+!     in the tree, increment the choice counter and decrement the recursion
+!     level counter and return.
+      if (nprocs <= max(1.0_wp,fract+0.000001_wp) ) then
+        ncount = ncount + 1
+        clevel = clevel - 1
+        newreg%processor = 1
+        newreg%frac = nprocs
+        return
+      end if
+
+!     Calculate the cost of the current region.
+      call cost_box ( reg, cost )
+
+!     Find the dimension with the most points.
+      maxcost = maxval(cost)
+      mydim = maxloc (cost)
+
+!     Get the list of chunk options to investigate.
+      call choosenprocs ( cost(mydim(1)), nprocs, fract, nlocal, ngood, &
+                                                                 npr, ncr )
+
+!     Loop over the chunk options to investigate.
+      chunks: do i=1, ngood
+
+!       Initialize the super region.
+        allocate ( newregarr(i)%point )
+        newregarr(i)%point = newreg
+
+!       Find the processor numbers of the two chunks (p1 always an integer).
+        p1 = npr(i)
+        p2 = nprocs - npr(i)
+
+!       Find the number of points in each of the 2 chunks while making
+!       sure the chunks are not too small compared to the ghostsize if
+!       limit_size is true.
+!       Take also the granularity into account.
+!       At the outer boundary, take also the boundary size into account,
+!       assuming that the boundary size is equal to the ghost size.
+!       (Boundary points cannot be split, and near a boundary, the
+!        minimum number of interior points is the number of ghost points.)
+        np1 = nint((real(maxcost,wp)*p1)/nprocs/granularity)*granularity
+!       TODO: Take granularity_boundary into account as well!
+!       TODO: Take granularity into account when limiting!
+        if (limit_size) then
+          lower_is_outer = &
+               newregarr(i)%point%outer_boundaries%obound(mydim(1),1)/=0
+          upper_is_outer = &
+               newregarr(i)%point%outer_boundaries%obound(mydim(1),2)/=0
+          boundarysize = ghostsize
+          minsize = boundarysize+ghostsize
+          if (lower_is_outer .and. np1>0 .and. np1<minsize) then
+            np1 = minsize
+          end if
+          if (upper_is_outer .and. np1<maxcost .and. np1>maxcost-minsize) then
+            np1 = maxcost-minsize
+          end if
+          if ((lower_is_outer .and. np1>0 .and. np1<minsize) .or. &
+              (upper_is_outer .and. np1<maxcost .and. np1>maxcost-minsize)) then
+            write (msg, '("Not enough grid points: np=",i4," gs=",i4, "maxcost=",i4)') &
+                   np1, boundarysize, maxcost
+            call CCTK_WARN(CCTK_WARN_ABORT, msg)
+          end if
+        end if
+
+!       Split the region (reg) into two regions (reg1, reg2) taking the
+!       stride into account.
+        reg1 = reg
+        reg2 = reg
+        lo1 = reg%dim(mydim(1))%lower
+        up1 = reg%dim(mydim(1))%upper
+        str = reg%dim(mydim(1))%stride
+        reg1%dim(mydim(1))%upper = lo1 + (np1 - 1)*str
+        reg2%dim(mydim(1))%lower = reg1%dim(mydim(1))%upper + str
+
+!       Add the new leaves to the tree.
+        call split_sregion ( newregarr(i)%point, mydim(1), (/ np1 /) )
+
+!       Split region reg1 among p1 processors and region reg2 among p2
+!       processors adding the new regions to the super region.
+        call SplitRegionsRecursive ( p1, fract, maxchoices, maxfactor, &
+                                     clevel, &
+                                     newregarr(i)%point%children(1)%point, &
+                                     ncount )
+        call SplitRegionsRecursive ( p2, fract, maxchoices, maxfactor, &
+                                     clevel, &
+                                     newregarr(i)%point%children(2)%point, &
+                                     ncount )
+
+!       Calculate the load imbalance of the resulting super region.
+        call calc_imbalance ( newregarr(i)%point, imbalancearr(i) )
+
+!       If the balance is perfect exit the do loop and don't do any further
+!       tests.
+        if ( imbalancearr(i) == 0.0_wp ) then
+          ngood = i
+          exit
+        end if
+      end do chunks
+
+!     Find the location of the minimum load imbalance.
+      minimbalanceloc = minloc ( imbalancearr(1:ngood) )      
+
+!     Insert the best choice into the tree structure      
+      call point_to_children ( newreg,  newregarr(minimbalanceloc(1))%point )
+
+!     Cut the connection between the best choice and its children.
+      call disassociate ( newregarr(minimbalanceloc(1))%point )
+
+!     Deallocate all the tested options.
+      do i = 1, ngood
+        call destroy_sregion ( newregarr(i)%point )
+      end do
+
+!     Decrement the recursion level counter.
+      clevel = clevel - 1
+
+    end subroutine SplitRegionsRecursive
+
+
+!   Routine to return the union of 2 pre-sorted arrays of integers.
+!   only n1 elements of list1 and n2 elements of list2 are actually used.
+!   The number of elements in the union is n. The union is returned in list.
+    subroutine union (n1, list1, n2, list2, n, list )
+      integer, intent(in) :: n1, n2
+      integer, dimension(:), intent(in) :: list1, list2
+      integer, intent(out) :: n
+      integer, dimension(:), intent(out) :: list
+      integer :: j, k, newl
+
+!     Initialize n and list to 0. The integers j and k are used to march through
+!     the 2 input arrays and are initialized to 1 if the lists have some
+!     valid elements and 0 otherwise.
+      n = 0; list = 0; j = min(1,n1); k = min(1,n2)
+
+!     Initialize the output array to 0
+      list = 0
+
+!     If both input arrays are 0, so are the output array and we are done.
+      if (n1==0 .and. n2==0) then
+        return
+      end if
+
+!     Repeat until j>n1 and k>n2.
+      loop: do
+
+!       If there are still valid elements in both list1 and list2...
+        lists: if ( (j>0 .and. j<=n1) .and. (k>0 .and. k<=n2) ) then
+
+!         The next element to add to list is the minimum of the current values
+!         in list1 and list2
+          newl = min ( list1(j), list2(k) )
+
+!         If the element is not already in the output list....
+          both: if ( n > 0 ) then
+            if ( newl > list(n) ) then
+
+!             Add the element
+              n = n + 1
+              list(n) = newl
+
+!             If the added element comes from list1 increment j
+              if ( list1(j) == list(n) ) j = j+1
+
+!             If the added element comes from list2 increment j
+              if ( list2(k) == list(n) ) k = k+1
+
+            end if
+
+!         Or if the output list is empty
+          else both
+
+!           Add the element
+            n = n + 1
+            list(n) = newl
+
+!           If the added element comes from list1 increment j
+            if ( list1(j) == list(n) ) j = j+1
+
+!           If the added element comes from list2 increment j
+            if ( list2(k) == list(n) ) k = k+1
+
+          end if both
+
+!       If there there is only valid elements left in list1
+        else if (j>0 .and. j<=n1) then lists
+
+!         Add the current element of list1 to list if it is not already there
+!         or list is empty and increment j.
+          newl = list1(j)
+          list_one: if ( n > 0 ) then
+            if ( newl > list(n) ) then
+              n = n + 1
+              list(n) = newl
+              j = j+1
+            end if
+          else list_one
+            n = n + 1
+            list(n) = newl
+            j = j+1
+          end if list_one
+
+!       If there there is only valid elements left in list2
+        else if (k>0 .and. k<=n2) then lists
+
+!         Add the current element of list2 to list if it is not already there
+!         or list is empty and increment k.
+          newl = list2(k)
+          list_two: if ( n > 0 ) then
+            if ( newl > list(n) ) then
+              n = n + 1
+              list(n) = newl
+              k = k+1
+            end if
+          else list_two
+            n = n + 1
+            list(n) = newl
+            k = k+1
+          end if list_two
+        end if lists
+
+!       If both list1 and list2 have been fully traversed then exit.
+        if ( j > n1 .and. k > n2 ) exit
+
+      end do loop
+
+    end subroutine union
+
+
+!   Routine to figure out how many processors to assign to a list of super
+!   regions and if necessary split super regions into regions to maintain load
+!   balance across all regions. Returns a lits of potentially split super
+!   regions and 2 arrays containing the minimum and maximum processor id
+!   assigned to all leaf regions in the list of super region tree structures.
+    subroutine SplitSuperRegions ( sregion, nprocs )
+      type(ptr), dimension(:), intent(inout) :: sregion
+      integer, intent(in) :: nprocs
+      integer :: i, nregs, totcost, &
+                 clevel, ncount, maxcost, np
+      logical :: lower_is_outer, upper_is_outer
+      integer :: boundarysize, minsize
+      integer, dimension(1) :: mydir
+      integer, dimension(:), allocatable :: cost
+      integer, dimension(3) :: mycost
+      integer, dimension(:,:), allocatable :: lcost
+      real(wp) :: idealcost, nrprocs
+      real(wp), dimension(:), allocatable :: nsplit, accumsplit, frac1, frac2
+      real(wp) :: fract
+      integer, dimension(:), allocatable :: accumcost, minmap, maxmap, &
+                                            splitnum
+      character :: msg*200
+
+!     Set nregs to the number of super regions in the input list.
+      nregs = size(sregion)
+
+!     Allocate helper arrays. 
+      allocate ( cost(nregs), accumcost(nregs+1), minmap(nregs), &
+                 maxmap(nregs), nsplit(nregs), accumsplit(nregs+1), &
+                 frac1(nregs), frac2(nregs), lcost(nregs,3), splitnum(nregs) )
+
+!     Set cost to the total cost of each super region and accumcost to the
+!     accumulated cost.
+      accumcost(1) = 0
+      do i = 1, nregs
+        call cost_box ( sregion(i)%point%extent, lcost(i,:) )
+        cost(i) = product ( lcost(i,:) )
+        accumcost(i+1) = accumcost(i) + cost(i)
+      end do
+
+!     Calculate the total cost (of all super regions) as well as the idealcost.
+!     Note that totcost is an integer, while idealcost is real.
+      totcost = sum ( cost )
+      idealcost = real(totcost,wp) / nprocs
+
+!     Calculate the ideal processor location for the beginning and end of each
+!     super region, the fractional number of processors to be assigned to this
+!     region as well as the accumulated fractional processor count.
+      accumsplit(1) = 0.0_wp
+      do i = 1, nregs
+        minmap(i) = floor(accumcost(i)/idealcost) 
+        maxmap(i) = ceiling(accumcost(i+1)/idealcost) - 1 
+        nsplit(i) = cost(i)/idealcost
+        accumsplit(i+1) = accumsplit(i)+nsplit(i)
+      end do
+
+!     Calculate the size of the fractional regions at the beginning and the
+!     end of each super region.
+      do i = 1, nregs
+        if ( maxmap(i) > minmap(i) ) then
+          frac1(i) = ceiling(accumsplit(i))-accumsplit(i)
+          frac2(i) = accumsplit(i+1)-floor(accumsplit(i+1))
+        else
+          frac1(i) = 0.0_wp
+          frac2(i) = 0.0_wp
+        end if
+      end do
+ 
+!     Loop over the super regions.
+      do i = 1, nregs
+        nrprocs = nsplit(i)
+!       If the region begins and ends on the same processor, don't split
+!       but just assign the region to that processor.
+        if ( maxmap(i) == minmap(i) ) then
+          sregion(i)%point%processor = minmap(i)
+          sregion(i)%point%frac = nrprocs
+!       Otherwise split the region.
+        else
+!         Initialize counter variables used in the recursive routine.
+          clevel = 0
+          ncount = 0
+!         Calculate the fractional part that has to be left unsplit. This is
+!         the sum of the fractional leftovers at either end.
+          fract = frac1(i)+frac2(i)
+!         If nrprocs > fract (fuzzy logic) then split recursively on nrprocs
+!         processors with fract left unsplit (0.0<fract<2.0) and then
+!         assign processors to the resulting regions.
+          if ( abs ( nrprocs -  fract ) > 0.000001_wp ) then
+            call SplitRegionsRecursive ( nrprocs, fract, 4, 3, &
+                    clevel, sregion(i)%point, ncount )
+            call AssignProcs ( minmap(i), maxmap(i), frac1(i), frac2(i), &
+                               sregion(i)%point )
+!         Otherwise we just need to split the superregion into 2 regions
+!         of size fract1 and fract2.
+          else
+!           Find the cost of the region.
+            call cost_box ( sregion(i)%point%extent, mycost )
+!           Find the direction of maximal cost.
+            maxcost = maxval(mycost)
+            mydir = maxloc (mycost)
+!           Find the number of gridpoints in direction mydir of the first
+!           split region.
+            np = nint(frac1(i)/fract*mycost(mydir(1)))
+            if (limit_size) then
+              lower_is_outer = &
+                   sregion(i)%point%outer_boundaries%obound(mydir(1),1)/=0
+              upper_is_outer = &
+                   sregion(i)%point%outer_boundaries%obound(mydir(1),2)/=0
+              boundarysize = ghostsize
+              minsize = boundarysize+ghostsize
+              if (lower_is_outer .and. np>0 .and. np<minsize) then
+                np = minsize
+              end if
+              if (upper_is_outer .and. np<maxcost .and. np>maxcost-minsize) then
+                np = maxcost-minsize
+              end if
+              if ((lower_is_outer .and. np>0 .and. np<minsize) .or. &
+                  (upper_is_outer .and. &
+                   np<maxcost .and. np>maxcost-minsize)) then
+                write (msg, '("Not enough grid points: np=",i4," gs=",i4, "maxcost=",i4)') &
+                       np, boundarysize, maxcost
+                call CCTK_WARN(CCTK_WARN_ABORT, msg)
+              end if
+            end if
+!           Split the region.
+            call split_sregion ( sregion(i)%point, mydir(1), (/ np /) )
+!           Assign the processors to the resulting regions.
+            sregion(i)%point%processor = -1
+            sregion(i)%point%children(1)%point%processor = minmap(i)
+            sregion(i)%point%children(1)%point%frac = frac1(i)
+            sregion(i)%point%children(2)%point%processor = maxmap(i)
+            sregion(i)%point%children(2)%point%frac = frac2(i)
+          end if
+        end if
+      end do
+
+!     Deallocate local helper arrays.
+      deallocate ( cost, accumcost, minmap, &
+                 maxmap, nsplit, accumsplit, &
+                 frac1, frac2, lcost, splitnum )
+ 
+    end subroutine SplitSuperRegions
+
+end module carpet_boxtypes