module carpet_boxtypes use carpet_region implicit none integer :: ghostsize real(wp) :: alpha logical :: limit_size integer :: granularity integer :: granularity_boundary integer :: procid contains ! The basic calculation of the cost in each of the 3 directions. subroutine cost_box ( reg, cost ) type(bbox), intent(in) :: reg integer, dimension(3), intent(out) :: cost integer :: i do i = 1, 3 cost(i) = (reg%dim(i)%upper - reg%dim(i)%lower)/reg%dim(i)%stride + 1 end do end subroutine cost_box ! Find a sorted list of the best options for splitting npoints in 2 chunks ! among nprocs processors. The fractional remainder is passed in through ! fract. The maximum allowed number of options to return is nmax, and the ! actual number of returned options is nact. The result is returned in ! np (number of procs for the first chunk) and nc (a measure of the ! imabalance of the split. subroutine choosenprocs ( npoints, nprocs, fract, nmax, nact, np, nc ) integer, intent(in) :: npoints, nmax real(wp), intent(in) :: nprocs, fract integer, intent(out) :: nact integer, dimension(:), intent(out) :: np real(wp), dimension(:), intent(out) :: nc integer :: idealcost integer :: quot, i, j, round, ind real(wp) :: c1, c2, currentimbal logical :: exchange ! The idealcost should be npoints/nprocs but I rescale with nprocs for ! The purpose of avoiding some problems with floating point comparisons. idealcost = npoints ! Initialize the cost to the maximum possible floating point number. nc = minval( empty ) ! initialize the actual returned number of options to 0. nact = 0 ! Initialize an integer index to zero. Used to keep track where the current ! option should be inserted in the result vector. ind = 1 ! Calculate the starting number of processors to start the search with ! (quot) as the largest integer smaller than nprocs. quot = floor(nprocs) ! If the fractional part we have to leave unsplit is larger than 1 ! decrement quot by 1. if ( fract > 1.0_wp ) quot = quot-1 ! If nprocs has zero fractional part, we can start at quot/2 due to ! symmetry. if ( .not. ( nprocs>quot ) ) then quot = quot / 2 end if ! Try all options until reaching a single processor. options: do i=quot, 1, -1 ! Calculate the number of points that will yield the smallest imbalance. round = nint(real(npoints,wp)/nprocs*i) ! Calculate the imbalance for the current choice of processors. ! This has been scaled by nprocs (like idealcost has been above). c1 = abs ( (round*nprocs)/i - idealcost ) c2 = abs ( (npoints-round)/(nprocs-i)*nprocs - idealcost ) currentimbal = max ( c1, c2 ) ! Initialize the boolean exchange that keeps track of whether the current ! choice needs to replace an entry in the return vectors. exchange = .false. ! Start from the current number of entries in the return vector and check ! if the current choice is better than any of them. If so indicate that ! an exchange has to happen and record the index. do j = nact, 1, -1 if ( currentimbal < nc(j) ) then exchange = .true. ind = j end if end do ! If an exchange has to happen, shift all elements with indices higher ! than or equal to ind to the right. if (exchange) then np(ind:nmax) = eoshift(np(ind:nmax),-1) nc(ind:nmax) = eoshift(nc(ind:nmax),-1) end if ! If we don't need to do an exchange, and if nact1.0d-10 ) nbad = i ! end do ! nact = nbad - 1 end subroutine choosenprocs ! Calculate a measure of the load imbalance of a super region. ! The measure is a weighted sum taking into account the imbalance in the ! number of points (computation) and the imbalance in the number of ghost ! zones (communication). Ghost zones are added with a multiplication factor ! of alpha. subroutine calc_imbalance ( sregion, imbalance ) ! The intent has been removed to make it compile with gfortran 4.1. ! type(superregion2), pointer, intent(in) :: sregion type(superregion2), pointer :: sregion real(wp), intent(out) :: imbalance integer, dimension(3) :: cost real(wp) :: maxcost, maxgcost, n real(wp) :: idealcost, idealgcost ! Initialize counter and accumulation variables n = 0.0_wp maxcost = 0.0_wp idealcost = 0.0_wp maxgcost = 0.0_wp idealgcost = 0.0_wp ! Traverse the tree structure and add values for all leaves. call traverse_tree ( sregion ) ! Calculate the ideal cost idealcost = idealcost/n idealgcost = idealgcost/n ! Calculate the imbalance imbalance = ( maxcost/idealcost - 1.0_wp ) & + alpha * ( maxgcost/idealgcost - 1.0_wp ) contains ! Routine to traverse the tree and accumulate the cost of the leaves. recursive subroutine traverse_tree ( sreg ) ! The intent has been removed to make it compile with gfortran 4.1. ! type(superregion2), pointer, intent(in) :: sreg type(superregion2), pointer :: sreg integer :: ich, nch, tcost, gcost ! If the region has children recursively traverse the children. if ( associated(sreg%children) ) then nch = size(sreg%children) do ich = 1, nch call traverse_tree ( sreg%children(ich)%point ) end do ! Otherwise it's a leaf and we accumulate the cost and update the ! maximum cost. else call cost_box ( sreg%extent, cost ) tcost = product ( cost, mask = cost /= 0 ) gcost = product ( cost+2*ghostsize, mask = cost /= 0 ) - tcost tcost = tcost gcost = gcost maxcost = max ( maxcost, tcost / sreg%frac ) maxgcost = max ( maxgcost, gcost / sreg%frac ) idealcost = idealcost + tcost idealgcost = idealgcost + gcost n = n + sreg%frac end if end subroutine traverse_tree end subroutine calc_imbalance ! Assign processor numbers to the leaves in a tree starting from start_proc. subroutine AssignProcs ( minproc, maxproc, frac1, frac2, sregion ) integer, intent(in) :: minproc, maxproc real(wp ), intent(in) :: frac1, frac2 ! The intent has been removed to make it compile with gfortran 4.1. ! type(superregion2), pointer, intent(inout) :: sregion type(superregion2), pointer :: sregion ! Initialize. if ( frac1 == 0 ) then procid = minproc else procid = minproc + 1 end if ! Traverse the tree. call traverse_treep ( sregion ) contains ! Routine to traverse the tree and assign processor ids to the leaves. recursive subroutine traverse_treep ( sreg ) ! The intent has been removed to make it compile with gfortran 4.1. ! type(superregion2), pointer, intent(inout) :: sreg type(superregion2), pointer :: sreg integer :: ich, nch, maxcost, np integer, dimension(3) :: cost integer, dimension(1) :: mydim ! If the region has children recursively traverse the children. if ( associated(sreg%children) ) then nch = size(sreg%children) do ich = 1, nch call traverse_treep ( sreg%children(ich)%point ) end do ! Otherwise it's a leaf and we assign the processor id and increment it. else if ( sreg%frac == 1.0_wp ) then sreg%processor = procid procid = procid + 1 else if ( ( frac1 /= 0.0_wp ) .and. ( frac2 /= 0.0_wp ) ) then call cost_box ( sreg%extent, cost ) maxcost = maxval(cost) mydim = maxloc (cost) np = nint(frac1/(frac1+frac2)*cost(mydim(1))) call split_sregion ( sreg, mydim(1), (/ np /) ) sreg%processor = -1 sreg%children(1)%point%processor = minproc sreg%children(1)%point%frac = frac1 sreg%children(2)%point%processor = maxproc sreg%children(2)%point%frac = frac2 else if ( frac1 == 0.0_wp ) then sreg%processor = maxproc else if ( frac2 == 0.0_wp ) then sreg%processor = minproc end if end if end if end subroutine traverse_treep end subroutine AssignProcs ! Routine to sum up the load of an array of super regions. Return the result ! in the pre-allocated array of length nprocs. The array should be ! initialized to zero. subroutine calc_proc_load ( sregions, proc_load ) type(ptr), dimension(:), intent(in) :: sregions integer, dimension(:), intent(inout) :: proc_load integer :: i ! For each of the super regions in the array. do i = 1, size(sregions) ! Traverse the tree and call traverse_treel ( sregions(i)%point ) end do contains ! Routine to traverse the tree and add the load of each leaf to the ! assigned processor. recursive subroutine traverse_treel ( sreg ) ! The intent has been removed to make it compile with gfortran 4.1. ! type(superregion2), pointer, intent(in) :: sreg type(superregion2), pointer :: sreg integer, dimension(3) :: cost integer :: ich, nch ! If the region has children recursively traverse the children. if ( associated(sreg%children) ) then nch = size(sreg%children) do ich = 1, nch call traverse_treel ( sreg%children(ich)%point ) end do ! Otherwise it's a leaf and we calculate the cost and accumulate it ! into the correct spot in the proc_load array. else call cost_box ( sreg%extent, cost ) proc_load(sreg%processor+1) = proc_load(sreg%processor+1) + & product ( cost ) end if end subroutine traverse_treel end subroutine calc_proc_load ! Split a superregion (newreg) recursively into smaller boxes while trying ! to minimize the load imbalance. Here we allow for fractional processor ! numbers, nprocs. The fraction that should be left unsplit is given by ! fract (0.0<=fract<2). The inputs maxchoices and maxfactor are used to ! determine how many options to examine at a given recursion level. The ! variable clevel should be initialized to 0 and is used to keep track of ! the recursion level. The number of options testes is counter and returned ! in ncount. recursive subroutine SplitRegionsRecursive ( nprocs, fract, maxchoices, & maxfactor, clevel, & newreg, ncount ) real(wp), intent(in) :: nprocs, fract integer, intent(in) :: maxchoices, maxfactor integer, intent(inout) :: clevel ! The intent has been removed to make it compile with gfortran 4.1. ! type(superregion2), pointer, intent(inout) :: newreg type(superregion2), pointer :: newreg integer, intent(inout) :: ncount type(bbox) :: reg integer, dimension(3) :: cost integer :: maxcost, ngood, i, np1, lo1, up1, str real(wp) :: p1, p2 integer, dimension(1) :: mydim integer, dimension(maxchoices) :: npr real(wp), dimension(maxchoices) :: ncr type(ptr), dimension(maxchoices) :: newregarr real(wp), dimension(maxchoices) :: imbalancearr type(bbox) :: reg1, reg2 integer, dimension(1) :: minimbalanceloc integer :: nlocal ! Extract the bounding box information reg = newreg%extent ! Make sure the array with the test super regions are nullified. do i = 1, maxchoices nullify ( newregarr(i)%point ) end do ! Initialize the imbalance array to max floating point value. imbalancearr(:) = minval(empty) ! Increment the recursion level variable. clevel = clevel + 1 ! Calculate the number of choices to examine at the current recursion ! level based on the value of maxchoices and maxfactor. nlocal = max ( maxchoices - ( clevel - 1) / maxfactor, 1 ) ! If we are called with nprocs==max(1,fract) mark the region to be a leaf ! in the tree, increment the choice counter and decrement the recursion ! level counter and return. if (nprocs <= max(1.0_wp,fract+0.000001_wp) ) then ncount = ncount + 1 clevel = clevel - 1 newreg%processor = 1 newreg%frac = nprocs return end if ! Calculate the cost of the current region. call cost_box ( reg, cost ) ! Find the dimension with the most points. maxcost = maxval(cost) mydim = maxloc (cost) ! Get the list of chunk options to investigate. call choosenprocs ( cost(mydim(1)), nprocs, fract, nlocal, ngood, & npr, ncr ) ! Loop over the chunk options to investigate. chunks: do i=1, ngood ! Initialize the super region. allocate ( newregarr(i)%point ) newregarr(i)%point = newreg ! Find the processor numbers of the two chunks (p1 always an integer). p1 = npr(i) p2 = nprocs - npr(i) ! Find the number of points in each of the 2 chunks while making ! sure the chunks are not too small compared to the ghostsize if ! limit_size is true. ! Take also the granularity into account. np1 = nint((real(maxcost,wp)*p1)/nprocs/granularity)*granularity ! TODO: Take granularity_boundary into account as well! if ( limit_size ) then np1 = max(ghostsize,np1) np1 = min(maxcost-ghostsize,np1) end if ! Split the region (reg) into two regions (reg1, reg2) taking the ! stride into account. reg1 = reg reg2 = reg lo1 = reg%dim(mydim(1))%lower up1 = reg%dim(mydim(1))%upper str = reg%dim(mydim(1))%stride reg1%dim(mydim(1))%upper = lo1 + (np1 - 1)*str reg2%dim(mydim(1))%lower = reg1%dim(mydim(1))%upper + str ! Add the new leaves to the tree. call split_sregion ( newregarr(i)%point, mydim(1), (/ np1 /) ) ! Split region reg1 among p1 processors and region reg2 among p2 ! processors adding the new regions to the super region. call SplitRegionsRecursive ( p1, fract, maxchoices, maxfactor, & clevel, & newregarr(i)%point%children(1)%point, & ncount ) call SplitRegionsRecursive ( p2, fract, maxchoices, maxfactor, & clevel, & newregarr(i)%point%children(2)%point, & ncount ) ! Calculate the load imbalance of the resulting super region. call calc_imbalance ( newregarr(i)%point, imbalancearr(i) ) ! If the balance is perfect exit the do loop and don't do any further ! tests. if ( imbalancearr(i) == 0.0_wp ) then ngood = i exit end if end do chunks ! Find the location of the minimum load imbalance. minimbalanceloc = minloc ( imbalancearr(1:ngood) ) ! Insert the best choice into the tree structure call point_to_children ( newreg, newregarr(minimbalanceloc(1))%point ) ! Cut the connection between the best choice and its children. call disassociate ( newregarr(minimbalanceloc(1))%point ) ! Deallocate all the tested options. do i = 1, ngood call destroy_sregion ( newregarr(i)%point ) end do ! Decrement the recursion level counter. clevel = clevel - 1 end subroutine SplitRegionsRecursive ! Routine to return the union of 2 pre-sorted arrays of integers. ! only n1 elements of list1 and n2 elements of list2 are actually used. ! The number of elements in the union is n. The union is returned in list. subroutine union (n1, list1, n2, list2, n, list ) integer, intent(in) :: n1, n2 integer, dimension(:), intent(in) :: list1, list2 integer, intent(out) :: n integer, dimension(:), intent(out) :: list integer :: j, k, newl ! Initialize n and list to 0. The integers j and k are used to march through ! the 2 input arrays and are initialized to 1 if the lists have some ! valid elements and 0 otherwise. n = 0; list = 0; j = min(1,n1); k = min(1,n2) ! Initialize the output array to 0 list = 0 ! If both input arrays are 0, so are the output array and we are done. if (n1==0 .and. n2==0) then return end if ! Repeat until j>n1 and k>n2. loop: do ! If there are still valid elements in both list1 and list2... lists: if ( (j>0 .and. j<=n1) .and. (k>0 .and. k<=n2) ) then ! The next element to add to list is the minimum of the current values ! in list1 and list2 newl = min ( list1(j), list2(k) ) ! If the element is not already in the output list.... both: if ( n > 0 ) then if ( newl > list(n) ) then ! Add the element n = n + 1 list(n) = newl ! If the added element comes from list1 increment j if ( list1(j) == list(n) ) j = j+1 ! If the added element comes from list2 increment j if ( list2(k) == list(n) ) k = k+1 end if ! Or if the output list is empty else both ! Add the element n = n + 1 list(n) = newl ! If the added element comes from list1 increment j if ( list1(j) == list(n) ) j = j+1 ! If the added element comes from list2 increment j if ( list2(k) == list(n) ) k = k+1 end if both ! If there there is only valid elements left in list1 else if (j>0 .and. j<=n1) then lists ! Add the current element of list1 to list if it is not already there ! or list is empty and increment j. newl = list1(j) list_one: if ( n > 0 ) then if ( newl > list(n) ) then n = n + 1 list(n) = newl j = j+1 end if else list_one n = n + 1 list(n) = newl j = j+1 end if list_one ! If there there is only valid elements left in list2 else if (k>0 .and. k<=n2) then lists ! Add the current element of list2 to list if it is not already there ! or list is empty and increment k. newl = list2(k) list_two: if ( n > 0 ) then if ( newl > list(n) ) then n = n + 1 list(n) = newl k = k+1 end if else list_two n = n + 1 list(n) = newl k = k+1 end if list_two end if lists ! If both list1 and list2 have been fully traversed then exit. if ( j > n1 .and. k > n2 ) exit end do loop end subroutine union ! Routine to figure out how many processors to assign to a list of super ! regions and if necessary split super regions into regions to maintain load ! balance across all regions. Returns a lits of potentially split super ! regions and 2 arrays containing the minimum and maximum processor id ! assigned to all leaf regions in the list of super region tree structures. subroutine SplitSuperRegions ( sregion, nprocs ) type(ptr), dimension(:), intent(inout) :: sregion integer, intent(in) :: nprocs integer :: i, nregs, totcost, & clevel, ncount, maxcost, np integer, dimension(1) :: mydir integer, dimension(:), allocatable :: cost integer, dimension(3) :: mycost integer, dimension(:,:), allocatable :: lcost real(wp) :: idealcost, nrprocs real(wp), dimension(:), allocatable :: nsplit, accumsplit, frac1, frac2 real(wp) :: fract integer, dimension(:), allocatable :: accumcost, minmap, maxmap, & splitnum ! Set nregs to the number of super regions in the input list. nregs = size(sregion) ! Allocate helper arrays. allocate ( cost(nregs), accumcost(nregs+1), minmap(nregs), & maxmap(nregs), nsplit(nregs), accumsplit(nregs+1), & frac1(nregs), frac2(nregs), lcost(nregs,3), splitnum(nregs) ) ! Set cost to the total cost of each super region and accumcost to the ! accumulated cost. accumcost(1) = 0 do i = 1, nregs call cost_box ( sregion(i)%point%extent, lcost(i,:) ) cost(i) = product ( lcost(i,:) ) accumcost(i+1) = accumcost(i) + cost(i) end do ! Calculate the total cost (of all super regions) as well as the idealcost. ! Note that totcost is an integer, while idealcost is real. totcost = sum ( cost ) idealcost = real(totcost,wp) / nprocs ! Calculate the ideal processor location for the beginning and end of each ! super region, the fractional number of processors to be assigned to this ! region as well as the accumulated fractional processor count. accumsplit(1) = 0.0_wp do i = 1, nregs minmap(i) = floor(accumcost(i)/idealcost) maxmap(i) = ceiling(accumcost(i+1)/idealcost) - 1 nsplit(i) = cost(i)/idealcost accumsplit(i+1) = accumsplit(i)+nsplit(i) end do ! Calculate the size of the fractional regions at the beginning and the ! end of each super region. do i = 1, nregs if ( maxmap(i) > minmap(i) ) then frac1(i) = ceiling(accumsplit(i))-accumsplit(i) frac2(i) = accumsplit(i+1)-floor(accumsplit(i+1)) else frac1(i) = 0.0_wp frac2(i) = 0.0_wp end if end do ! Loop over the super regions. do i = 1, nregs nrprocs = nsplit(i) ! If the region begins and ends on the same processor, don't split ! but just assign the region to that proceesor. if ( maxmap(i) == minmap(i) ) then sregion(i)%point%processor = minmap(i) sregion(i)%point%frac = nrprocs ! Otherwise split the region. else ! Initialize counter variables used in the recursive routine. clevel = 0 ncount = 0 ! Calculate the fractional part that has to be left unsplit. This is ! the sum of the fractional leftovers at either end. fract = frac1(i)+frac2(i) ! If nrprocs > fract (fuzzy logic) then split recursively on nrprocs ! processors with fract left unsplit (0.0 0.000001_wp ) then call SplitRegionsRecursive ( nrprocs, fract, 4, 3, & clevel, sregion(i)%point, ncount ) call AssignProcs ( minmap(i), maxmap(i), frac1(i), frac2(i), & sregion(i)%point ) ! Otherwise we just need to split the superregion into 2 regions ! of size fract1 and fract2. else ! Find the cost of the region. call cost_box ( sregion(i)%point%extent, mycost ) ! Find the direction of maximal cost. maxcost = maxval(mycost) mydir = maxloc (mycost) ! Find the number of gridpoints in direction mydir of the first ! split region. np = nint(frac1(i)/fract*mycost(mydir(1))) ! Split the region. call split_sregion ( sregion(i)%point, mydir(1), (/ np /) ) ! Assign the processors to the resulting regions. sregion(i)%point%processor = -1 sregion(i)%point%children(1)%point%processor = minmap(i) sregion(i)%point%children(1)%point%frac = frac1(i) sregion(i)%point%children(2)%point%processor = maxmap(i) sregion(i)%point%children(2)%point%frac = frac2(i) end if end if end do ! Deallocate local helper arrays. deallocate ( cost, accumcost, minmap, & maxmap, nsplit, accumsplit, & frac1, frac2, lcost, splitnum ) end subroutine SplitSuperRegions end module carpet_boxtypes