// Newton.cc -- solve Theta(h) = 0 via Newton's method
// $Header$
//
// Newton_solve - driver to solve Theta(h) = 0 for all our horizons
//
/// broadcast_status - broadcast status from active processors to all processors
/// broadcast_horizon_data - broadcast AH data to all processors
///
/// print_status - print Newton-iteration status on each active proc
/// Newton_step - take the Newton step, scaling it down if it's too large
//

// NOTE(review): the <...> names of the system headers were lost in
// extraction; restored here from what this file actually uses
// (assert(), INT_MAX, printf-family via cctk) -- confirm against the
// original source
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>

#include "util_Table.h"
#include "cctk.h"
#include "cctk_Arguments.h"

#include "config.h"
#include "stdc.h"
#include "../jtutil/util.hh"
#include "../jtutil/array.hh"
#include "../jtutil/cpm_map.hh"
#include "../jtutil/linear_map.hh"
using jtutil::error_exit;

#include "../patch/coords.hh"
#include "../patch/grid.hh"
#include "../patch/fd_grid.hh"
#include "../patch/patch.hh"
#include "../patch/patch_edge.hh"
#include "../patch/patch_interp.hh"
#include "../patch/ghost_zone.hh"
#include "../patch/patch_system.hh"

#include "../elliptic/Jacobian.hh"

#include "../gr/gfns.hh"
#include "../gr/gr.hh"

#include "horizon_sequence.hh"
#include "BH_diagnostics.hh"
#include "driver.hh"

//******************************************************************************

//
// prototypes for functions local to this file
//

namespace {
bool broadcast_status(const cGH* GH,
                      int N_procs, int N_active_procs,
                      int my_proc, bool my_active_flag,
                      int hn, int iteration,
                      enum expansion_status expansion_status,
                      fp rms_norm, fp infinity_norm,
                      bool found_this_horizon, bool I_need_more_iterations,
                      struct iteration_status_buffers& isb);
void broadcast_horizon_data(const cGH* GH,
                            bool broadcast_flag,
                            bool broadcast_horizon_shape,
                            struct BH_diagnostics& BH_diagnostics,
                            patch_system& ps,
                            struct horizon_buffers& horizon_buffers);

void print_status(int N_active_procs,
                  const struct iteration_status_buffers& isb);
void Newton_step(patch_system& ps,
                 fp max_allowable_Delta_h_over_h,
                 const struct verbose_info& verbose_info);
          }
//****************************************************************************** // // This function tries to finds each horizon assigned to this processor, // by solving Theta(h) = 0 via Newton's method. For each horizon found, // it computes the BH diagnostics, optionally prints them (via CCTK_VInfo()), // and optionally writes them to a BH diagnostics output file. It also // optionally writes the horizon shape itself to a data file. // // This function must be called synchronously across all processors, // and it operates synchronously, returning only when every processor // is done with all its horizons. See ./README.parallel for a discussion // of the parallel/multiprocessor issues and algorithm. // void Newton(const cGH* GH, int N_procs, int N_active_procs, int my_proc, horizon_sequence& hs, struct AH_data* const AH_data_array[], const struct cactus_grid_info& cgi, const struct geometry_info& gi, const struct Jacobian_info& Jacobian_info, const struct solver_info& solver_info, const struct IO_info& IO_info, const struct BH_diagnostics_info& BH_diagnostics_info, bool broadcast_horizon_shape, const struct error_info& error_info, const struct verbose_info& verbose_info, struct iteration_status_buffers& isb) { const bool my_active_flag = hs.has_genuine_horizons(); const int N_horizons = hs.N_horizons(); // print out which horizons we're finding on this processor if (hs.has_genuine_horizons()) then CCTK_VInfo(CCTK_THORNSTRING, "proc %d: searching for horizon%s %s/%d", my_proc, (hs.my_N_horizons() > 1 ? 
"s" : ""), hs.sequence_string(","), N_horizons); else CCTK_VInfo(CCTK_THORNSTRING, "proc %d: dummy horizon(s) only", my_proc); // // each pass through this loop finds a single horizon, // or does corresponding dummy-horizon calls // // note there is no explicit exit test, rather we exit from the middle // of the loop (only) when all processors are done with all their genuine // horizons // for (int hn = hs.init_hn() ; ; hn = hs.next_hn()) { if (verbose_info.print_algorithm_details) then CCTK_VInfo(CCTK_THORNSTRING, "Newton_solve(): processor %d working on horizon %d", my_proc, hn); const bool horizon_is_genuine = hs.is_genuine(); const bool there_is_another_genuine_horizon = hs.is_next_genuine(); if (verbose_info.print_algorithm_details) then { CCTK_VInfo(CCTK_THORNSTRING, " horizon_is_genuine=%d", int(horizon_is_genuine)); CCTK_VInfo(CCTK_THORNSTRING, " there_is_another_genuine_horizon=%d", int(there_is_another_genuine_horizon)); } struct AH_data* AH_data_ptr = horizon_is_genuine ? AH_data_array[hn] : NULL; patch_system* ps_ptr = horizon_is_genuine ? AH_data_ptr->ps_ptr : NULL; Jacobian* Jac_ptr = horizon_is_genuine ? AH_data_ptr->Jac_ptr: NULL; const int max_iterations = horizon_is_genuine ? (AH_data_ptr->initial_find_flag ? 
solver_info.max_Newton_iterations__initial : solver_info.max_Newton_iterations__subsequent) : INT_MAX; // // each pass through this loop does a single Newton iteration // on the current horizon (which might be either genuine or dummy) // for (int iteration = 1 ; ; ++iteration) { if (verbose_info.print_algorithm_debug) then CCTK_VInfo(CCTK_THORNSTRING, "beginning iteration %d (horizon_is_genuine=%d)", iteration, int(horizon_is_genuine)); // // evaluate the expansion Theta(h) and its norms // *** this is a synchronous operation across all processors *** // if (horizon_is_genuine && solver_info.debugging_output_at_each_Newton_iteration) then output_gridfn(*ps_ptr, gfns::gfn__h, IO_info, IO_info.h_base_file_name, hn, verbose_info.print_algorithm_highlights, iteration); jtutil::norm Theta_norms; const enum expansion_status expansion_status = expansion(ps_ptr, cgi, gi, error_info, (iteration == 1), true, // yes, we want Jacobian coeffs verbose_info.print_algorithm_details, &Theta_norms); const bool Theta_is_ok = (expansion_status == expansion_success); const bool norms_are_ok = horizon_is_genuine && Theta_is_ok; if (verbose_info.print_algorithm_debug) then { CCTK_VInfo(CCTK_THORNSTRING, " Newton_solve(): Theta_is_ok=%d", Theta_is_ok); if (norms_are_ok) then CCTK_VInfo(CCTK_THORNSTRING, " Theta rms-norm %.1e, infinity-norm %.1e", double(Theta_norms.rms_norm()), double(Theta_norms.infinity_norm())); } if (horizon_is_genuine && Theta_is_ok && solver_info.debugging_output_at_each_Newton_iteration) then output_gridfn(*ps_ptr, gfns::gfn__Theta, IO_info, IO_info.Theta_base_file_name, hn, verbose_info.print_algorithm_highlights, iteration); // // have we found this horizon? 
// if so, compute and output BH diagnostics // const bool found_this_horizon = norms_are_ok && (Theta_norms.infinity_norm() <= solver_info.Theta_norm_for_convergence); if (horizon_is_genuine) then AH_data_ptr->found_flag = found_this_horizon; if (found_this_horizon) then { struct BH_diagnostics& BH_diagnostics = AH_data_ptr->BH_diagnostics; BH_diagnostics.compute(*ps_ptr, BH_diagnostics_info); if (IO_info.output_BH_diagnostics) then { if (AH_data_ptr->BH_diagnostics_fileptr == NULL) then AH_data_ptr->BH_diagnostics_fileptr = BH_diagnostics.setup_output_file (IO_info, N_horizons, hn); BH_diagnostics.output(AH_data_ptr ->BH_diagnostics_fileptr, IO_info); } } // // see if we need more iterations (either on this or another horizon) // // does *this* horizon need more iterations? // i.e. has this horizon's Newton iteration not yet converged? const bool this_horizon_needs_more_iterations = horizon_is_genuine && Theta_is_ok && !found_this_horizon && (iteration < max_iterations); // do I (this processor) need to do more iterations // on this or a following horizon? const bool I_need_more_iterations = this_horizon_needs_more_iterations || there_is_another_genuine_horizon; if (verbose_info.print_algorithm_details) then { CCTK_VInfo(CCTK_THORNSTRING, " flags: found_this_horizon=%d", int(found_this_horizon)); CCTK_VInfo(CCTK_THORNSTRING, " this_horizon_needs_more_iterations=%d", int(this_horizon_needs_more_iterations)); CCTK_VInfo(CCTK_THORNSTRING, " I_need_more_iterations=%d", int(I_need_more_iterations)); } // // broadcast iteration status from each active processor // to all processors, and inclusive-or the "we need more iterations" // flags to see if *any* (active) processor needs more iterations // const bool any_proc_needs_more_iterations = broadcast_status(GH, N_procs, N_active_procs, my_proc, my_active_flag, hn, iteration, expansion_status, (norms_are_ok ? Theta_norms.rms_norm() : 0.0), (norms_are_ok ? 
Theta_norms.infinity_norm() : 0.0), found_this_horizon, I_need_more_iterations, isb); // set found-this-horizon flags // for all active processors' non-dummy horizons { for (int found_proc = 0 ; found_proc < N_active_procs ; ++found_proc) { const int found_hn = isb.hn_buffer[found_proc]; if (found_hn > 0) then AH_data_array[found_hn]->found_flag = isb.found_horizon_buffer[found_proc]; } } if (verbose_info.print_algorithm_details) then { CCTK_VInfo(CCTK_THORNSTRING, " ==> any_proc_needs_more_iterations=%d", int(any_proc_needs_more_iterations)); } // // print the iteration status info // if ((my_proc == 0) && verbose_info.print_algorithm_highlights) then print_status(N_active_procs, isb); // // for each processor which found a horizon, // broadcast its horizon info to all processors // and print the BH diagnostics on processor 0 // { for (int found_proc = 0 ; found_proc < N_active_procs ; ++found_proc) { if (! isb.found_horizon_buffer[found_proc]) then continue; const int found_hn = isb.hn_buffer[found_proc]; struct AH_data& found_AH_data = *AH_data_array[found_hn]; if (verbose_info.print_algorithm_details) then CCTK_VInfo(CCTK_THORNSTRING, " broadcasting proc %d/horizon %d diagnostics%s", found_proc, found_hn, (broadcast_horizon_shape ? "+shape" : "")); broadcast_horizon_data(GH, my_proc == found_proc, broadcast_horizon_shape, found_AH_data.BH_diagnostics, *found_AH_data.ps_ptr, found_AH_data.horizon_buffers); if ((my_proc == 0) && verbose_info.print_physics_details) then found_AH_data.BH_diagnostics .print(N_horizons, found_hn); } } // // if we found our horizon, maybe output the horizon shape? 
// if (found_this_horizon) then { if (IO_info.output_h) then output_gridfn(*ps_ptr, gfns::gfn__h, IO_info, IO_info.h_base_file_name, hn, verbose_info .print_algorithm_highlights); if (IO_info.output_Theta) then output_gridfn(*ps_ptr, gfns::gfn__Theta, IO_info, IO_info.Theta_base_file_name, hn, verbose_info .print_algorithm_highlights); } // // are all processors done with all their genuine horizons? // or if this is a single-processor run, are we done with this horizon? // if (! any_proc_needs_more_iterations) then { if (verbose_info.print_algorithm_details) then CCTK_VInfo(CCTK_THORNSTRING, "==> all processors are finished!"); return; // *** NORMAL RETURN *** } if ((N_procs == 1) && !this_horizon_needs_more_iterations) then { if (verbose_info.print_algorithm_debug) then CCTK_VInfo(CCTK_THORNSTRING, "==> [single processor] Skipping to next horizon"); break; // *** LOOP EXIT *** } // // compute the Jacobian matrix // *** this is a synchronous operation across all processors *** // if (verbose_info.print_algorithm_debug) then CCTK_VInfo(CCTK_THORNSTRING, " computing Jacobian: genuine/dummy flag %d", int(this_horizon_needs_more_iterations)); const enum expansion_status Jacobian_status = expansion_Jacobian (this_horizon_needs_more_iterations ? ps_ptr : NULL, this_horizon_needs_more_iterations ? Jac_ptr : NULL, cgi, gi, Jacobian_info, error_info, (iteration == 1), verbose_info.print_algorithm_details); const bool Jacobian_is_ok = (Jacobian_status == expansion_success); // // skip to the next horizon unless // this is a genuine Jacobian computation, and it went ok // if (! 
(this_horizon_needs_more_iterations && Jacobian_is_ok)) then { if (verbose_info.print_algorithm_debug) then CCTK_VInfo(CCTK_THORNSTRING, " skipping to next horizon"); break; // *** LOOP EXIT *** } // // compute the Newton step // if (verbose_info.print_algorithm_details) then CCTK_VInfo(CCTK_THORNSTRING, "solving linear system for Delta_h (%d unknowns)", Jac_ptr->N_rows()); const fp rcond = Jac_ptr->solve_linear_system(gfns::gfn__Theta, gfns::gfn__Delta_h, verbose_info.print_algorithm_details); if (rcond == 0.0) then { CCTK_VWarn(1, __LINE__, __FILE__, CCTK_THORNSTRING, "Newton_solve: Jacobian matrix is numerically singular!"); // give up on this horizon break; // *** LOOP CONTROL *** } if (verbose_info.print_algorithm_details) then { if (rcond > 0.0) then CCTK_VInfo(CCTK_THORNSTRING, "done solving linear system (condition number %.1e)", double(rcond)); else CCTK_VInfo(CCTK_THORNSTRING, "done solving linear system"); } if (solver_info.debugging_output_at_each_Newton_iteration) then output_gridfn(*ps_ptr, gfns::gfn__Delta_h, IO_info, IO_info.Delta_h_base_file_name, hn, verbose_info.print_algorithm_details, iteration); // // take the Newton step (scaled if need be) // Newton_step(*ps_ptr, solver_info.max_allowable_Delta_h_over_h, verbose_info); // end of this Newton iteration } // end of this horizon } // we should never get to here assert( false ); } //****************************************************************************** //****************************************************************************** //****************************************************************************** // // This function (which must be called on *every* processor) broadcasts // the per-iteration status information from each active processor to // all processors. // // Arguments: // GH --> The Cactus grid hierarchy. // N_procs = The total number of processors. // N_active_procs = The number of active processors. // my_active_flag = Is this processor an active processor? 
// hn,iteration,status, // {rms,infinity}_norm, // found_this_horizon,I_need_more_iterations // = On an active processors, these are the values to be broadcast. // On a dummy processors, these are ignored. // isb = (out) Holds both user buffers (set to the broadcast results) // and low-level buffers (used within this function). If the // buffer pointers are NULL then the buffers are allocated. // // Results: // This function returns the inclusive-or over all active processors, // of the broadcast I_need_more_iterations flags. // namespace { bool broadcast_status(const cGH* GH, int N_procs, int N_active_procs, int my_proc, bool my_active_flag, int hn, int iteration, enum expansion_status expansion_status, fp rms_norm, fp infinity_norm, bool found_this_horizon, bool I_need_more_iterations, struct iteration_status_buffers& isb) { assert( my_proc >= 0 ); assert( my_proc < N_procs ); // // We do the broadcast via a Cactus reduction operation (this is a KLUDGE, // but until Cactus gets a generic interprocessor communications mechanism // it's the best we can do without mucking with MPI ourself). To do the // gather via a reduction, we set up a 2-D buffer whose entries are all // 0s, except that on each processor the [my_proc] row has the values we // want to gather. Then we do a sum-reduction of the buffers across // processors. For the actual reduction we treat the buffers as 1-D // arrays on each processor (this slightly simplifies the code). // // To reduce overheads, we do the entire operation with a single (CCTK_REAL) // Cactus reduce, casting values to CCTK_REAL as necessary (and encoding // Boolean flags into signs of known-to-be-positive values. // // Alas MPI (and thus Cactus) requires the input and output reduce buffers // to be distinct, so we need two copies of the buffers on each processor. // // the buffers are actually 2-D arrays; these are the column numbers // ... 
if we wanted to, we could pack hn, iteration, and expansion_status // all into a single (64-bit) floating-point value, but it's not // worth the trouble... enum { // CCTK_INT buffer buffer_var__hn = 0, // also encodes found_this_horizon flag // in sign: +=true, -=false buffer_var__iteration, // also encodes I_need_more_iterations flag // in sign: +=true, -=false buffer_var__expansion_status, buffer_var__rms_norm, buffer_var__infinity_norm, N_buffer_vars // no comma }; // // allocate buffers if this is the first use // if (isb.hn_buffer == NULL) then { isb.hn_buffer = new int [N_active_procs]; isb.iteration_buffer = new int [N_active_procs]; isb.expansion_status_buffer = new enum expansion_status [N_active_procs]; isb.rms_norm_buffer = new fp [N_active_procs]; isb.infinity_norm_buffer = new fp [N_active_procs]; isb.found_horizon_buffer = new bool[N_active_procs]; isb.send_buffer_ptr = new jtutil::array2d (0, N_active_procs-1, 0, N_buffer_vars-1); isb.receive_buffer_ptr = new jtutil::array2d (0, N_active_procs-1, 0, N_buffer_vars-1); } jtutil::array2d& send_buffer = *isb.send_buffer_ptr; jtutil::array2d& receive_buffer = *isb.receive_buffer_ptr; // // pack this processor's values into the reduction buffer // jtutil::zero_C_array(send_buffer.N_array(), send_buffer.data_array()); if (my_active_flag) then { assert( send_buffer.is_valid_i(my_proc) ); assert( hn >= 0 ); // encoding scheme assumes this assert( iteration > 0 ); // encoding scheme assumes this send_buffer(my_proc, buffer_var__hn) = found_this_horizon ? +hn : -hn; send_buffer(my_proc, buffer_var__iteration) = I_need_more_iterations ? 
+iteration : -iteration; send_buffer(my_proc, buffer_var__expansion_status) = int(expansion_status); send_buffer(my_proc, buffer_var__rms_norm) = rms_norm; send_buffer(my_proc, buffer_var__infinity_norm) = infinity_norm; } // // do the reduction // // this name is appropriate for PUGHReduce, caveat user for other drivers :) const int reduction_handle = CCTK_ReductionArrayHandle("sum"); if (reduction_handle < 0) then CCTK_VWarn(FATAL_ERROR, __LINE__, __FILE__, CCTK_THORNSTRING, "broadcast_status(): can't get sum-reduction handle!"); /*NOTREACHED*/ const int reduction_status = CCTK_ReduceLocArrayToArray1D(GH, -1, // broadcast results to all processors reduction_handle, static_cast (send_buffer .data_array()), static_cast< void*> (receive_buffer.data_array()), send_buffer.N_array(), CCTK_VARIABLE_REAL); if (reduction_status < 0) then CCTK_VWarn(FATAL_ERROR, __LINE__, __FILE__, CCTK_THORNSTRING, "broadcast_status(): error status %d from reduction!", reduction_status); /*NOTREACHED*/ // // unpack the reduction buffer back to the high-level result buffers and // compute the inclusive-or of the broadcast I_need_more_iterations flags // bool any_proc_needs_more_iterations = false; for (int proc = 0 ; proc < N_active_procs ; ++proc) { const int hn_temp = static_cast( receive_buffer(proc, buffer_var__hn) ); isb.hn_buffer[proc] = jtutil::abs(hn_temp); isb.found_horizon_buffer[proc] = (hn_temp > 0); const int iteration_temp = static_cast( receive_buffer(proc, buffer_var__iteration) ); isb.iteration_buffer[proc] = jtutil::abs(iteration_temp); const bool proc_needs_more_iterations = (iteration_temp > 0); any_proc_needs_more_iterations |= proc_needs_more_iterations; isb.expansion_status_buffer[proc] = static_cast( static_cast( receive_buffer(proc, buffer_var__expansion_status) ) ); isb.rms_norm_buffer[proc] = receive_buffer(proc, buffer_var__rms_norm); isb.infinity_norm_buffer[proc] = receive_buffer(proc, buffer_var__infinity_norm); } return any_proc_needs_more_iterations; } } 
//****************************************************************************** // // This function (which must be called on *every* processor) broadcasts // the BH diagnostics and (ghosted) horizon shape from a specified processor // to all processors. // // Arguments: // GH --> The Cactus grid hierarchy. // broadcast_flag = true on the broadcasting processor, // false on all other processors // BH_diagnostics = On the broadcasting processor, this is the BH diagnostics // to broadcast; on all other processors, it's set to the // broadcast BH diagnostics. // ps = On the broadcasting processor, gfn__h is broadcast; // on all other processors, gfn__h is set to the broadcast values. // horizon_buffers = Internal buffers for use in the broadcast; // if N_buffer == 0 then we set N_buffer and allocate // the buffers. // namespace { void broadcast_horizon_data(const cGH* GH, bool broadcast_flag, bool broadcast_horizon_shape, struct BH_diagnostics& BH_diagnostics, patch_system& ps, struct horizon_buffers& horizon_buffers) { // // Implementation notes: // // We do the send via a Cactus sum-reduce where the data are 0 on // all processors except the sending one. // // To reduce the interprocessor-communications cost, we actually only // broadcast the nominal-grid horizon shape; we then do a synchronize // operation on the patch system to recover the full ghosted-grid shape. // if (horizon_buffers.N_buffer == 0) then { // allocate the buffers horizon_buffers.N_buffer = BH_diagnostics::N_buffer + (broadcast_horizon_shape ? 
ps.N_grid_points() : 0); horizon_buffers.send_buffer = new CCTK_REAL[horizon_buffers.N_buffer]; horizon_buffers.receive_buffer = new CCTK_REAL[horizon_buffers.N_buffer]; } if (broadcast_flag) then { // pack the data to be broadcast into the send buffer BH_diagnostics.copy_to_buffer(horizon_buffers.send_buffer); if (broadcast_horizon_shape) then { int posn = BH_diagnostics::N_buffer; for (int pn = 0 ; pn < ps.N_patches() ; ++pn) { const patch& p = ps.ith_patch(pn); for (int irho = p.min_irho() ; irho <= p.max_irho() ; ++irho) { for (int isigma = p.min_isigma() ; isigma <= p.max_isigma() ; ++isigma) { assert( posn < horizon_buffers.N_buffer ); horizon_buffers.send_buffer[posn++] = p.ghosted_gridfn(gfns::gfn__h, irho,isigma); } } } assert( posn == horizon_buffers.N_buffer ); } } else jtutil::zero_C_array(horizon_buffers.N_buffer, horizon_buffers.send_buffer); // this name is appropriate for PUGHReduce, caveat user for other drivers :) const int reduction_handle = CCTK_ReductionArrayHandle("sum"); if (reduction_handle < 0) then CCTK_VWarn(FATAL_ERROR, __LINE__, __FILE__, CCTK_THORNSTRING, "broadcast_horizon_data(): can't get sum-reduction handle!"); /*NOTREACHED*/ const int status = CCTK_ReduceLocArrayToArray1D(GH, -1, // result broadcast to all processors reduction_handle, static_cast (horizon_buffers.send_buffer), static_cast< void*> (horizon_buffers.receive_buffer), horizon_buffers.N_buffer, CCTK_VARIABLE_REAL); if (status < 0) then CCTK_VWarn(FATAL_ERROR, __LINE__, __FILE__, CCTK_THORNSTRING, "broadcast_horizon_data(): error status %d from reduction!", status); /*NOTREACHED*/ if (!broadcast_flag) then { // unpack the data from the receive buffer BH_diagnostics.copy_from_buffer(horizon_buffers.receive_buffer); if (broadcast_horizon_shape) then { int posn = BH_diagnostics::N_buffer; for (int pn = 0 ; pn < ps.N_patches() ; ++pn) { patch& p = ps.ith_patch(pn); for (int irho = p.min_irho() ; irho <= p.max_irho() ; ++irho) { for (int isigma = p.min_isigma() ; isigma <= 
p.max_isigma() ; ++isigma) { assert( posn < horizon_buffers.N_buffer ); p.ghosted_gridfn(gfns::gfn__h, irho,isigma) = horizon_buffers.receive_buffer[posn++]; } } } assert( posn == horizon_buffers.N_buffer ); // recover the full ghosted-grid horizon shape // (we only broadcast the nominal-grid shape) ps.synchronize(); } } } } //****************************************************************************** //****************************************************************************** //****************************************************************************** // // This function is called on processor #0 to print the status of the // Newton iteration on each active processor. // // Arguments: // N_active_procs = The number of active processors. // isb = The high-level buffers here give the information to be printed. // namespace { void print_status(int N_active_procs, const struct iteration_status_buffers& isb) { for (int proc = 0 ; proc < N_active_procs ; ++proc) { // don't print anything for processors doing dummy evaluations if (isb.hn_buffer[proc] == 0) then continue; if (isb.expansion_status_buffer[proc] == expansion_success) then CCTK_VInfo(CCTK_THORNSTRING, " proc %d/horizon %d:it %d |Theta| rms=%.1e inf=%.1e", proc, isb.hn_buffer[proc], isb.iteration_buffer[proc], double(isb.rms_norm_buffer[proc]), double(isb.infinity_norm_buffer[proc])); else CCTK_VInfo(CCTK_THORNSTRING, " proc %d/horizon %d: %s", proc, isb.hn_buffer[proc], expansion_status_string( isb.expansion_status_buffer[proc] )); } } } //****************************************************************************** // // This function takes the Newton step, scaling it down if it's too large. // // Arguments: // ps = The patch system containing the gridfns h and Delta_h. // max_allowable_Delta_h_over_h = The maximum allowable // ||Delta_h||_infinity / ||h||_mean // Any step over this is internally clamped // (scaled down) to this size. 
// namespace { void Newton_step(patch_system& ps, fp max_allowable_Delta_h_over_h, const struct verbose_info& verbose_info) { // // compute scale factor (1 for small steps, <1 for large steps) // jtutil::norm h_norms; ps.ghosted_gridfn_norms(gfns::gfn__h, h_norms); const fp max_allowable_Delta_h = max_allowable_Delta_h_over_h * h_norms.mean(); jtutil::norm Delta_h_norms; ps.gridfn_norms(gfns::gfn__Delta_h, Delta_h_norms); const fp max_Delta_h = Delta_h_norms.infinity_norm(); const fp scale = (max_Delta_h <= max_allowable_Delta_h) ? 1.0 : max_allowable_Delta_h / max_Delta_h; if (verbose_info.print_algorithm_details) then { if (scale == 1.0) then CCTK_VInfo(CCTK_THORNSTRING, "h += Delta_h (rms-norm=%.1e, infinity-norm=%.1e)", Delta_h_norms.rms_norm(), Delta_h_norms.infinity_norm()); else CCTK_VInfo(CCTK_THORNSTRING, "h += %g * Delta_h (infinity-norm clamped to %.2g)", scale, scale * Delta_h_norms.infinity_norm()); } // // take the Newton step (scaled if necessary) // for (int pn = 0 ; pn < ps.N_patches() ; ++pn) { patch& p = ps.ith_patch(pn); for (int irho = p.min_irho() ; irho <= p.max_irho() ; ++irho) { for (int isigma = p.min_isigma() ; isigma <= p.max_isigma() ; ++isigma) { p.ghosted_gridfn(gfns::gfn__h, irho,isigma) -= scale * p.gridfn(gfns::gfn__Delta_h, irho,isigma); } } } } }