From 04cdac670f351fc7b256e7064d15fd644494a95b Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Fri, 16 Aug 2019 10:51:20 +0200
Subject: Add support for multi-component runs.

---
 schedule.ccl |   8 +-
 src/qms.c    | 329 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 311 insertions(+), 26 deletions(-)

diff --git a/schedule.ccl b/schedule.ccl
index e8241c1..cbea68c 100644
--- a/schedule.ccl
+++ b/schedule.ccl
@@ -7,11 +7,11 @@ if (CCTK_EQUALS(lapse_source, "QMS_MG")) {
   SCHEDULE qms_mg_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_lapse_evol {
     LANG: C
-    SYNC: ML_lapse
+    SYNC: ML_W
   } "Quasimaximal slicing eval W"
 
   SCHEDULE qms_mg_eval IN ML_CCZ4_evolCalcGroup BEFORE ML_CCZ4_lapse_evol {
     LANG: C
-    SYNC: ML_lapse
+    SYNC: ML_W
   } "Quasimaximal slicing eval W"
 
   SCHEDULE qms_mg_solve IN CCTK_POSTSTEP {
@@ -40,10 +40,6 @@ if (CCTK_EQUALS(lapse_source, "QMS_MG")) {
     LANG: C
   } ""
 
-  #SCHEDULE quasimaximal_slicing_axi IN MoL_PseudoEvolution {
-  #  LANG: C
-  #} "Quasimaximal slicing"
-
   STORAGE: W_mg[2]
   STORAGE: W_pred0
   STORAGE: W_pred1
diff --git a/src/qms.c b/src/qms.c
index cfad019..646c7b9 100644
--- a/src/qms.c
+++ b/src/qms.c
@@ -17,6 +17,8 @@
 #include "cctk_Timers.h"
 #include "util_Table.h"
 
+#include <mpi.h>
+
 #define SQR(x) ((x) * (x))
 #define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
 #define MAX(x, y) ((x) > (y) ? (x) : (y))
@@ -32,6 +34,7 @@ typedef struct CoordPatch {
     ptrdiff_t grid_size[3];
 
     MG2DContext *solver;
+    MPI_Comm solver_comm;
 
     int y_idx;
 
@@ -40,6 +43,22 @@ typedef struct CoordPatch {
 
     double *filter;
     ptrdiff_t filter_stride;
+
+    /* MPI sync for the initial guess */
+    int nb_components;
+    ptrdiff_t (*solver_start)[2];
+    size_t (*solver_size)[2];
+    double *u_guess;
+    ptrdiff_t u_guess_stride;
+    ptrdiff_t u_guess_start[2];
+    size_t u_guess_size[2];
+
+    int          *u_guess_sendcounts;
+    int          *u_guess_senddispl;
+    MPI_Datatype *u_guess_sendtypes;
+    int          *u_guess_recvcounts;
+    int          *u_guess_recvdispl;
+    MPI_Datatype *u_guess_recvtypes;
 } CoordPatch;
 
 typedef struct QMSMGContext {
@@ -138,6 +157,69 @@ static void log_callback(const MG2DContext *ctx, int level,
     fputs(buf, stderr);
 }
 
+/**
+ * Compute the refinement level geometry.
+ *
+ * The level is assumed to be a single square patch that is composed of
+ * nProcs components. The lower x and z boundaries are assumed to be
+ * reflection boundaries. For our purposes, both the physical outer boundaries
+ * and the refinement boundaries are equivalent, while the intercomponent
+ * boundaries are treated differently. Since Cactus only allows distinguishing
+ * between "physical" and "all other" boundaries, we need to compute the extents manually.
+ *
+ * @param pextents on return, points to a newly allocated array holding four
+ * numbers per component: the x/z indices of the component origin in the
+ * refinement level, followed by the x/z number of points owned by that
+ * component; when the component lies on the outer level boundary, this
+ * includes all the ghost zones up to that boundary. Must be freed by the caller.
+ *
+ * @param level_size will contain the number of points in x/z directions on the
+ * complete refinement level, including all the ghost zones up to the outer
+ * level boundary.
+ */
+static void get_extents(size_t **pextents, int *level_size, cGH *gh)
+{
+    int nb_procs   = CCTK_nProcs(gh);
+    int local_proc = CCTK_MyProc(gh);
+
+    size_t *extents;
+
+    extents = calloc(nb_procs, 4 * sizeof(*extents));
+    if (!extents)
+        CCTK_WARN(0, "Error allocating the extents");
+
+    for (int dir = 0; dir < 2; dir++) {
+        extents[4 * local_proc + 0 + dir] = gh->cctk_lbnd[2 * dir];
+        extents[4 * local_proc + 2 + dir] = gh->cctk_lsh[2 * dir] - gh->cctk_nghostzones[2 * dir];
+    }
+
+    if (nb_procs > 1)
+        MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, extents, 4 * sizeof(*extents), MPI_BYTE, MPI_COMM_WORLD);
+
+    level_size[0] = 0;
+    level_size[1] = 0;
+
+    for (int dir = 0; dir < 2; dir++) {
+        for (int comp = 0; comp < nb_procs; comp++) {
+            size_t start = extents[4 * comp + 0 + dir];
+            size_t size = extents[4 * comp + 2 + dir];
+            if (start + size > level_size[dir])
+                level_size[dir] = start + size;
+        }
+
+        // if the upper boundary is an inter-component boundary, subtract the ghost zones
+        for (int comp = 0; comp < nb_procs; comp++) {
+            size_t start = extents[4 * comp + 0 + dir];
+            size_t size = extents[4 * comp + 2 + dir];
+
+            if (start + size < level_size[dir])
+                extents[4 * comp + 2 + dir] -= gh->cctk_nghostzones[2 * dir];
+        }
+    }
+
+    *pextents = extents;
+}
+
 static CoordPatch *get_coord_patch(QMSMGContext *ms, int level)
 {
     cGH *gh = ms->gh;
@@ -146,11 +228,20 @@ static CoordPatch *get_coord_patch(QMSMGContext *ms, int level)
 
     const int integrator_substeps = MoLNumIntegratorSubsteps();
 
+    const int nb_procs   = CCTK_nProcs(gh);
+    const int local_proc = CCTK_MyProc(gh);
+
     const double *a_x = CCTK_VarDataPtr(gh, 0, "grid::x");
     const double *a_y = CCTK_VarDataPtr(gh, 0, "grid::y");
    const double *a_z = CCTK_VarDataPtr(gh, 0, "grid::z");
 
-    size_t solver_size[2];
+    size_t *extents;
+    size_t component_start[2], component_size[2];
+
+    int level_size[2];
+
+    int *solver_ranks;
+    int nb_solver_ranks;
 
     CoordPatch *cp;
     MG2DContext *solver;
@@ -182,24 +273,78 @@ static CoordPatch *get_coord_patch(QMSMGContext *ms, int level)
     if (i == gh->cctk_lsh[1])
         CCTK_WARN(0, "The grid does not include y==0");
 
+    get_extents(&extents, level_size, gh);
+    component_start[0] = extents[4 * local_proc + 0 + 0];
+    component_start[1] = extents[4 * local_proc + 0 + 1];
+    component_size[0] = extents[4 * local_proc + 2 + 0];
+    component_size[1] = extents[4 * local_proc + 2 + 1];
+
     for (int dir = 0; dir < 2; dir++) {
-        ptrdiff_t offset_right;
+        cp->bnd_intercomp[dir][0] = component_start[dir] > 0;
+        cp->bnd_intercomp[dir][1] = (component_start[dir] + component_size[dir]) < level_size[dir];
+        cp->offset_left[dir] = gh->cctk_nghostzones[2 * dir];
+    }
+
+    if (level_size[0] != level_size[1])
+        CCTK_WARN(0, "Non-square domain");
+
+    solver_ranks = calloc(nb_procs, sizeof(*solver_ranks));
+    if (!solver_ranks)
+        CCTK_WARN(0, "Error allocating solver ranks");
+
+    cp->nb_components = nb_procs;
+    cp->solver_size  = calloc(nb_procs, sizeof(*cp->solver_size));
+    cp->solver_start = calloc(nb_procs, sizeof(*cp->solver_start));
+    if (!cp->solver_size || !cp->solver_start)
+        CCTK_WARN(0, "Error allocating solver sizes/starts");
+
+    nb_solver_ranks = 0;
+    for (int proc = 0; proc < nb_procs; proc++) {
+        for (int dir = 0; dir < 2; dir++) {
+            size_t start = extents[4 * proc + 0 + dir];
+            size_t end = start + extents[4 * proc + 2 + dir];
+            ptrdiff_t offset_right;
+
+            offset_right = gh->cctk_nghostzones[2 * dir];
+            if (cp->level > 0) {
+                offset_right *= integrator_substeps * 2;
+                offset_right -= ms->boundary_offset;
+            }
 
-        cp->offset_left[dir] = gh->cctk_nghostzones[2 * dir];
+            end = MIN(end, level_size[dir] - offset_right);
 
-        offset_right = gh->cctk_nghostzones[2 * dir];
-        if (cp->level > 0) {
-            offset_right *= integrator_substeps * 2;
-            offset_right -= ms->boundary_offset;
+            if (end > start) {
+                cp->solver_start[proc][dir] = start;
+                cp->solver_size[proc][dir] = end - start;
+            } else {
+                cp->solver_start[proc][dir] = 0;
+                cp->solver_size[proc][dir] = 0;
+            }
         }
 
-        solver_size[dir] = cp->grid_size[2 * dir] - cp->offset_left[dir] - offset_right;
+        if (cp->solver_size[proc][0] > 0 && cp->solver_size[proc][1] > 0)
+            solver_ranks[nb_solver_ranks++] = proc;
     }
 
-    if (solver_size[0] != solver_size[1])
-        CCTK_WARN(0, "Non-square domain");
+    if (nb_solver_ranks > 1) {
+        MPI_Group grp_world, grp_solver;
+
+        MPI_Comm_group(MPI_COMM_WORLD, &grp_world);
+        MPI_Group_incl(grp_world, nb_solver_ranks, solver_ranks, &grp_solver);
+        MPI_Comm_create(MPI_COMM_WORLD, grp_solver, &cp->solver_comm);
+        MPI_Group_free(&grp_solver);
+        MPI_Group_free(&grp_world);
+    } else {
+        cp->solver_comm = MPI_COMM_NULL;
+    }
+
+    if (!(cp->solver_size[local_proc][0] > 0 && cp->solver_size[local_proc][1] > 0))
+        goto finish;
+
+    if (cp->solver_comm == MPI_COMM_NULL)
+        solver = mg2d_solver_alloc(cp->solver_size[local_proc][0]);
+    else
+        solver = mg2d_solver_alloc_mpi(cp->solver_comm, component_start, cp->solver_size[local_proc]);
 
-    solver = mg2d_solver_alloc(solver_size[0]);
     if (!solver)
         CCTK_WARN(0, "Error allocating the solver");
 
@@ -237,7 +382,7 @@ static CoordPatch *get_coord_patch(QMSMGContext *ms, int level)
         int ci = mg2d_bnd_coord_idx(i);
         for (size_t j = 0; j < solver->fd_stencil; j++) {
             memset(solver->boundaries[i]->val + j * solver->boundaries[i]->val_stride - j,
-                   0, sizeof(*solver->boundaries[i]->val) * (solver_size[!ci] + 2 * j));
+                   0, sizeof(*solver->boundaries[i]->val) * (solver->local_size[!ci] + 2 * j));
         }
     }
 
@@ -247,15 +392,15 @@ static CoordPatch *get_coord_patch(QMSMGContext *ms, int level)
     }
 
     if ((solver->domain_size - 1) * solver->step[0] >= ms->filter_start) {
-        cp->filter_stride = solver->domain_size;
-        cp->filter = calloc(solver->domain_size * cp->filter_stride, sizeof(*cp->filter));
+        cp->filter_stride = solver->local_size[0];
+        cp->filter = calloc(solver->local_size[1] * cp->filter_stride, sizeof(*cp->filter));
         if (!cp->filter)
             CCTK_WARN(0, "Error allocating the filter");
 
-        for (int idx_z = 0; idx_z < solver->domain_size; idx_z++) {
-            const double z = solver->step[1] * idx_z;
-            for (int idx_x = 0; idx_x < solver->domain_size; idx_x++) {
-                const double x = solver->step[0] * idx_x;
+        for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) {
+            const double z = solver->step[1] * (solver->local_start[1] + idx_z);
+            for (int idx_x = 0; idx_x < solver->local_size[0]; idx_x++) {
+                const double x = solver->step[0] * (solver->local_start[0] + idx_x);
                 const double r = sqrt(SQR(x) + SQR(z));
 
                 double fact = exp(-36.0 * (pow((r - ms->filter_start) / (ms->filter_end - ms->filter_start), 6.0)));
@@ -266,6 +411,10 @@ static CoordPatch *get_coord_patch(QMSMGContext *ms, int level)
         }
     }
 
+finish:
+    free(solver_ranks);
+    free(extents);
+
     ms->nb_patches++;
     return cp;
 }
@@ -274,6 +423,9 @@ static void coord_patch_free(CoordPatch *cp)
 {
     mg2d_solver_free(&cp->solver);
 
+    if (cp->solver_comm != MPI_COMM_NULL)
+        MPI_Comm_free(&cp->solver_comm);
+
     free(cp->filter);
 }
 
@@ -971,6 +1123,144 @@ static void solution_to_grid(CoordPatch *cp, MG2DContext *solver, double *dst)
     }
 }
 
+typedef struct Rect {
+    ptrdiff_t start[2];
+    size_t size[2];
+} Rect;
+
+static int rect_intersect(Rect *dst, const Rect *src1, const Rect *src2)
+{
+    ptrdiff_t intersect_start0 = MAX(src1->start[0], src2->start[0]);
+    ptrdiff_t intersect_start1 = MAX(src1->start[1], src2->start[1]);
+    ptrdiff_t intersect_end0 = MIN(src1->start[0] + src1->size[0], src2->start[0] + src2->size[0]);
+    ptrdiff_t intersect_end1 = MIN(src1->start[1] + src1->size[1], src2->start[1] + src2->size[1]);
+
+    if (intersect_start0 < intersect_end0 && intersect_start1 < intersect_end1) {
+        dst->start[0] = intersect_start0;
+        dst->start[1] = intersect_start1;
+        dst->size[0] = intersect_end0 - intersect_start0;
+        dst->size[1] = intersect_end1 - intersect_start1;
+
+        return 1;
+    }
+
+    dst->size[0] = 0;
+    dst->size[1] = 0;
+
+    return 0;
+}
+
+static int guess_from_coarser(QMSMGContext *ms, CoordPatch *cp, CoordPatch *cp_coarse)
+{
+    int ret;
+
+    if (cp->nb_components > 1) {
+        int comp_fine, comp_coarse = 0;
+
+        // FIXME skip levels with skipped components,
+        // those require somewhat more complex logic to implement
+        MPI_Comm_size(cp->solver_comm, &comp_fine);
+        if (cp_coarse->solver_comm != MPI_COMM_NULL)
+            MPI_Comm_size(cp_coarse->solver_comm, &comp_coarse);
+        if (comp_fine != cp->nb_components || comp_coarse != cp->nb_components)
+            return 0;
+
+        if (!cp->u_guess) {
+            const int ghosts = cp->solver->fd_stencil + 1;
+            int local_proc = CCTK_MyProc(ms->gh);
+
+            cp->u_guess_sendcounts = calloc(cp->nb_components, sizeof(*cp->u_guess_sendcounts));
+            cp->u_guess_senddispl  = calloc(cp->nb_components, sizeof(*cp->u_guess_senddispl));
+            cp->u_guess_sendtypes  = calloc(cp->nb_components, sizeof(*cp->u_guess_sendtypes));
+            cp->u_guess_recvcounts = calloc(cp->nb_components, sizeof(*cp->u_guess_recvcounts));
+            cp->u_guess_recvdispl  = calloc(cp->nb_components, sizeof(*cp->u_guess_recvdispl));
+            cp->u_guess_recvtypes  = calloc(cp->nb_components, sizeof(*cp->u_guess_recvtypes));
+            if (!cp->u_guess_sendcounts || !cp->u_guess_senddispl || !cp->u_guess_sendtypes ||
+                !cp->u_guess_recvcounts || !cp->u_guess_recvdispl || !cp->u_guess_recvtypes)
+                return -ENOMEM;
+
+            cp->u_guess_size[0] = cp->solver_size[local_proc][0] / 2 + 2 * ghosts;
+            cp->u_guess_size[1] = cp->solver_size[local_proc][1] / 2 + 2 * ghosts;
+            cp->u_guess_start[0] = cp->solver_start[local_proc][0] / 2 - ghosts;
+            cp->u_guess_start[1] = cp->solver_start[local_proc][1] / 2 - ghosts;
+            cp->u_guess_stride = cp->u_guess_size[0];
+            cp->u_guess = calloc(cp->u_guess_stride * cp->u_guess_size[1],
+                                 sizeof(*cp->u_guess));
+            if (!cp->u_guess)
+                return -ENOMEM;
+
+            for (int comp_fine = 0; comp_fine < cp->nb_components; comp_fine++) {
+                Rect dst_fine = {
+                    .start = { cp->solver_start[comp_fine][0], cp->solver_start[comp_fine][1]},
+                    .size  = { cp->solver_size[comp_fine][0], cp->solver_size[comp_fine][1]},
+                };
+                Rect dst_coarse = {
+                    .start = { dst_fine.start[0] / 2 - ghosts, dst_fine.start[1] / 2 - ghosts },
+                    .size  = { dst_fine.size[0] / 2 + ghosts * 2, dst_fine.size[1] / 2 + ghosts * 2 },
+                };
+
+                for (int comp_coarse = 0; comp_coarse < cp->nb_components; comp_coarse++) {
+                    Rect overlap;
+                    Rect src = {
+                        .start = { cp_coarse->solver_start[comp_coarse][0], cp_coarse->solver_start[comp_coarse][1]},
+                        .size  = { cp_coarse->solver_size[comp_coarse][0], cp_coarse->solver_size[comp_coarse][1]},
+                    };
+                    for (int dim = 0; dim < 2; dim++) {
+                        if (src.start[dim] == 0) {
+                            src.start[dim] -= ghosts;
+                            src.size[dim] += ghosts;
+                        }
+                    }
+
+                    rect_intersect(&overlap, &dst_coarse, &src);
+                    if (comp_fine == local_proc) {
+                        if (overlap.size[0] > 0 && overlap.size[1] > 0) {
+                            cp->u_guess_recvcounts[comp_coarse] = 1;
+                            cp->u_guess_recvdispl[comp_coarse] = ((overlap.start[1] - dst_coarse.start[1]) * cp->u_guess_stride +
+                                                                  (overlap.start[0] - dst_coarse.start[0])) * sizeof(double);
+                            MPI_Type_vector(overlap.size[1], overlap.size[0], cp->u_guess_stride,
+                                            MPI_DOUBLE, &cp->u_guess_recvtypes[comp_coarse]);
+                            MPI_Type_commit(&cp->u_guess_recvtypes[comp_coarse]);
+                        } else {
+                            cp->u_guess_recvcounts[comp_coarse] = 0;
+                            cp->u_guess_recvdispl[comp_coarse] = 0;
+                            MPI_Type_dup(MPI_BYTE, &cp->u_guess_recvtypes[comp_coarse]);
+                        }
+                    }
+                    if (comp_coarse == local_proc) {
+                        if (overlap.size[0] > 0 && overlap.size[1] > 0) {
+                            ptrdiff_t src_origin[2] = { cp_coarse->solver_start[comp_coarse][0], cp_coarse->solver_start[comp_coarse][1]};
+                            cp->u_guess_sendcounts[comp_fine] = 1;
+                            cp->u_guess_senddispl[comp_fine] = ((overlap.start[1] - src_origin[1]) * cp_coarse->solver->u_stride +
+                                                                (overlap.start[0] - src_origin[0])) * sizeof(double);
+                            MPI_Type_vector(overlap.size[1], overlap.size[0], cp_coarse->solver->u_stride,
+                                            MPI_DOUBLE, &cp->u_guess_sendtypes[comp_fine]);
+                            MPI_Type_commit(&cp->u_guess_sendtypes[comp_fine]);
+                        } else {
+                            cp->u_guess_sendcounts[comp_fine] = 0;
+                            cp->u_guess_senddispl[comp_fine] = 0;
+                            MPI_Type_dup(MPI_BYTE, &cp->u_guess_sendtypes[comp_fine]);
+                        }
+                    }
+                }
+            }
+        }
+
+        MPI_Alltoallw(cp_coarse->solver->u, cp->u_guess_sendcounts, cp->u_guess_senddispl, cp->u_guess_sendtypes,
+                      cp->u_guess, cp->u_guess_recvcounts, cp->u_guess_recvdispl, cp->u_guess_recvtypes,
+                      MPI_COMM_WORLD);
+        ret = mg2d_init_guess(cp->solver,
+                              cp->u_guess, cp->u_guess_stride,
+                              cp->u_guess_start, cp->u_guess_size, cp_coarse->solver->step);
+    } else {
+        ret = mg2d_init_guess(cp->solver, cp_coarse->solver->u, cp_coarse->solver->u_stride,
+                              cp_coarse->solver->local_start, cp_coarse->solver->local_size, cp_coarse->solver->step);
+    }
+
+    return ret;
+}
+
+
 void qms_mg_solve(CCTK_ARGUMENTS)
 {
     QMSMGContext *ms = qms_context;
@@ -1039,8 +1329,7 @@ void qms_mg_solve(CCTK_ARGUMENTS)
 
         /* use the solution from the coarser level as the initial guess */
         CoordPatch *cp_coarse = get_coord_patch(ms, reflevel - 1);
-        ret = mg2d_init_guess(cp->solver, cp_coarse->solver->u, cp_coarse->solver->u_stride,
-                              cp_coarse->solver->local_start, cp_coarse->solver->local_size, cp_coarse->solver->step);
+        ret = guess_from_coarser(ms, cp, cp_coarse);
        if (ret < 0)
            CCTK_WARN(0, "Error setting the initial guess");
     }
-- 
cgit v1.2.3
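
Note on the exchange used by guess_from_coarser() above: it is an instance of a generic MPI pattern, one derived datatype per peer component plus a single MPI_Alltoallw call, with zero counts and dummy MPI_BYTE types for peers that contribute nothing. The self-contained sketch below shows only that pattern in isolation; every name in it (exchange_blocks, src_off, dst_off, sizes, nb_peers) is made up for illustration and is not part of the patch, and it assumes row-major arrays of doubles with strides and offsets given in elements.

/* Illustrative sketch only, not part of the patch: exchange one rectangular
 * sub-block of a strided 2D double array with each peer in comm, using one
 * MPI_Type_vector per peer and a single MPI_Alltoallw call.
 * nb_peers must equal the size of comm; error handling is omitted. */
#include <stdlib.h>
#include <mpi.h>

static void exchange_blocks(MPI_Comm comm, int nb_peers,
                            const double *src, int src_stride,
                            double *dst, int dst_stride,
                            const int (*src_off)[2], const int (*dst_off)[2],
                            const int (*sizes)[2])
{
    int          *scounts = calloc(nb_peers, sizeof(*scounts));
    int          *rcounts = calloc(nb_peers, sizeof(*rcounts));
    int          *sdispl  = calloc(nb_peers, sizeof(*sdispl));
    int          *rdispl  = calloc(nb_peers, sizeof(*rdispl));
    MPI_Datatype *stypes  = malloc(nb_peers * sizeof(*stypes));
    MPI_Datatype *rtypes  = malloc(nb_peers * sizeof(*rtypes));

    for (int p = 0; p < nb_peers; p++) {
        if (sizes[p][0] > 0 && sizes[p][1] > 0) {
            /* a sizes[p][1] x sizes[p][0] block out of a strided row-major array */
            MPI_Type_vector(sizes[p][1], sizes[p][0], src_stride, MPI_DOUBLE, &stypes[p]);
            MPI_Type_vector(sizes[p][1], sizes[p][0], dst_stride, MPI_DOUBLE, &rtypes[p]);
            MPI_Type_commit(&stypes[p]);
            MPI_Type_commit(&rtypes[p]);
            scounts[p] = rcounts[p] = 1;
            /* MPI_Alltoallw displacements are in bytes */
            sdispl[p] = (src_off[p][1] * src_stride + src_off[p][0]) * sizeof(double);
            rdispl[p] = (dst_off[p][1] * dst_stride + dst_off[p][0]) * sizeof(double);
        } else {
            /* nothing exchanged with this peer: zero count, dummy type */
            MPI_Type_dup(MPI_BYTE, &stypes[p]);
            MPI_Type_dup(MPI_BYTE, &rtypes[p]);
        }
    }

    MPI_Alltoallw(src, scounts, sdispl, stypes,
                  dst, rcounts, rdispl, rtypes, comm);

    for (int p = 0; p < nb_peers; p++) {
        MPI_Type_free(&stypes[p]);
        MPI_Type_free(&rtypes[p]);
    }
    free(scounts); free(rcounts); free(sdispl); free(rdispl);
    free(stypes);  free(rtypes);
}

A single collective with per-peer datatypes avoids explicit pack/unpack buffers and per-peer point-to-point bookkeeping, which is the same trade-off the patch makes for the u_guess exchange.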