From fe88665e50aac0cdecc6d0b1e27a58bbdf937950 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 26 Jun 2019 17:47:06 +0200
Subject: Make initial guess interpolation work with MPI.

---
 src/maximal_slicing_axi_mg.c | 222 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 202 insertions(+), 20 deletions(-)

diff --git a/src/maximal_slicing_axi_mg.c b/src/maximal_slicing_axi_mg.c
index 9d32120..d9d1508 100644
--- a/src/maximal_slicing_axi_mg.c
+++ b/src/maximal_slicing_axi_mg.c
@@ -22,6 +22,7 @@
 #define SQR(x) ((x) * (x))
 #define ABS(x) ((x >= 0) ? (x) : -(x))
 #define MIN(x, y) ((x) > (y) ? (y) : (x))
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
 #define ARRAY_ELEMS(x) (sizeof(x) / sizeof(*x))
 
 #define CPINDEX(cp, i, j, k) ((k * cp->grid_size[1] + j) * cp->grid_size[0] + i)
@@ -75,6 +76,9 @@ typedef struct CoordPatch {
 
     ptrdiff_t y_idx;
 
+    size_t *extents;
+    int  nb_extents;
+
     /* number of x/z grid points by which the elliptic solver domain is offset
      * from the cactus grid */
     ptrdiff_t offset_left[2];
@@ -89,6 +93,21 @@ typedef struct CoordPatch {
     int          *recvcounts;
     int          *recvdispl;
     MPI_Datatype *recvtypes;
+
+    /* MPI sync for the initial guess */
+    ptrdiff_t (*solver_start)[2];
+    size_t    (*solver_size)[2];
+    double   *u_guess;
+    ptrdiff_t u_guess_stride;
+    ptrdiff_t u_guess_start[2];
+    size_t    u_guess_size[2];
+
+    int          *u_guess_sendcounts;
+    int          *u_guess_senddispl;
+    MPI_Datatype *u_guess_sendtypes;
+    int          *u_guess_recvcounts;
+    int          *u_guess_recvdispl;
+    MPI_Datatype *u_guess_recvtypes;
 } CoordPatch;
 
 typedef struct MSMGContext {
@@ -173,6 +192,8 @@ static void coord_patch_free(CoordPatch *cp)
     cp->solver_eval    = NULL;
     cp->nb_solver_eval = 0;
 
+    free(cp->extents);
+
     if (cp->synctype)
         MPI_Type_free(&cp->synctype);
     free(cp->sendcounts);
@@ -181,6 +202,26 @@ static void coord_patch_free(CoordPatch *cp)
     free(cp->recvcounts);
     free(cp->recvdispl);
     free(cp->recvtypes);
+
+    if (cp->u_guess_sendtypes) {
+        for (int i = 0; i < cp->nb_components; i++)
+            if (cp->u_guess_sendtypes[i] != MPI_DATATYPE_NULL)
+                MPI_Type_free(&cp->u_guess_sendtypes[i]);
+    }
+    if (cp->u_guess_recvtypes) {
+        for (int i = 0; i < cp->nb_components; i++)
+            if (cp->u_guess_recvtypes[i] != MPI_DATATYPE_NULL)
+                MPI_Type_free(&cp->u_guess_recvtypes[i]);
+    }
+    free(cp->u_guess);
+    free(cp->u_guess_sendcounts);
+    free(cp->u_guess_senddispl);
+    free(cp->u_guess_sendtypes);
+    free(cp->u_guess_recvcounts);
+    free(cp->u_guess_recvdispl);
+    free(cp->u_guess_recvtypes);
+    free(cp->solver_size);
+    free(cp->solver_start);
 }
 
 static void log_callback(const MG2DContext *ctx, int level,
@@ -342,7 +383,7 @@ static CoordPatch *get_coord_patch(MSMGContext *ms, int level)
     CoordPatch *cp;
 
     size_t *extents;
-    size_t interior_size[2], component_start[2], component_size[2];
+    size_t component_start[2], component_size[2];
 
     int *solver_ranks;
     int nb_solver_ranks;
@@ -396,6 +437,12 @@ static CoordPatch *get_coord_patch(MSMGContext *ms, int level)
     if (!solver_ranks)
         CCTK_WARN(0, "Error allocating solver ranks");
 
+    cp->nb_components = nb_procs;
+    cp->solver_size  = calloc(nb_procs, sizeof(*cp->solver_size));
+    cp->solver_start = calloc(nb_procs, sizeof(*cp->solver_start));
+    if (!cp->solver_size || !cp->solver_start)
+        CCTK_WARN(0, "Error allocating solver sizes/starts");
+
     /* on the finest level we allocate a solver for each of the substeps
      * between the coarser-grid timelevels */
     if (level == ms->nb_levels - 1) {
@@ -470,7 +517,6 @@ static CoordPatch *get_coord_patch(MSMGContext *ms, int level)
             if (!cp->sendcounts || !cp->senddispl || !cp->sendtypes ||
                 !cp->recvcounts || !cp->recvdispl || !cp->recvtypes)
                 CCTK_WARN(0, "Error allocating sync arrays");
-            cp->nb_components = nb_procs;
 
             MPI_Type_vector(gz, gx, ms->gh->cctk_lsh[0], MPI_DOUBLE, &cp->synctype);
             MPI_Type_commit(&cp->synctype);
@@ -599,19 +645,17 @@ static CoordPatch *get_coord_patch(MSMGContext *ms, int level)
                 offset_right--;
 
             end = MIN(end, level_size[dir] - offset_right);
-            if (end > start)
-                size[dir] = end - start;
-            else
-                size[dir] = 0;
+            if (end > start) {
+                cp->solver_start[proc][dir] = start;
+                cp->solver_size[proc][dir]  = end - start;
+            } else {
+                cp->solver_start[proc][dir] = 0;
+                cp->solver_size[proc][dir]  = 0;
+            }
         }
 
-        if (size[0] > 0 && size[1] > 0)
+        if (cp->solver_size[proc][0] > 0 && cp->solver_size[proc][1] > 0)
             solver_ranks[nb_solver_ranks++] = proc;
-
-        if (proc == local_proc) {
-            interior_size[0] = size[0];
-            interior_size[1] = size[1];
-        }
     }
 
     if (nb_solver_ranks > 1) {
@@ -625,15 +669,17 @@ static CoordPatch *get_coord_patch(MSMGContext *ms, int level)
     } else
         cp->solver_comm = MPI_COMM_NULL;
 
-    if (interior_size[0] > 0 && interior_size[1] > 0) {
-        cp->solver = solver_alloc(ms, level, component_start, interior_size,
+    if (cp->solver_size[local_proc][0] > 0 && cp->solver_size[local_proc][1] > 0) {
+        cp->solver = solver_alloc(ms, level, cp->solver_start[local_proc], cp->solver_size[local_proc],
                                   a_x[1] - a_x[0], cp->solver_comm);
         if (!cp->solver)
             CCTK_WARN(0, "Error allocating the solver");
     }
 
     free(solver_ranks);
-    free(extents);
+
+    cp->extents    = extents;
+    cp->nb_extents = nb_procs;
 
     ms->nb_patches++;
     return cp;
@@ -1104,6 +1150,143 @@ finish:
     ms->count_eval++;
 }
 
+typedef struct Rect {
+    ptrdiff_t start[2];
+    size_t    size[2];
+} Rect;
+
+static int rect_intersect(Rect *dst, const Rect *src1, const Rect *src2)
+{
+    ptrdiff_t intersect_start0 = MAX(src1->start[0], src2->start[0]);
+    ptrdiff_t intersect_start1 = MAX(src1->start[1], src2->start[1]);
+    ptrdiff_t intersect_end0   = MIN(src1->start[0] + src1->size[0], src2->start[0] + src2->size[0]);
+    ptrdiff_t intersect_end1   = MIN(src1->start[1] + src1->size[1], src2->start[1] + src2->size[1]);
+
+    if (intersect_start0 < intersect_end0 && intersect_start1 < intersect_end1) {
+        dst->start[0] = intersect_start0;
+        dst->start[1] = intersect_start1;
+        dst->size[0]  = intersect_end0 - intersect_start0;
+        dst->size[1]  = intersect_end1 - intersect_start1;
+
+        return 1;
+    }
+
+    dst->size[0] = 0;
+    dst->size[1] = 0;
+
+    return 0;
+}
+
+static int guess_from_coarser(MSMGContext *ms, CoordPatch *cp, CoordPatch *cp_coarse)
+{
+    int ret;
+
+    if (cp->nb_components > 1) {
+        int comp_fine, comp_coarse = 0;
+
+        // FIXME skip levels with skipped components,
+        // those require somewhat more complex logic to implement
+        MPI_Comm_size(cp->solver_comm, &comp_fine);
+        if (cp_coarse->solver_comm != MPI_COMM_NULL)
+            MPI_Comm_size(cp_coarse->solver_comm, &comp_coarse);
+        if (comp_fine != cp->nb_components || comp_coarse != cp->nb_components)
+            return 0;
+
+        if (!cp->u_guess) {
+            const int ghosts = cp->solver->fd_stencil + 1;
+            int local_proc = CCTK_MyProc(ms->gh);
+
+            cp->u_guess_sendcounts = calloc(cp->nb_components, sizeof(*cp->u_guess_sendcounts));
+            cp->u_guess_senddispl  = calloc(cp->nb_components, sizeof(*cp->u_guess_senddispl));
+            cp->u_guess_sendtypes  = calloc(cp->nb_components, sizeof(*cp->u_guess_sendtypes));
+            cp->u_guess_recvcounts = calloc(cp->nb_components, sizeof(*cp->u_guess_recvcounts));
+            cp->u_guess_recvdispl  = calloc(cp->nb_components, sizeof(*cp->u_guess_recvdispl));
+            cp->u_guess_recvtypes  = calloc(cp->nb_components, sizeof(*cp->u_guess_recvtypes));
+            if (!cp->u_guess_sendcounts || !cp->u_guess_senddispl || !cp->u_guess_sendtypes ||
+                !cp->u_guess_recvcounts || !cp->u_guess_recvdispl || !cp->u_guess_recvtypes)
+                return -ENOMEM;
+
+            cp->u_guess_size[0] = cp->solver_size[local_proc][0] / 2 + 2 * ghosts;
+            cp->u_guess_size[1] = cp->solver_size[local_proc][1] / 2 + 2 * ghosts;
+            cp->u_guess_start[0] = cp->solver_start[local_proc][0] / 2 - ghosts;
+            cp->u_guess_start[1] = cp->solver_start[local_proc][1] / 2 - ghosts;
+            cp->u_guess_stride  = cp->u_guess_size[0];
+            cp->u_guess= calloc(cp->u_guess_stride * cp->u_guess_size[1],
+                                sizeof(*cp->u_guess));
+            if (!cp->u_guess)
+                return -ENOMEM;
+
+            for (int comp_fine = 0; comp_fine < cp->nb_components; comp_fine++) {
+                Rect dst_fine = {
+                    .start = { cp->solver_start[comp_fine][0], cp->solver_start[comp_fine][1]},
+                    .size  = { cp->solver_size[comp_fine][0],  cp->solver_size[comp_fine][1]},
+                };
+                Rect dst_coarse = {
+                    .start = { dst_fine.start[0] / 2 - ghosts,     dst_fine.start[1] / 2 - ghosts },
+                    .size  = { dst_fine.size[0]  / 2 + ghosts * 2, dst_fine.size[1] / 2 + ghosts * 2 },
+                };
+
+                for (int comp_coarse = 0; comp_coarse < cp->nb_components; comp_coarse++) {
+                    Rect overlap;
+                    Rect src = {
+                        .start = { cp_coarse->solver_start[comp_coarse][0], cp_coarse->solver_start[comp_coarse][1]},
+                        .size  = { cp_coarse->solver_size[comp_coarse][0],  cp_coarse->solver_size[comp_coarse][1]},
+                    };
+                    for (int dim = 0; dim < 2; dim++) {
+                        if (src.start[dim] == 0) {
+                            src.start[dim] -= ghosts;
+                            src.size[dim]  += ghosts;
+                        }
+                    }
+
+                    rect_intersect(&overlap, &dst_coarse, &src);
+                    if (comp_fine == local_proc) {
+                        if (overlap.size[0] > 0 && overlap.size[1] > 0) {
+                            cp->u_guess_recvcounts[comp_coarse] = 1;
+                            cp->u_guess_recvdispl[comp_coarse]  = ((overlap.start[1] - dst_coarse.start[1]) * cp->u_guess_stride +
+                                                                   (overlap.start[0] - dst_coarse.start[0])) * sizeof(double);
+                            MPI_Type_vector(overlap.size[1], overlap.size[0], cp->u_guess_stride,
+                                            MPI_DOUBLE, &cp->u_guess_recvtypes[comp_coarse]);
+                            MPI_Type_commit(&cp->u_guess_recvtypes[comp_coarse]);
+                        } else {
+                            cp->u_guess_recvcounts[comp_coarse] = 0;
+                            cp->u_guess_recvdispl[comp_coarse]  = 0;
+                            cp->u_guess_recvtypes[comp_coarse]  = MPI_BYTE;
+                        }
+                    }
+                    if (comp_coarse == local_proc) {
+                        if (overlap.size[0] > 0 && overlap.size[1] > 0) {
+                            ptrdiff_t src_origin[2] = { cp_coarse->solver_start[comp_coarse][0], cp_coarse->solver_start[comp_coarse][1]};
+                            cp->u_guess_sendcounts[comp_fine] = 1;
+                            cp->u_guess_senddispl[comp_fine]  = ((overlap.start[1] - src_origin[1]) * cp_coarse->solver->u_stride +
+                                                                 (overlap.start[0] - src_origin[0])) * sizeof(double);
+                            MPI_Type_vector(overlap.size[1], overlap.size[0], cp_coarse->solver->u_stride,
+                                            MPI_DOUBLE, &cp->u_guess_sendtypes[comp_fine]);
+                            MPI_Type_commit(&cp->u_guess_sendtypes[comp_fine]);
+                        } else {
+                            cp->u_guess_sendcounts[comp_fine] = 0;
+                            cp->u_guess_senddispl[comp_fine]  = 0;
+                            cp->u_guess_sendtypes[comp_fine]  = MPI_BYTE;
+                        }
+                    }
+                }
+            }
+        }
+
+        MPI_Alltoallw(cp_coarse->solver->u, cp->u_guess_sendcounts, cp->u_guess_senddispl, cp->u_guess_sendtypes,
+                      cp->u_guess,     cp->u_guess_recvcounts, cp->u_guess_recvdispl, cp->u_guess_recvtypes,
+                      MPI_COMM_WORLD);
+        ret = mg2d_init_guess(cp->solver,
+                              cp->u_guess, cp->u_guess_stride,
+                              cp->u_guess_start, cp->u_guess_size, cp_coarse->solver->step);
+    } else {
+        ret = mg2d_init_guess(cp->solver, cp_coarse->solver->u, cp_coarse->solver->u_stride,
+                              cp_coarse->solver->local_start, cp_coarse->solver->local_size, cp_coarse->solver->step);
+    }
+
+    return ret;
+}
+
 void msa_mg_solve(CCTK_ARGUMENTS)
 {
     DECLARE_CCTK_ARGUMENTS;
@@ -1194,12 +1377,11 @@ void msa_mg_solve(CCTK_ARGUMENTS)
             }
         } else {
             /* use the solution from the coarser level as the initial guess */
-            //CoordPatch *cp1 = get_coord_patch(ms, reflevel - 1);
+            CoordPatch *cp1 = get_coord_patch(ms, reflevel - 1);
 
-            //ret = mg2d_init_guess(cp->solver, cp1->solver->u, cp1->solver->u_stride,
-            //                      cp1->solver->local_size, cp1->solver->step);
-            //if (ret < 0)
-            //    CCTK_WARN(0, "Error setting the initial guess");
+            ret = guess_from_coarser(ms, cp, cp1);
+            if (ret < 0)
+                CCTK_WARN(0, "Error setting the initial guess");
         }
 
         /* if the condition above was false, then lapse_mg should be filled by
-- 
cgit v1.2.3