From ba38a083ec6bd6bc4d421b09c5d501881aa5c4b8 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 27 Aug 2022 01:56:02 +0200 Subject: Implement coarse-level initial guess for MPI. --- schedule.ccl | 4 ++ src/qms.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 166 insertions(+), 17 deletions(-) diff --git a/schedule.ccl b/schedule.ccl index 8072632..1cc7236 100644 --- a/schedule.ccl +++ b/schedule.ccl @@ -25,6 +25,10 @@ if (CCTK_EQUALS(lapse_source, "QMS_MG")) { SCHEDULE qms_mg_sync IN CCTK_POSTSTEP BEFORE qms_mg_solve { SYNC: W_val + SYNC: W_pred0 + SYNC: W_pred1 + SYNC: W_val0 + SYNC: W_val1 LANG: C } "" diff --git a/src/qms.c b/src/qms.c index aab5394..ab803c2 100644 --- a/src/qms.c +++ b/src/qms.c @@ -73,6 +73,19 @@ typedef struct CoordPatch { int *u_guess_recvcounts; int *u_guess_recvdispl; MPI_Datatype *u_guess_recvtypes; + + /* MPI sync for the coarser-level prediction */ + double *interp_tmp_sync; + ptrdiff_t interp_tmp_stride; + ptrdiff_t interp_tmp_start[2]; + size_t interp_tmp_size[2]; + + int *interp_tmp_sendcounts; + int *interp_tmp_senddispl; + MPI_Datatype *interp_tmp_sendtypes; + int *interp_tmp_recvcounts; + int *interp_tmp_recvdispl; + MPI_Datatype *interp_tmp_recvtypes; } CoordPatch; typedef struct QMSMGContext { @@ -433,6 +446,15 @@ static void coord_patch_free(CoordPatch **pcp) free(cp->interp_tmp0); free(cp->interp_tmp1); + + free(cp->interp_tmp_sync); + free(cp->interp_tmp_sendcounts); + free(cp->interp_tmp_senddispl); + free(cp->interp_tmp_sendtypes); + free(cp->interp_tmp_recvcounts); + free(cp->interp_tmp_recvdispl); + free(cp->interp_tmp_recvtypes); + free(cp->extents); free(cp); @@ -1350,6 +1372,121 @@ static int guess_from_coarser(QMSMGContext *ms, CoordPatch *cp, CoordPatch *cp_c return ret; } +static int interp_coarse(QMSMGContext *ms, CoordPatch *cp, double *dst, + CoordPatch *cp_coarse, const double *src) +{ + int ret; + + if (cp->nb_components > 1) { + const int local_proc = CCTK_MyProc(ms->gh); + int comp_fine, comp_coarse = 1; + + MPI_Comm_size(cp->solver_comm, &comp_fine); + if (cp_coarse->solver_comm != MPI_COMM_NULL) + MPI_Comm_size(cp_coarse->solver_comm, &comp_coarse); + + qms_assert(comp_fine == cp->nb_components && comp_coarse == cp->nb_components); + + if (!cp->interp_tmp_sync) { + const int ghosts = ms->gh->cctk_nghostzones[0]; + + cp->interp_tmp_sendcounts = calloc(cp->nb_components, sizeof(*cp->interp_tmp_sendcounts)); + cp->interp_tmp_senddispl = calloc(cp->nb_components, sizeof(*cp->interp_tmp_senddispl)); + cp->interp_tmp_sendtypes = calloc(cp->nb_components, sizeof(*cp->interp_tmp_sendtypes)); + cp->interp_tmp_recvcounts = calloc(cp->nb_components, sizeof(*cp->interp_tmp_recvcounts)); + cp->interp_tmp_recvdispl = calloc(cp->nb_components, sizeof(*cp->interp_tmp_recvdispl)); + cp->interp_tmp_recvtypes = calloc(cp->nb_components, sizeof(*cp->interp_tmp_recvtypes)); + if (!cp->interp_tmp_sendcounts || !cp->interp_tmp_senddispl || !cp->interp_tmp_sendtypes || + !cp->interp_tmp_recvcounts || !cp->interp_tmp_recvdispl || !cp->interp_tmp_recvtypes) + return -ENOMEM; + + cp->interp_tmp_start[0] = (ptrdiff_t)cp->extents[4 * local_proc + 0] / 2 - ghosts; + cp->interp_tmp_start[1] = (ptrdiff_t)cp->extents[4 * local_proc + 1] / 2 - ghosts; + cp->interp_tmp_size[0] = cp->extents[4 * local_proc + 2] / 2 + 2 * ghosts; + cp->interp_tmp_size[1] = cp->extents[4 * local_proc + 3] / 2 + 2 * ghosts; + cp->interp_tmp_stride = cp->interp_tmp_size[0]; + + cp->interp_tmp_sync = calloc(cp->interp_tmp_stride * cp->interp_tmp_size[1], + sizeof(*cp->interp_tmp_sync)); + if (!cp->interp_tmp_sync) + return -ENOMEM; + + for (int comp_fine = 0; comp_fine < cp->nb_components; comp_fine++) { + Rect dst_fine = { + .start = { cp->extents[4 * comp_fine + 0], cp->extents[4 * comp_fine + 1]}, + .size = { cp->extents[4 * comp_fine + 2], cp->extents[4 * comp_fine + 3]}, + }; + Rect dst_coarse = { + .start = { dst_fine.start[0] / 2 - ghosts, dst_fine.start[1] / 2 - ghosts }, + .size = { dst_fine.size[0] / 2 + ghosts * 2, dst_fine.size[1] / 2 + ghosts * 2 }, + }; + + for (int comp_coarse = 0; comp_coarse < cp_coarse->nb_components; comp_coarse++) { + Rect overlap; + Rect src = { + .start = { cp_coarse->extents[4 * comp_coarse + 0], cp_coarse->extents[4 * comp_coarse + 1]}, + .size = { cp_coarse->extents[4 * comp_coarse + 2], cp_coarse->extents[4 * comp_coarse + 3]}, + }; + for (int dim = 0; dim < 2; dim++) { + if (src.start[dim] == 0) { + src.start[dim] -= ghosts; + src.size[dim] += ghosts; + } + } + + rect_intersect(&overlap, &dst_coarse, &src); + if (comp_fine == local_proc) { + if (overlap.size[0] > 0 && overlap.size[1] > 0) { + cp->interp_tmp_recvcounts[comp_coarse] = 1; + cp->interp_tmp_recvdispl[comp_coarse] = ((overlap.start[1] - dst_coarse.start[1]) * cp->interp_tmp_stride + + (overlap.start[0] - dst_coarse.start[0])) * sizeof(double); + MPI_Type_vector(overlap.size[1], overlap.size[0], cp->interp_tmp_stride, + MPI_DOUBLE, &cp->interp_tmp_recvtypes[comp_coarse]); + MPI_Type_commit(&cp->interp_tmp_recvtypes[comp_coarse]); + } else { + cp->interp_tmp_recvcounts[comp_coarse] = 0; + cp->interp_tmp_recvdispl[comp_coarse] = 0; + MPI_Type_dup(MPI_BYTE, &cp->interp_tmp_recvtypes[comp_coarse]); + } + } + if (comp_coarse == local_proc) { + if (overlap.size[0] > 0 && overlap.size[1] > 0) { + ptrdiff_t src_origin[2] = { cp_coarse->extents[4 * comp_coarse + 0] - cp_coarse->offset_left[0], + cp_coarse->extents[4 * comp_coarse + 1] - cp_coarse->offset_left[1] }; + cp->interp_tmp_sendcounts[comp_fine] = 1; + cp->interp_tmp_senddispl[comp_fine] = ((overlap.start[1] - src_origin[1]) * cp_coarse->grid_size[0] + + (overlap.start[0] - src_origin[0])) * sizeof(double); + MPI_Type_vector(overlap.size[1], overlap.size[0], cp_coarse->grid_size[0], + MPI_DOUBLE, &cp->interp_tmp_sendtypes[comp_fine]); + MPI_Type_commit(&cp->interp_tmp_sendtypes[comp_fine]); + } else { + cp->interp_tmp_sendcounts[comp_fine] = 0; + cp->interp_tmp_senddispl[comp_fine] = 0; + MPI_Type_dup(MPI_BYTE, &cp->interp_tmp_sendtypes[comp_fine]); + } + } + } + } + } + + MPI_Alltoallw(src, cp->interp_tmp_sendcounts, cp->interp_tmp_senddispl, cp->interp_tmp_sendtypes, + cp->interp_tmp_sync, cp->interp_tmp_recvcounts, cp->interp_tmp_recvdispl, cp->interp_tmp_recvtypes, + MPI_COMM_WORLD); + ret = mg2d_interp(cp->solver, dst + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), + cp->grid_size[0], (ptrdiff_t [2]){ cp->extents[4 * local_proc + 0], cp->extents[4 * local_proc + 1] }, + (size_t [2]){cp->extents[4 * local_proc + 2], cp->extents[4 * local_proc + 3] }, cp->solver->step, + cp->interp_tmp_sync, cp->interp_tmp_stride, cp->interp_tmp_start, cp->interp_tmp_size, cp_coarse->solver->step); + } else { + ret = mg2d_interp(cp->solver, dst + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), + cp->grid_size[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cp->grid_size[0] - cp->offset_left[0], cp->grid_size[2] - cp->offset_left[1] }, + cp->solver->step, + src, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, + (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); + } + + return ret; +} + void qms_mg_solve(CCTK_ARGUMENTS) { QMSMGContext *ms = qms_context; @@ -1506,15 +1643,18 @@ skip_solve: if (reflevel > 0 && reflevel <= ms->solve_level) { CoordPatch *cp_coarse = get_coord_patch(ms, reflevel - 1); - double *W_pred0_coarse = VarDataPtrI(ms->gh, 0, reflevel - 1, 0, 0, CCTK_VarIndex("QuasiMaximalSlicingMG::W_pred0")); - double *W_pred1_coarse = VarDataPtrI(ms->gh, 0, reflevel - 1, 0, 0, CCTK_VarIndex("QuasiMaximalSlicingMG::W_pred1")); - const int integrator_substeps = MoLNumIntegratorSubsteps(); - const int interior_size = cctk_lsh[0] - cp->offset_left[0] - cctk_nghostzones[0] * integrator_substeps * 2; - const double bnd_loc = x[CCTK_GFINDEX3D(ms->gh, cp->offset_left[0] + interior_size, 0, 0)]; + const int local_proc = CCTK_MyProc(ms->gh); + + const ptrdiff_t bnd_idx = cp->level_size[0] - cctk_nghostzones[0] * integrator_substeps * 2 - 1; + const double bnd_loc = bnd_idx * cp->solver->step[0]; const double transition_start = bnd_loc * 0.7; + const ptrdiff_t smooth_end[2] = { MIN(cp->extents[local_proc * 4 + 0] + cp->extents[local_proc * 4 + 2], bnd_idx), + MIN(cp->extents[local_proc * 4 + 1] + cp->extents[local_proc * 4 + 3], bnd_idx) }; + const ptrdiff_t smooth_size[2] = { smooth_end[0] - cp->extents[local_proc * 4 + 0], smooth_end[1] - cp->extents[local_proc * 4 + 1] }; + double tp0 = W_pred0_time[reflevel - 1]; double tp1 = W_pred1_time[reflevel - 1]; @@ -1523,24 +1663,29 @@ skip_solve: qms_assert(pred_time >= tp0 && pred_time <= tp1); - mg2d_interp(cp->solver, cp->interp_tmp0 + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), - cctk_lsh[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cctk_lsh[0] - cp->offset_left[0], cctk_lsh[2] - cp->offset_left[1] }, - cp->solver->step, - W_pred0_coarse, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, - (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); - mg2d_interp(cp->solver, cp->interp_tmp1 + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), - cctk_lsh[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cctk_lsh[0] - cp->offset_left[0], cctk_lsh[2] - cp->offset_left[1] }, - cp->solver->step, - W_pred1_coarse, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, - (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); + //mg2d_interp(cp->solver, cp->interp_tmp0 + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), + // cctk_lsh[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cctk_lsh[0] - cp->offset_left[0], cctk_lsh[2] - cp->offset_left[1] }, + // cp->solver->step, + // W_pred0_coarse, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, + // (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); + //mg2d_interp(cp->solver, cp->interp_tmp1 + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), + // cctk_lsh[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cctk_lsh[0] - cp->offset_left[0], cctk_lsh[2] - cp->offset_left[1] }, + // cp->solver->step, + // W_pred1_coarse, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, + // (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); + + interp_coarse(ms, cp, cp->interp_tmp0, cp_coarse, + VarDataPtrI(ms->gh, 0, reflevel - 1, local_proc, 0, CCTK_VarIndex("QuasiMaximalSlicingMG::W_pred0"))); + interp_coarse(ms, cp, cp->interp_tmp1, cp_coarse, + VarDataPtrI(ms->gh, 0, reflevel - 1, local_proc, 0, CCTK_VarIndex("QuasiMaximalSlicingMG::W_pred1"))); #pragma omp parallel for for (int i = 0; i < grid_size; i++) W_pred1[i] = factp0 * cp->interp_tmp0[i] + factp1 * cp->interp_tmp1[i]; #pragma omp parallel for - for (int idx_z = 0; idx_z < interior_size; idx_z++) - for (int idx_x = 0; idx_x < interior_size; idx_x++) { + for (int idx_z = 0; idx_z < smooth_size[1]; idx_z++) + for (int idx_x = 0; idx_x < smooth_size[0]; idx_x++) { const ptrdiff_t idx = CCTK_GFINDEX3D(ms->gh, cp->offset_left[0] + idx_x, 0, cp->offset_left[1] + idx_z); -- cgit v1.2.3