/* * Multigrid solver for a 2nd order 2D linear PDE. * Copyright 2018 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <errno.h> #include <math.h> #include <mpi.h> #include <stdarg.h> #include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <threadpool.h> #include "common.h" #include "components.h" #include "cpu.h" #include "ell_grid_solve.h" #include "log.h" #include "mg2d.h" #include "mg2d_boundary.h" #include "mg2d_boundary_internal.h" #include "mg2d_constants.h" #include "timer.h" #include "transfer.h" typedef struct MG2DLevel { unsigned int depth; /* comm used for restriction to/prolongation from child; * its size is equal to dg->nb_components and is guaranteed to * be a superset of the child's comm */ MPI_Comm mpi_comm; DomainGeometry *dg; EGSContext *solver; int egs_init_flags; GridTransferContext *transfer_restrict; GridTransferContext *transfer_prolong; NDArray *prolong_tmp_base; NDArray *prolong_tmp; Rect restrict_dst_extents; Rect prolong_src_extents; NDArray *restrict_dst; int *restrict_sendcounts; int *restrict_senddispl; MPI_Datatype *restrict_sendtypes; int *restrict_recvcounts; int *restrict_recvdispl; MPI_Datatype *restrict_recvtypes; NDArray *prolong_dst; int *prolong_sendcounts; int *prolong_senddispl; MPI_Datatype *prolong_sendtypes; int *prolong_recvcounts; int *prolong_recvdispl; MPI_Datatype *prolong_recvtypes; struct MG2DLevel *child; MG2DLogger logger; char log_prefix[32]; MG2DInternal *priv; /* timings */ int64_t count_cycles; Timer timer_solve; Timer timer_prolong; Timer timer_restrict; Timer timer_correct; Timer timer_reinit; Timer timer_mpi_sync; } MG2DLevel; typedef struct DomainHierarchy { int nb_levels; DomainGeometry **levels; double (*stepsizes)[2]; } DomainHierarchy; struct MG2DInternal { MPI_Comm mpi_comm; DomainHierarchy dh; DomainGeometry *dg; unsigned int local_component; MG2DLogger logger; TPContext *tp; MG2DLevel *root; NDArray *u; NDArray *rhs; NDArray *diff_coeffs_base[MG2D_DIFF_COEFF_NB]; NDArray *diff_coeffs_tmp[MG2D_DIFF_COEFF_NB]; /** * This is the full boundary val spanning all the components.
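 * Each array spans the entire domain edge plus FD_STENCIL_MAX ghost points on either side (cf. solver_alloc()).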
* A pointer inside it is exported as MG2DDCBoundarySpec.val */ NDArray *dc_bnd_val[MG2D_DIFF_COEFF_NB][4]; GridTransferContext *transfer_init; int cpuflags; Timer timer_solve; Timer timer_levels_init; }; static void log_callback(MG2DLogger *log, int level, const char *fmt, va_list vl) { MG2DContext *ctx = log->opaque; ctx->log_callback(ctx, level, fmt, vl); } static void log_callback_level(MG2DLogger *log, int level, const char *fmt, va_list vl) { MG2DLevel *l = log->opaque; mg2di_log(&l->priv->logger, level, l->log_prefix); mg2di_vlog(&l->priv->logger, level, fmt, vl); } static void log_egs_step(MG2DLevel *level, const char *step_desc, double res_old, double res_new) { mg2di_log(&level->logger, MG2D_LOG_DEBUG, "%s %g->%g (%g)\n", step_desc, res_old, res_new, res_old / res_new); } static int coarse_correct_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { MG2DLevel *level = arg; for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) { const ptrdiff_t idx_dst = job_idx * level->solver->u->stride[0] + idx0; const ptrdiff_t idx_src = job_idx * level->prolong_tmp->stride[0] + idx0; level->solver->u->data[idx_dst] -= level->prolong_tmp->data[idx_src]; } return 0; } static int mg_solve(MG2DContext *ctx, MG2DLevel *level, enum EGSType solve_type, const char *step_desc, int export_res) { double res_old; int ret; res_old = level->solver->residual_max; mg2di_timer_start(&level->timer_solve); ret = mg2di_egs_solve(level->solver, solve_type, export_res); mg2di_timer_stop(&level->timer_solve); if (ret < 0) return ret; log_egs_step(level, step_desc, res_old, level->solver->residual_max); return 0; } static int mg_restrict_rhs(MG2DContext *ctx, MG2DLevel *l) { int multi_component = l->dg->nb_components > 1; NDArray *a_dst = multi_component ? l->restrict_dst : l->child->solver->rhs; int ret = 0; mg2di_timer_start(&l->timer_restrict); ret = mg2di_gt_transfer(l->transfer_restrict, a_dst, l->solver->residual); mg2di_timer_stop(&l->timer_restrict); if (ret < 0) goto finish; if (multi_component) { mg2di_timer_start(&l->timer_mpi_sync); MPI_Alltoallw(a_dst->data, l->restrict_sendcounts, l->restrict_senddispl, l->restrict_sendtypes, l->child ? l->child->solver->rhs->data : NULL, l->restrict_recvcounts, l->restrict_recvdispl, l->restrict_recvtypes, l->mpi_comm); mg2di_timer_stop(&l->timer_mpi_sync); } finish: return ret; } static int mg_prolong_u(MG2DContext *ctx, MG2DLevel *l) { int multi_component = l->dg->nb_components > 1; NDArray *a_src = multi_component ? l->prolong_dst : l->child->solver->u_exterior; int ret = 0; if (multi_component) { mg2di_timer_start(&l->timer_mpi_sync); MPI_Alltoallw(l->child ? l->child->solver->u_exterior->data : NULL, l->prolong_sendcounts, l->prolong_senddispl, l->prolong_sendtypes, l->prolong_dst->data, l->prolong_recvcounts, l->prolong_recvdispl, l->prolong_recvtypes, l->mpi_comm); mg2di_timer_stop(&l->timer_mpi_sync); } mg2di_timer_start(&l->timer_prolong); ret = mg2di_gt_transfer(l->transfer_prolong, l->prolong_tmp, a_src); mg2di_timer_stop(&l->timer_prolong); if (ret < 0) return ret; return 0; } static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level, int allow_exact) { enum EGSType solve_type = (allow_exact && level->dg->nb_components == 1 && level->solver->domain_size[0] <= ctx->max_exact_size) ? 
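/* single-component levels that are small enough may be solved exactly (direct solve) when allowed, everything else goes through relaxation */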
EGS_SOLVE_EXACT : EGS_SOLVE_RELAXATION; double res_old, res_new; int ret = 0; /* on the refined levels, use zero as the initial guess for the * solution (correction for the upper level) */ if (level->depth > 0) { mg2di_timer_start(&level->timer_reinit); memset(level->solver->u->data, 0, sizeof(*level->solver->u->data) * level->solver->u->stride[0] * level->solver->domain_size[1]); /* re-init the current-level solver (re-calc the residual) */ ret = mg2di_egs_init(level->solver, level->egs_init_flags); if (ret < 0) return ret; mg2di_timer_stop(&level->timer_reinit); level->egs_init_flags |= EGS_INIT_FLAG_SAME_DIFF_COEFFS; } res_old = level->solver->residual_max; /* handle exact solve */ if (solve_type == EGS_SOLVE_EXACT) { ret = mg_solve(ctx, level, solve_type, "coarse-step", 0); if (ret < 0) return ret; level->count_cycles++; goto finish; } for (int i = 0; i < ctx->nb_cycles; i++) { double res_prev; /* pre-restrict relaxation */ for (int j = 0; j < ctx->nb_relax_pre; j++) { ret = mg_solve(ctx, level, solve_type, "pre-relax", j == ctx->nb_relax_pre - 1 && level->depth < ctx->priv->dh.nb_levels - 1); if (ret < 0) return ret; } if (level->depth < ctx->priv->dh.nb_levels - 1) { /* restrict the residual as to the coarser-level rhs */ ret = mg_restrict_rhs(ctx, level); if (ret < 0) return ret; /* solve on the coarser level */ if (level->child) { ret = mg_solve_subgrid(ctx, level->child, 1); if (ret < 0) return ret; } /* prolongate the coarser-level correction */ ret = mg_prolong_u(ctx, level); if (ret < 0) return ret; /* apply the correction */ mg2di_timer_start(&level->timer_correct); tp_execute(ctx->priv->tp, level->solver->domain_size[1], coarse_correct_task, level); mg2di_timer_stop(&level->timer_correct); /* re-init the current-level solver (re-calc the residual) */ res_prev = level->solver->residual_max; mg2di_timer_start(&level->timer_reinit); ret = mg2di_egs_init(level->solver, level->egs_init_flags); mg2di_timer_stop(&level->timer_reinit); if (ret < 0) return ret; level->egs_init_flags |= EGS_INIT_FLAG_SAME_DIFF_COEFFS; log_egs_step(level, "correct", res_prev, level->solver->residual_max); } /* post-correct relaxation */ for (int j = 0; j < ctx->nb_relax_post; j++) { ret = mg_solve(ctx, level, solve_type, "post-relax", 0); if (ret < 0) return ret; } level->count_cycles++; } finish: res_new = level->solver->residual_max; if (!isfinite(res_new) || (res_new > 1e2 * ctx->tol && res_old / res_new <= 1e-1)) { mg2di_log(&level->logger, MG2D_LOG_ERROR, "The relaxation step has diverged: %g -> %g\n", res_old, res_new); return MG2D_ERR_DIVERGE; } return 0; } static void bnd_zero(MG2DBoundary *bdst, size_t nb_rows, size_t domain_size) { for (size_t i = 0; i < nb_rows; i++) { memset(bdst->val + i * bdst->val_stride - i, 0, sizeof(*bdst->val) * (domain_size + 2 * i)); } } static void bnd_copy(MG2DBoundary *bdst, const double *src, ptrdiff_t src_stride, size_t nb_rows, size_t domain_size) { for (size_t i = 0; i < nb_rows; i++) { memcpy(bdst->val + i * bdst->val_stride - i, src + i * src_stride - i, sizeof(*bdst->val) * (domain_size + 2 * i)); } } static void mirror(double *dst, const ptrdiff_t dst_stride[2], const size_t size[2], double parity) { if (dst_stride[1] == 1 && parity == 1.0) { for (int j = 1; j <= size[0]; j++) memcpy(dst + j * dst_stride[0], dst - j * dst_stride[0], sizeof(*dst) * size[1]); } else { for (size_t i = 0; i < size[1]; i++) { for (int j = 1; j <= size[0]; j++) dst[dst_stride[1] * i + dst_stride[0] * j] = parity * dst[dst_stride[1] * i - dst_stride[0] * j]; } } } static void 
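/* dc_boundaries_apply() fills the FD_STENCIL_MAX ghost layers along each outer boundary of the component: non-reflecting boundaries are filled by linear extrapolation of the interior values, reflecting boundaries by mirroring with the given parity. */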
dc_boundaries_apply(NDArray *a, const int *reflect, const double *parity, const DomainComponent *dc) { for (int order = 0; order < 2; order++) { for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++) { const int ci = mg2d_bnd_coord_idx(bnd_idx); double *dst = a->data + mg2d_bnd_is_upper(bnd_idx) * ((a->shape[!ci] - 1) * a->stride[!ci]); const ptrdiff_t dst_strides[2] = { mg2d_bnd_out_dir(bnd_idx) * a->stride[!ci], a->stride[ci] }; if (!dc->bnd_is_outer[bnd_idx]) continue; if (order == 0 && !reflect[bnd_idx]) { for (ptrdiff_t bnd_layer = 1; bnd_layer <= FD_STENCIL_MAX; bnd_layer++) for (ptrdiff_t row_idx = -(ptrdiff_t)FD_STENCIL_MAX; row_idx < (ptrdiff_t)a->shape[ci] + FD_STENCIL_MAX; row_idx++) { dst[bnd_layer * dst_strides[0] + row_idx * dst_strides[1]] = 2.0 * dst[(bnd_layer - 1) * dst_strides[0] + row_idx * dst_strides[1]] - dst[(bnd_layer - 2) * dst_strides[0] + row_idx * dst_strides[1]]; } } else if (order == 1 && reflect[bnd_idx]) { const size_t size[2] = { FD_STENCIL_MAX, a->shape[ci] + 2 * FD_STENCIL_MAX }; mirror(dst - FD_STENCIL_MAX * dst_strides[1], dst_strides, size, parity[bnd_idx]); } } } } static int restrict_dc_sync(MG2DContext *ctx, MG2DLevel *l) { MG2DInternal *priv = ctx->priv; Rect *overlaps_recv = NULL, *overlaps_send = NULL; MPI_Datatype *sendtypes = NULL, *recvtypes = NULL; int *senddispl = NULL, *recvdispl = NULL, *sendcounts = NULL, *recvcounts = NULL; const ptrdiff_t *lo = l->dg->components[priv->local_component].interior.start; NDArray **diff_coeffs = l->depth > 0 ? l->solver->diff_coeffs : priv->diff_coeffs_tmp; const ptrdiff_t stride = diff_coeffs[0]->stride[0]; int ret = 0; overlaps_recv = calloc(l->dg->nb_components, sizeof(*overlaps_recv)); overlaps_send = calloc(l->dg->nb_components, sizeof(*overlaps_send)); sendtypes = calloc(l->dg->nb_components, sizeof(*sendtypes)); senddispl = calloc(l->dg->nb_components, sizeof(*senddispl)); sendcounts = calloc(l->dg->nb_components, sizeof(*sendcounts)); recvtypes = calloc(l->dg->nb_components, sizeof(*recvtypes)); recvdispl = calloc(l->dg->nb_components, sizeof(*recvdispl)); recvcounts = calloc(l->dg->nb_components, sizeof(*recvcounts)); if (!overlaps_recv || !overlaps_send || !sendtypes || !recvtypes || !senddispl || !recvdispl || !sendcounts || !recvcounts) { ret = -ENOMEM; goto fail; } ret = mg2di_dg_edge_overlaps(overlaps_recv, overlaps_send, l->dg, priv->local_component, FD_STENCIL_MAX); if (ret < 0) goto fail; /* construct the send/receive parameters */ for (unsigned int i = 0; i < l->dg->nb_components; i++) { if (i == priv->local_component) { MPI_Type_dup(MPI_INT, &sendtypes[i]); MPI_Type_dup(MPI_INT, &recvtypes[i]); sendcounts[i] = 0; recvcounts[i] = 0; senddispl[i] = 0; recvdispl[i] = 0; continue; } /* receive */ MPI_Type_vector(overlaps_recv[i].size[1], overlaps_recv[i].size[0], stride, MPI_DOUBLE, &recvtypes[i]); MPI_Type_commit(&recvtypes[i]); recvcounts[i] = 1; recvdispl[i] = ((overlaps_recv[i].start[1] - lo[1]) * stride + (overlaps_recv[i].start[0] - lo[0])) * sizeof(double); /* send */ MPI_Type_vector(overlaps_send[i].size[1], overlaps_send[i].size[0], stride, MPI_DOUBLE, &sendtypes[i]); MPI_Type_commit(&sendtypes[i]); sendcounts[i] = 1; senddispl[i] = ((overlaps_send[i].start[1] - lo[1]) * stride + (overlaps_send[i].start[0] - lo[0])) * sizeof(double); } for (int i = 0; i < MG2D_DIFF_COEFF_NB; i++) { MPI_Alltoallw(diff_coeffs[i]->data, sendcounts, senddispl, sendtypes, diff_coeffs[i]->data, recvcounts, recvdispl, recvtypes, l->mpi_comm); } fail: if (sendtypes) { for (unsigned int i = 0; i < 
l->dg->nb_components; i++) { if (sendtypes[i]) MPI_Type_free(&sendtypes[i]); } } if (recvtypes) { for (unsigned int i = 0; i < l->dg->nb_components; i++) { if (recvtypes[i]) MPI_Type_free(&recvtypes[i]); } } free(sendtypes); free(senddispl); free(sendcounts); free(recvtypes); free(recvdispl); free(recvcounts); free(overlaps_recv); free(overlaps_send); return ret; } static int restrict_diff_coeffs(MG2DContext *ctx, MG2DLevel *l) { MG2DInternal *priv = ctx->priv; const DomainComponent *dc_src = &l->dg->components[priv->local_component]; int multi_component = l->dg->nb_components > 1; int ret = 0; int reflect[MG2D_DIFF_COEFF_NB][4]; double parity[MG2D_DIFF_COEFF_NB][4]; /* prepare the source arrays */ for (int dc_idx = 0; dc_idx < ARRAY_ELEMS(ctx->diff_coeffs); dc_idx++) { MG2DDiffCoeffs *dc = ctx->diff_coeffs[dc_idx]; NDArray *a_dc = l->depth > 0 ? l->solver->diff_coeffs[dc_idx] : priv->diff_coeffs_tmp[dc_idx]; if (!l->depth) ndarray_copy(a_dc, priv->root->solver->diff_coeffs[dc_idx]); for (int bnd_idx = 0; bnd_idx < ARRAY_ELEMS(dc->boundaries); bnd_idx++) { const int ci = mg2d_bnd_coord_idx(bnd_idx); double *dst = a_dc->data + mg2d_bnd_is_upper(bnd_idx) * ((a_dc->shape[!ci] - 1) * a_dc->stride[!ci]); reflect[dc_idx][bnd_idx] = ctx->boundaries[bnd_idx]->type == MG2D_BC_TYPE_REFLECT; parity [dc_idx][bnd_idx] = 1.0; if (dc_idx == MG2D_DIFF_COEFF_11 || (ci == 0 && dc_idx == MG2D_DIFF_COEFF_10) || (ci == 1 && dc_idx == MG2D_DIFF_COEFF_01)) parity[dc_idx][bnd_idx] *= -1.0; if (dc->boundaries[bnd_idx].flags & MG2D_DC_FLAG_POLE) parity[dc_idx][bnd_idx] *= -1.0; /* gather the boundary values for interpolation onto coarser levels */ if (!l->depth && multi_component && dc->boundaries[bnd_idx].flags & MG2D_DC_FLAG_DISCONT) { int *recvcounts = NULL, *displs = NULL; recvcounts = calloc(l->dg->nb_components, sizeof(*recvcounts)); displs = calloc(l->dg->nb_components, sizeof(*displs)); if (!recvcounts || !displs) { free(recvcounts); free(displs); return -ENOMEM; } for (int comp = 0; comp < l->dg->nb_components; comp++) { if (l->dg->components[comp].bnd_is_outer[bnd_idx]) { recvcounts[comp] = l->dg->components[comp].interior.size[!ci]; displs[comp] = l->dg->components[comp].interior.start[!ci]; } else { recvcounts[comp] = 0; displs[comp] = 0; } } MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, priv->dc_bnd_val[dc_idx][bnd_idx]->data + FD_STENCIL_MAX, recvcounts, displs, MPI_DOUBLE, priv->mpi_comm); free(recvcounts); free(displs); } if (l->depth > 0) continue; // Make the boundaries with poles continuous by multiplying by x // FIXME only implemented for 0L boundary if (bnd_idx == MG2D_BOUNDARY_0L && dc->boundaries[bnd_idx].flags & MG2D_DC_FLAG_POLE) { const ptrdiff_t x_offset = l->dg->components[priv->local_component].interior.start[0]; for (size_t idx0 = 0; idx0 < a_dc->shape[0]; idx0++) { if (!x_offset) a_dc->data[idx0 * a_dc->stride[0]] = dc->boundaries[MG2D_BOUNDARY_0L].val[idx0]; for (size_t idx1 = !x_offset; idx1 < a_dc->shape[1]; idx1++) { const double x = ctx->step[0] * (x_offset + idx1); a_dc->data[idx0 * a_dc->stride[0] + idx1 * a_dc->stride[1]] *= x; } } } // Make the boundaries with discontinuities continuous by // subtracting the discontinuous jump if (dc->boundaries[bnd_idx].flags & MG2D_DC_FLAG_DISCONT && l->dg->components[priv->local_component].bnd_is_outer[bnd_idx]) { const double *src = dc->boundaries[bnd_idx].val; for (int j = 0; j < a_dc->shape[ci]; j++) dst[j * a_dc->stride[ci]] -= src[j]; } } /* fill the boundary ghost zones for interpolation */ dc_boundaries_apply(a_dc,
reflect[dc_idx], parity[dc_idx], dc_src); } /* get the ghost points for interpolation */ if (multi_component) { ret = restrict_dc_sync(ctx, l); if (ret < 0) return ret; } /* execute the interpolations on the continuous data */ for (int i = 0; i < MG2D_DIFF_COEFF_NB; i++) { ret = mg2di_gt_transfer(l->transfer_restrict, multi_component ? l->restrict_dst : l->child->solver->diff_coeffs[i], l->depth > 0 ? l->solver->diff_coeffs[i] : priv->diff_coeffs_tmp[i]); if (ret < 0) return ret; if (multi_component) { mg2di_timer_start(&l->timer_mpi_sync); MPI_Alltoallw(l->restrict_dst->data, l->restrict_sendcounts, l->restrict_senddispl, l->restrict_sendtypes, l->child ? l->child->solver->diff_coeffs[i]->data : NULL, l->restrict_recvcounts, l->restrict_recvdispl, l->restrict_recvtypes, l->mpi_comm); mg2di_timer_stop(&l->timer_mpi_sync); } } return ret; } /* bring the coeffs back into their final form */ static int diff_coeffs_fixup(MG2DContext *ctx, MG2DLevel *l) { MG2DInternal *priv = ctx->priv; int ret; for (int i = 0; i < ARRAY_ELEMS(l->solver->diff_coeffs); i++) { MG2DDiffCoeffs *dc = ctx->diff_coeffs[i]; NDArray *a_dst = l->solver->diff_coeffs[i]; for (int bnd_idx = 0; bnd_idx < ARRAY_ELEMS(dc->boundaries); bnd_idx++) { const int ci = mg2d_bnd_coord_idx(bnd_idx); if (!l->dg->components[priv->local_component].bnd_is_outer[bnd_idx]) continue; if (dc->boundaries[bnd_idx].flags & MG2D_DC_FLAG_DISCONT) { GridTransferContext *gt_bnd = NULL; NDArray *dc_src = NULL, *dc_dst = NULL; double *dst = a_dst->data + mg2d_bnd_is_upper(bnd_idx) * ((a_dst->shape[!ci] - 1) * a_dst->stride[!ci]); const ptrdiff_t stride_dst = a_dst->stride[ci]; const size_t size_dst = l->solver->domain_size[!ci]; ret = ndarray_slice(&dc_src, priv->dc_bnd_val[i][bnd_idx], &NDASLICE(FD_STENCIL_MAX, -FD_STENCIL_MAX, 1)); if (ret < 0) goto fail; ret = ndarray_alloc(&dc_dst, 1, &size_dst, 0); if (ret < 0) goto fail; gt_bnd = mg2di_gt_alloc(1, GRID_TRANSFER_LAGRANGE_1); if (!gt_bnd) { ret = -ENOMEM; goto fail; } gt_bnd->tp = priv->tp; gt_bnd->cpuflags = priv->cpuflags; gt_bnd->src.start[0] = 0; gt_bnd->src.size[0] = priv->dg->domain_size[1]; gt_bnd->src.step[0] = ctx->step[1]; gt_bnd->dst.start[0] = l->dg->components[priv->local_component].interior.start[!ci]; gt_bnd->dst.size[0] = l->dg->components[priv->local_component].interior.size[!ci]; gt_bnd->dst.step[0] = l->solver->step[1]; ret = mg2di_gt_init(gt_bnd); if (ret < 0) goto fail; ret = mg2di_gt_transfer(gt_bnd, dc_dst, dc_src); if (ret < 0) goto fail; for (int idx = 0; idx < dc_dst->shape[0]; idx++) dst[stride_dst * idx] += dc_dst->data[idx]; fail: mg2di_gt_free(&gt_bnd); ndarray_free(&dc_src); ndarray_free(&dc_dst); if (ret < 0) return ret; } } if (dc->boundaries[MG2D_BOUNDARY_0L].flags & MG2D_DC_FLAG_POLE) { const ptrdiff_t x_offset = l->dg->components[priv->local_component].interior.start[0]; for (size_t idx0 = 0; idx0 < a_dst->shape[0]; idx0++) { if (!x_offset) a_dst->data[idx0 * a_dst->stride[0]] = 0.0; for (size_t idx1 = !x_offset; idx1 < a_dst->shape[1]; idx1++) { const double x = l->solver->step[0] * (x_offset + idx1); a_dst->data[idx0 * a_dst->stride[0] + idx1 * a_dst->stride[1]] /= x; } } } } return 0; } static double findmax(double *arr, size_t size[2], ptrdiff_t stride) { double ret = 0.0; for (size_t y = 0; y < size[1]; y++) for (size_t x = 0; x < size[0]; x++) { double val = fabs(arr[y * stride + x]); if (val > ret) ret = val; } return ret; } static int mg_setup_restrict(MG2DContext *ctx, MG2DLevel *l, enum GridTransferOperator op) { MG2DInternal *priv = ctx->priv; const
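/* Set up the grid transfer that restricts this level's residual onto the coarser level. Note the [!dim] index flip below: the transfer geometry (like NDArray) keeps the slow y direction at index 0, whereas Rect/DomainComponent keep x at index 0. */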
DomainComponent *dc_src = &l->dg->components[priv->local_component]; GridTransferContext *gt; int ret; gt = mg2di_gt_alloc(2, op); if (!gt) return -ENOMEM; l->transfer_restrict = gt; gt->tp = priv->tp; gt->cpuflags = priv->cpuflags; gt->extrapolate_distance = 1; for (int dim = 0; dim < 2; dim++) { gt->src.start[!dim] = dc_src->interior.start[dim]; gt->src.size[!dim] = dc_src->interior.size[dim]; gt->src.step[!dim] = priv->dh.stepsizes[l->depth][dim]; gt->dst.start[!dim] = l->restrict_dst_extents.start[dim]; gt->dst.size[!dim] = l->restrict_dst_extents.size[dim]; gt->dst.step[!dim] = priv->dh.stepsizes[l->depth + 1][dim]; } ret = mg2di_gt_init(gt); if (ret < 0) return ret; return 0; } static int mg_setup_prolong(MG2DContext *ctx, MG2DLevel *l, enum GridTransferOperator op) { MG2DInternal *priv = ctx->priv; const DomainComponent *dc_dst = &l->dg->components[priv->local_component]; GridTransferContext *gt; int ret; gt = mg2di_gt_alloc(2, op); if (!gt) return -ENOMEM; l->transfer_prolong = gt; gt->tp = priv->tp; gt->cpuflags = priv->cpuflags; for (int dim = 0; dim < 2; dim++) { gt->src.start[!dim] = l->prolong_src_extents.start[dim]; gt->src.size[!dim] = l->prolong_src_extents.size[dim]; gt->src.step[!dim] = priv->dh.stepsizes[l->depth + 1][dim]; gt->dst.start[!dim] = dc_dst->interior.start[dim]; gt->dst.size[!dim] = dc_dst->interior.size[dim]; gt->dst.step[!dim] = priv->dh.stepsizes[l->depth][dim]; } ret = mg2di_gt_init(gt); if (ret < 0) return ret; return 0; } static int mg_levels_init(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; MG2DLevel *cur, *prev; double diff0_max, diff2_max, tmp; int ret; cur = priv->root; prev = NULL; if (priv->u) { ret = ndarray_copy(cur->solver->u, priv->u); if (ret < 0) return ret; ndarray_free(&priv->u); ctx->u = cur->solver->u->data; ctx->u_stride = cur->solver->u->stride[0]; } if (priv->rhs) { ret = ndarray_copy(cur->solver->rhs, priv->rhs); if (ret < 0) return ret; ndarray_free(&priv->rhs); ctx->rhs = cur->solver->rhs->data; ctx->rhs_stride = cur->solver->rhs->stride[0]; } if (ctx->diff_coeffs[0]->data == priv->diff_coeffs_tmp[0]->data) { for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { ndarray_copy(cur->solver->diff_coeffs[i], priv->diff_coeffs_tmp[i]); ctx->diff_coeffs[i]->data = cur->solver->diff_coeffs[i]->data; ctx->diff_coeffs[i]->stride = cur->solver->diff_coeffs[i]->stride[0]; } } while (cur) { cur->solver->step[0] = priv->dh.stepsizes[cur->depth][0]; cur->solver->step[1] = priv->dh.stepsizes[cur->depth][1]; for (int bnd_idx = 0; bnd_idx < ARRAY_ELEMS(cur->solver->boundaries); bnd_idx++) { const int ci = mg2d_bnd_coord_idx(bnd_idx); MG2DBoundary *bsrc = ctx->boundaries[bnd_idx]; MG2DBoundary *bdst = cur->solver->boundaries[bnd_idx]; bdst->type = bsrc->type; switch (bsrc->type) { case MG2D_BC_TYPE_FIXVAL: if (!prev) { bnd_copy(bdst, bsrc->val, bsrc->val_stride, ctx->fd_stencil, cur->solver->domain_size[!ci]); break; } /* fall-through */ case MG2D_BC_TYPE_REFLECT: case MG2D_BC_TYPE_FALLOFF: bnd_zero(bdst, ctx->fd_stencil, cur->solver->domain_size[!ci]); break; default: return -ENOSYS; } } cur->solver->logger = cur->logger; cur->solver->cpuflags = priv->cpuflags; cur->solver->tp = priv->tp; cur->solver->fd_stencil = ctx->fd_stencil; prev = cur; cur = cur->child; } cur = priv->root; while (cur) { mg2di_gt_free(&cur->transfer_restrict); mg2di_gt_free(&cur->transfer_prolong); if (cur->depth < priv->dh.nb_levels - 1) { enum GridTransferOperator op_interp; enum GridTransferOperator op_restrict; switch (ctx->fd_stencil) { case 1: op_interp = 
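/* prolongation uses Lagrange interpolation matched to the FD stencil; restriction (chosen below) uses full weighting when the coarser grid is an exact 2:1 coarsening of this one, plain linear interpolation otherwise */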
GRID_TRANSFER_LAGRANGE_3; break; case 2: op_interp = GRID_TRANSFER_LAGRANGE_5; break; default: return -ENOSYS; } if (priv->dh.levels[cur->depth]->domain_size[0] == 2 * (priv->dh.levels[cur->depth + 1]->domain_size[0] - 1) + 1) { if (ctx->fd_stencil == 1) op_restrict = GRID_TRANSFER_FW_2; else op_restrict = GRID_TRANSFER_FW_1; } else { op_restrict = GRID_TRANSFER_LAGRANGE_1; } ret = mg_setup_restrict(ctx, cur, op_restrict); if (ret < 0) return ret; ret = mg_setup_prolong(ctx, cur, op_interp); if (ret < 0) return ret; ret = restrict_diff_coeffs(ctx, cur); if (ret < 0) return ret; } cur = cur->child; } diff0_max = findmax(priv->diff_coeffs_tmp[MG2D_DIFF_COEFF_00]->data, priv->root->solver->domain_size, priv->diff_coeffs_tmp[MG2D_DIFF_COEFF_00]->stride[0]); diff2_max = findmax(priv->diff_coeffs_tmp[MG2D_DIFF_COEFF_20]->data, priv->root->solver->domain_size, priv->diff_coeffs_tmp[MG2D_DIFF_COEFF_20]->stride[0]); tmp = findmax(priv->diff_coeffs_tmp[MG2D_DIFF_COEFF_02]->data, priv->root->solver->domain_size, priv->diff_coeffs_tmp[MG2D_DIFF_COEFF_02]->stride[0]); diff2_max = MAX(diff2_max, tmp); if (priv->dg->nb_components > 1) { MPI_Allreduce(MPI_IN_PLACE, &diff0_max, 1, MPI_DOUBLE, MPI_MAX, priv->mpi_comm); MPI_Allreduce(MPI_IN_PLACE, &diff2_max, 1, MPI_DOUBLE, MPI_MAX, priv->mpi_comm); } cur = priv->root; while (cur) { cur->solver->relax->relax_factor = ctx->cfl_factor; cur->solver->relax->relax_multiplier = 1.0 / (diff2_max + cur->solver->step[0] * cur->solver->step[1] * diff0_max / 8.0); if (cur->depth > 0) { ret = diff_coeffs_fixup(ctx, cur); if (ret < 0) return ret; } cur->egs_init_flags &= ~EGS_INIT_FLAG_SAME_DIFF_COEFFS; cur = cur->child; } return 0; } static void mg_dh_uninit(DomainHierarchy *dh) { for (int i = 0; i < dh->nb_levels; i++) mg2di_dg_free(&dh->levels[i]); free(dh->levels); free(dh->stepsizes); memset(dh, 0, sizeof(*dh)); } static int mg_level_partition(DomainGeometry **res, size_t domain_size, const DomainGeometry *src) { DomainGeometry *dg; if (domain_size <= (1 << 7) + 1) { dg = mg2di_dg_alloc(1); if (!dg) return -ENOMEM; dg->domain_size[0] = domain_size; dg->domain_size[1] = domain_size; dg->components[0].interior.start[0] = 0; dg->components[0].interior.start[1] = 0; dg->components[0].interior.size[0] = domain_size; dg->components[0].interior.size[1] = domain_size; dg->components[0].exterior.start[0] = -FD_STENCIL_MAX; dg->components[0].exterior.start[1] = -FD_STENCIL_MAX; dg->components[0].exterior.size[0] = domain_size + 2 * FD_STENCIL_MAX; dg->components[0].exterior.size[1] = domain_size + 2 * FD_STENCIL_MAX; for (int i = 0; i < 4; i++) dg->components[0].bnd_is_outer[i] = 1; } else { const double step_scale = (double)(domain_size - 1) / (src->domain_size[0] - 1); dg = mg2di_dg_alloc(src->nb_components); if (!dg) return -ENOMEM; dg->domain_size[0] = domain_size; dg->domain_size[1] = domain_size; // FIXME duplicated with restrict extents calculation for (int comp = 0; comp < src->nb_components; comp++) { const Rect *rect_src = &src->components[comp].interior; DomainComponent *dc_dst = &dg->components[comp]; for (int dim = 0; dim < 2; dim++) { ptrdiff_t start, end; start = ceil(rect_src->start[dim] * step_scale); end = ceil((rect_src->start[dim] + rect_src->size[dim]) * step_scale); end = MIN(end, domain_size); dc_dst->interior.start[dim] = start; dc_dst->interior.size[dim] = (end >= start) ? end - start : 0; dc_dst->exterior.start[dim] = start ? start : -FD_STENCIL_MAX; dc_dst->exterior.size[dim] = dc_dst->interior.size[dim] + (end < domain_size ? 
0 : FD_STENCIL_MAX) + (start ? 0 : FD_STENCIL_MAX); } dc_dst->bnd_is_outer[MG2D_BOUNDARY_0L] = dc_dst->interior.start[0] == 0; dc_dst->bnd_is_outer[MG2D_BOUNDARY_1L] = dc_dst->interior.start[1] == 0; dc_dst->bnd_is_outer[MG2D_BOUNDARY_0U] = (dc_dst->interior.start[0] + dc_dst->interior.size[0]) == domain_size; dc_dst->bnd_is_outer[MG2D_BOUNDARY_1U] = (dc_dst->interior.start[1] + dc_dst->interior.size[1]) == domain_size; } } *res = dg; return 0; } static int log2i(int n) { int ret = 0; while (n) { ret++; n >>= 1; } return ret - 1; } static int mg_dh_init(DomainHierarchy *dh, const DomainGeometry *root, const double *step_root, unsigned int max_levels) { DomainGeometry *next_dg; size_t next_size; int ret = 0; ret = mg2di_dg_copy(&next_dg, root); if (ret < 0) return ret; next_size = root->domain_size[0]; for (int depth = 0; depth < max_levels; depth++) { DomainGeometry **dg_tmp; double (*stepsizes_tmp)[2]; size_t cur_size = next_size; DomainGeometry *cur_dg = next_dg; next_dg = NULL; // choose the domain size for the next child // the children all have sizes 2**n + 1 // but on the top level we skip the closest lower one if it is too close if (depth == 0) { next_size = (1 << log2i(cur_size - 2)) + 1; if ((double)cur_size / next_size < 1.5) next_size = (next_size >> 1) + 1; } else next_size = (cur_size >> 1) + 1; ret = mg_level_partition(&next_dg, next_size, cur_dg); if (ret < 0) { mg2di_dg_free(&cur_dg); goto fail; } dg_tmp = realloc(dh->levels, (depth + 1) * sizeof(*dh->levels)); if (!dg_tmp) { mg2di_dg_free(&cur_dg); ret = -ENOMEM; goto fail; } dh->levels = dg_tmp; stepsizes_tmp = realloc(dh->stepsizes, (depth + 1) * sizeof(*dh->stepsizes)); if (!stepsizes_tmp) { mg2di_dg_free(&cur_dg); ret = -ENOMEM; goto fail; } dh->stepsizes = stepsizes_tmp; dh->levels[depth] = cur_dg; dh->stepsizes[depth][0] = step_root[0] * (root->domain_size[0] - 1) / (cur_dg->domain_size[0] - 1); dh->stepsizes[depth][1] = step_root[1] * (root->domain_size[1] - 1) / (cur_dg->domain_size[1] - 1); dh->nb_levels++; if (next_size <= 4) break; } fail: mg2di_dg_free(&next_dg); if (ret < 0) mg_dh_uninit(dh); return ret; } static int threadpool_init(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; int ret; ret = tp_init(&priv->tp, ctx->nb_threads); if (ret < 0) return ret; return 0; } static int mg_levels_alloc(MG2DContext *ctx); int mg2d_solve(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; MG2DLevel *root; EGSContext *s_root; double res_orig, res_prev, res_cur; int ret; if (!priv->root) { ret = mg_levels_alloc(ctx); if (ret < 0) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Error allocating the levels\n"); return ret; } } if (!priv->tp) { ret = threadpool_init(ctx); if (ret < 0) return ret; } root = priv->root; s_root = root->solver; mg2di_timer_start(&priv->timer_solve); mg2di_timer_start(&priv->timer_levels_init); ret = mg_levels_init(ctx); mg2di_timer_stop(&priv->timer_levels_init); if (ret < 0) goto finish; mg2di_timer_start(&root->timer_reinit); ret = mg2di_egs_init(s_root, 0); mg2di_timer_stop(&root->timer_reinit); if (ret < 0) goto finish; root->egs_init_flags |= EGS_INIT_FLAG_SAME_DIFF_COEFFS; res_orig = s_root->residual_max; res_prev = res_orig; for (int i = 0; i < ctx->maxiter; i++) { ret = mg_solve_subgrid(ctx, root, 1); if (ret < 0) goto fail; res_cur = s_root->residual_max; if (res_cur < ctx->tol) { mg2di_log(&priv->logger, MG2D_LOG_INFO, "converged on iteration %d, residual %g\n", i, res_cur); //ndarray_copy(priv->u, s_root->u); goto finish; } mg2di_log(&priv->logger, MG2D_LOG_VERBOSE, "finished toplevel 
iteration %d, residual %g -> %g (%g)\n", i, res_prev, res_cur, res_prev / res_cur); if (res_cur / res_prev > 1e1 || res_cur / res_orig > 1e3) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "A multigrid iteration diverged\n"); ret = MG2D_ERR_DIVERGE; goto fail; } res_prev = res_cur; } ret = MG2D_ERR_MAXITER_REACHED; mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Maximum number of iterations (%d) reached\n", ctx->maxiter); fail: mg2di_log(&priv->logger, MG2D_LOG_ERROR, "The solver failed to converge\n"); finish: ctx->residual_max = s_root->residual_max; mg2di_timer_stop(&priv->timer_solve); return ret; } static void mg_level_free(MG2DLevel **plevel) { MG2DLevel *level = *plevel; if (!level) return; if (level->restrict_sendtypes) { for (int i = 0; i < level->dg->nb_components; i++) { if (level->restrict_sendtypes[i]) MPI_Type_free(&level->restrict_sendtypes[i]); } } if (level->restrict_recvtypes) { for (int i = 0; i < level->dg->nb_components; i++) { if (level->restrict_recvtypes[i]) MPI_Type_free(&level->restrict_recvtypes[i]); } } free(level->restrict_sendtypes); free(level->restrict_recvtypes); free(level->restrict_sendcounts); free(level->restrict_senddispl); free(level->restrict_recvcounts); free(level->restrict_recvdispl); if (level->prolong_sendtypes) { for (int i = 0; i < level->dg->nb_components; i++) { if (level->prolong_sendtypes[i]) MPI_Type_free(&level->prolong_sendtypes[i]); } } if (level->prolong_recvtypes) { for (int i = 0; i < level->dg->nb_components; i++) { if (level->prolong_recvtypes[i]) MPI_Type_free(&level->prolong_recvtypes[i]); } } free(level->prolong_sendtypes); free(level->prolong_recvtypes); free(level->prolong_sendcounts); free(level->prolong_senddispl); free(level->prolong_recvcounts); free(level->prolong_recvdispl); ndarray_free(&level->restrict_dst); ndarray_free(&level->prolong_dst); ndarray_free(&level->prolong_tmp); ndarray_free(&level->prolong_tmp_base); mg2di_egs_free(&level->solver); mg2di_gt_free(&level->transfer_restrict); mg2di_gt_free(&level->transfer_prolong); mg2di_dg_free(&level->dg); free(level); *plevel = NULL; } static MG2DLevel *mg_level_alloc(const DomainGeometry *dg, unsigned int local_component, MPI_Comm comm) { const DomainComponent *dc = &dg->components[local_component]; MG2DLevel *level; int ret; level = calloc(1, sizeof(*level)); if (!level) return NULL; ret = mg2di_dg_copy(&level->dg, dg); if (ret < 0) goto fail; level->mpi_comm = comm; ret = ndarray_alloc(&level->prolong_tmp_base, 2, (size_t [2]){dc->interior.size[1] + 1, dc->interior.size[0] + 1}, 0); if (ret < 0) goto fail; ret = ndarray_slice(&level->prolong_tmp, level->prolong_tmp_base, (NDASlice [2]){ NDASLICE(0, -1, 1), NDASLICE(0, -1, 1) }); if (ret < 0) goto fail; if (dg->nb_components > 1) level->solver = mg2di_egs_alloc_mpi(level->mpi_comm, dg); else level->solver = mg2di_egs_alloc(dg->domain_size); if (!level->solver) goto fail; if (dg->nb_components > 1) { level->restrict_sendcounts = calloc(dg->nb_components, sizeof(*level->restrict_sendcounts)); level->restrict_senddispl = calloc(dg->nb_components, sizeof(*level->restrict_senddispl)); level->restrict_sendtypes = calloc(dg->nb_components, sizeof(*level->restrict_sendtypes)); level->restrict_recvcounts = calloc(dg->nb_components, sizeof(*level->restrict_recvcounts)); level->restrict_recvdispl = calloc(dg->nb_components, sizeof(*level->restrict_recvdispl)); level->restrict_recvtypes = calloc(dg->nb_components, sizeof(*level->restrict_recvtypes)); if (!level->restrict_sendcounts || !level->restrict_senddispl || 
!level->restrict_sendtypes || !level->restrict_recvcounts || !level->restrict_recvdispl || !level->restrict_recvtypes) goto fail; level->prolong_sendcounts = calloc(dg->nb_components, sizeof(*level->prolong_sendcounts)); level->prolong_senddispl = calloc(dg->nb_components, sizeof(*level->prolong_senddispl)); level->prolong_sendtypes = calloc(dg->nb_components, sizeof(*level->prolong_sendtypes)); level->prolong_recvcounts = calloc(dg->nb_components, sizeof(*level->prolong_recvcounts)); level->prolong_recvdispl = calloc(dg->nb_components, sizeof(*level->prolong_recvdispl)); level->prolong_recvtypes = calloc(dg->nb_components, sizeof(*level->prolong_recvtypes)); if (!level->prolong_sendcounts || !level->prolong_senddispl || !level->prolong_sendtypes || !level->prolong_recvcounts || !level->prolong_recvdispl || !level->prolong_recvtypes) goto fail; } mg2di_timer_init(&level->timer_solve); mg2di_timer_init(&level->timer_prolong); mg2di_timer_init(&level->timer_restrict); mg2di_timer_init(&level->timer_correct); mg2di_timer_init(&level->timer_reinit); mg2di_timer_init(&level->timer_mpi_sync); return level; fail: mg_level_free(&level); return NULL; } static int mg_interdomain_setup(MG2DContext *ctx, MG2DLevel *level) { MG2DInternal *priv = ctx->priv; const DomainGeometry *dg_fine = priv->dh.levels[level->depth]; const DomainGeometry *dg_coarse = priv->dh.levels[level->depth + 1]; const double step_scale[2] = { priv->dh.stepsizes[level->depth][0] / priv->dh.stepsizes[level->depth + 1][0], priv->dh.stepsizes[level->depth][1] / priv->dh.stepsizes[level->depth + 1][1] }; const unsigned int lc = priv->local_component; Rect *restrict_components = NULL, *prolong_components = NULL; int ret = 0; restrict_components = calloc(dg_fine->nb_components, sizeof(*restrict_components)); prolong_components = calloc(dg_fine->nb_components, sizeof(*prolong_components)); if (!restrict_components || !prolong_components) return -ENOMEM; /* calculate the extents restricted onto by each component */ for (int comp = 0; comp < dg_fine->nb_components; comp++) { const Rect *dc_src = &dg_fine->components[comp].interior; Rect *dst = restrict_components + comp; for (int dim = 0; dim < 2; dim++) { ptrdiff_t end; dst->start[dim] = ceil(dc_src->start[dim] * step_scale[dim]); end = ceil((dc_src->start[dim] + dc_src->size[dim]) * step_scale[dim]); end = MIN(end, dg_coarse->domain_size[dim]); dst->size[dim] = end >= dst->start[dim] ? end - dst->start[dim] : 0; } } level->restrict_dst_extents = restrict_components[lc]; /* calculate the extents prolongated from by each component */ for (int comp = 0; comp < dg_fine->nb_components; comp++) { const Rect *dc_src = &dg_fine->components[comp].interior; Rect *dst = prolong_components + comp; for (int dim = 0; dim < 2; dim++) { ptrdiff_t start, end; start = floor(dc_src->start[dim] * step_scale[dim]); end = ceil((dc_src->start[dim] + dc_src->size[dim]) * step_scale[dim]); start = MAX(start, 0) - FD_STENCIL_MAX; end = MIN(end, dg_coarse->domain_size[dim]) + FD_STENCIL_MAX; dst->start[dim] = start; dst->size[dim] = end >= start ? 
end - start : 0; } } level->prolong_src_extents = prolong_components[lc]; if (dg_fine->nb_components == 1) goto finish; ret = ndarray_alloc(&level->restrict_dst, 2, (size_t [2]){ restrict_components[lc].size[1], restrict_components[lc].size[0] }, 0); if (ret < 0) goto finish; ret = ndarray_alloc(&level->prolong_dst, 2, (size_t [2]){ prolong_components[lc].size[1], prolong_components[lc].size[0] }, 0); if (ret < 0) goto finish; /* setup the MPI transfer parameters for restrict */ for (int comp = 0; comp < dg_fine->nb_components; comp++) { NDArray *data; const Rect *loc; Rect overlap; /* send from local component to comp */ loc = &restrict_components[lc]; data = level->restrict_dst; memset(&overlap, 0, sizeof(overlap)); if (comp < dg_coarse->nb_components) { mg2di_rect_intersect(&overlap, loc, &dg_coarse->components[comp].interior); } if (overlap.size[0] && overlap.size[1]) { MPI_Type_vector(overlap.size[1], overlap.size[0], data->stride[0], MPI_DOUBLE, &level->restrict_sendtypes[comp]); MPI_Type_commit(&level->restrict_sendtypes[comp]); level->restrict_sendcounts[comp] = 1; level->restrict_senddispl[comp] = sizeof(*data->data) * ((overlap.start[1] - loc->start[1]) * data->stride[0] + (overlap.start[0] - loc->start[0]) * data->stride[1]); } else { level->restrict_sendcounts[comp] = 0; level->restrict_senddispl[comp] = 0; MPI_Type_dup(MPI_INT, &level->restrict_sendtypes[comp]); } /* receive on local component from comp */ memset(&overlap, 0, sizeof(overlap)); if (lc < dg_coarse->nb_components) { loc = &dg_coarse->components[lc].interior; data = level->child->solver->rhs; mg2di_rect_intersect(&overlap, loc, &restrict_components[comp]); } if (overlap.size[0] && overlap.size[1]) { MPI_Type_vector(overlap.size[1], overlap.size[0], data->stride[0], MPI_DOUBLE, &level->restrict_recvtypes[comp]); MPI_Type_commit(&level->restrict_recvtypes[comp]); level->restrict_recvcounts[comp] = 1; level->restrict_recvdispl[comp] = sizeof(*data->data) * ((overlap.start[1] - loc->start[1]) * data->stride[0] + (overlap.start[0] - loc->start[0]) * data->stride[1]); } else { level->restrict_recvcounts[comp] = 0; level->restrict_recvdispl[comp] = 0; MPI_Type_dup(MPI_INT, &level->restrict_recvtypes[comp]); } } /* setup the MPI transfer parameters for prolong */ for (int comp = 0; comp < dg_fine->nb_components; comp++) { NDArray *data; const Rect *loc; Rect overlap; /* send from local component to comp */ memset(&overlap, 0, sizeof(overlap)); if (lc < dg_coarse->nb_components) { loc = &dg_coarse->components[lc].exterior; data = level->child->solver->u_exterior; mg2di_rect_intersect(&overlap, loc, &prolong_components[comp]); } if (overlap.size[0] && overlap.size[1]) { MPI_Type_vector(overlap.size[1], overlap.size[0], data->stride[0], MPI_DOUBLE, &level->prolong_sendtypes[comp]); MPI_Type_commit(&level->prolong_sendtypes[comp]); level->prolong_sendcounts[comp] = 1; level->prolong_senddispl[comp] = sizeof(*data->data) * ((overlap.start[1] - loc->start[1]) * data->stride[0] + (overlap.start[0] - loc->start[0]) * data->stride[1]); } else { level->prolong_sendcounts[comp] = 0; level->prolong_senddispl[comp] = 0; MPI_Type_dup(MPI_INT, &level->prolong_sendtypes[comp]); } /* receive on local component from comp */ memset(&overlap, 0, sizeof(overlap)); loc = &prolong_components[lc]; data = level->prolong_dst; if (comp < dg_coarse->nb_components) { mg2di_rect_intersect(&overlap, loc, &dg_coarse->components[comp].exterior); } if (overlap.size[0] && overlap.size[1]) { MPI_Type_vector(overlap.size[1], overlap.size[0], data->stride[0], 
MPI_DOUBLE, &level->prolong_recvtypes[comp]); MPI_Type_commit(&level->prolong_recvtypes[comp]); level->prolong_recvcounts[comp] = 1; level->prolong_recvdispl[comp] = sizeof(*data->data) * ((overlap.start[1] - loc->start[1]) * data->stride[0] + (overlap.start[0] - loc->start[0]) * data->stride[1]); } else { level->prolong_recvcounts[comp] = 0; level->prolong_recvdispl[comp] = 0; MPI_Type_dup(MPI_INT, &level->prolong_recvtypes[comp]); } } finish: free(restrict_components); free(prolong_components); return ret; } static int mg_levels_alloc(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; MPI_Comm comm_parent; MG2DLevel **dst, *level; int ret = 0; if (ctx->adaptive_step) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Adaptive stepping not supported.\n"); return -EINVAL; } /* compute the levels geometries/partitioning */ ret = mg_dh_init(&priv->dh, priv->dg, ctx->step, ctx->max_levels); if (ret < 0) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Error partitioning the domain\n"); return ret; } /* allocate the solver for each level */ comm_parent = priv->mpi_comm; dst = &priv->root; for (int depth = 0; depth < priv->dh.nb_levels; depth++) { const DomainGeometry *dg = priv->dh.levels[depth]; MPI_Comm comm_cur = MPI_COMM_NULL; if (dg->nb_components > 1 && comm_parent != MPI_COMM_NULL) { MPI_Group grp_parent, grp_cur; int *ranks; ranks = calloc(dg->nb_components, sizeof(*ranks)); if (!ranks) return -ENOMEM; for (int i = 0; i < dg->nb_components; i++) ranks[i] = i; MPI_Comm_group(comm_parent, &grp_parent); MPI_Group_incl(grp_parent, dg->nb_components, ranks, &grp_cur); MPI_Comm_create(comm_parent, grp_cur, &comm_cur); MPI_Group_free(&grp_parent); MPI_Group_free(&grp_cur); free(ranks); } if (priv->local_component >= dg->nb_components) break; level = mg_level_alloc(dg, priv->local_component, comm_cur); if (!level) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Error allocating level %d", depth); return -ENOMEM; } level->depth = depth; level->priv = priv; level->logger.opaque = level; level->logger.log = log_callback_level; for (int i = 0; i < MIN(level->depth, sizeof(level->log_prefix) - 1); i++) level->log_prefix[i] = ' '; snprintf(level->log_prefix + level->depth, MAX(0, sizeof(level->log_prefix) - level->depth), "[%d]", depth); *dst = level; dst = &level->child; comm_parent = comm_cur; } /* setup inter-grid MPI sync */ for (MG2DLevel *cur = priv->root; cur; cur = cur->child) { if (cur->depth == priv->dh.nb_levels - 1) break; ret = mg_interdomain_setup(ctx, cur); if (ret < 0) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Error setting up inter-level transfers\n"); return ret; } } return ret; } static void log_default_callback(const MG2DContext *ctx, int level, const char *fmt, va_list vl) { if (level > ctx->log_level) return; vfprintf(stderr, fmt, vl); } static MG2DContext *solver_alloc(DomainGeometry *dg, unsigned int local_component) { const size_t *domain_size = dg->components[local_component].interior.size; MG2DContext *ctx; MG2DInternal *priv; int ret; if (dg->domain_size[0] < 3 || dg->domain_size[1] < 3 || SIZE_MAX / dg->domain_size[0] < dg->domain_size[1]) return NULL; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return NULL; ctx->priv = calloc(1, sizeof(*ctx->priv)); if (!ctx->priv) goto fail; priv = ctx->priv; priv->dg = dg; priv->local_component = local_component; priv->logger.log = log_callback; priv->logger.opaque = ctx; priv->cpuflags = mg2di_cpu_flags_get(); for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) { const int ci = mg2d_bnd_coord_idx(i); ctx->boundaries[i] = 
mg2di_bc_alloc(domain_size[!ci]); if (!ctx->boundaries[i]) { goto fail; } } ret = ndarray_alloc(&priv->u, 2, (size_t [2]){ domain_size[1], domain_size[0] }, NDARRAY_ALLOC_ZERO); if (ret < 0) goto fail; ctx->u = priv->u->data; ctx->u_stride = priv->u->stride[0]; ret = ndarray_alloc(&priv->rhs, 2, (size_t [2]){ domain_size[1], domain_size[0] }, NDARRAY_ALLOC_ZERO); if (ret < 0) goto fail; ctx->rhs = priv->rhs->data; ctx->rhs_stride = priv->rhs->stride[0]; for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { MG2DDiffCoeffs *dc; const NDASlice slice[2] = { NDASLICE(FD_STENCIL_MAX, -FD_STENCIL_MAX, 1), NDASLICE(FD_STENCIL_MAX, -FD_STENCIL_MAX, 1) }; ret = ndarray_alloc(&priv->diff_coeffs_base[i], 2, (size_t [2]){ domain_size[1] + 2 * FD_STENCIL_MAX, domain_size[0] + 2 * FD_STENCIL_MAX }, NDARRAY_ALLOC_ZERO); if (ret < 0) goto fail; ret = ndarray_slice(&priv->diff_coeffs_tmp[i], priv->diff_coeffs_base[i], slice); if (ret < 0) goto fail; dc = calloc(1, sizeof(*dc)); if (!dc) goto fail; ctx->diff_coeffs[i] = dc; dc->data = priv->diff_coeffs_tmp[i]->data; dc->stride = priv->diff_coeffs_tmp[i]->stride[0]; for (int bnd_idx = 0; bnd_idx < ARRAY_ELEMS(dc->boundaries); bnd_idx++) { const int ci = mg2d_bnd_coord_idx(bnd_idx); ret = ndarray_alloc(&priv->dc_bnd_val[i][bnd_idx], 1, (size_t [1]){ dg->domain_size[!ci] + 2 * FD_STENCIL_MAX }, NDARRAY_ALLOC_ZERO); if (ret < 0) goto fail; dc->boundaries[bnd_idx].val = priv->dc_bnd_val[i][bnd_idx]->data + FD_STENCIL_MAX + dg->components[local_component].interior.start[!ci]; } } ctx->max_levels = 16; ctx->max_exact_size = 5; ctx->maxiter = 16; ctx->tol = 1e-12; ctx->nb_cycles = 1; ctx->nb_relax_pre = 1; ctx->nb_relax_post = 1; ctx->log_callback = log_default_callback; ctx->log_level = MG2D_LOG_INFO; ctx->nb_threads = 1; mg2di_timer_init(&priv->timer_levels_init); mg2di_timer_init(&priv->timer_solve); return ctx; fail: mg2d_solver_free(&ctx); return NULL; } MG2DContext *mg2d_solver_alloc(size_t domain_size) { MG2DContext *ctx; DomainGeometry *dg; dg = mg2di_dg_alloc(1); if (!dg) return NULL; dg->domain_size[0] = domain_size; dg->domain_size[1] = domain_size; dg->components[0].interior.start[0] = 0; dg->components[0].interior.start[1] = 0; dg->components[0].interior.size[0] = domain_size; dg->components[0].interior.size[1] = domain_size; dg->components[0].exterior.start[0] = -FD_STENCIL_MAX; dg->components[0].exterior.start[1] = -FD_STENCIL_MAX; dg->components[0].exterior.size[0] = domain_size + 2 * FD_STENCIL_MAX; dg->components[0].exterior.size[1] = domain_size + 2 * FD_STENCIL_MAX; for (int i = 0; i < 4; i++) dg->components[0].bnd_is_outer[i] = 1; ctx = solver_alloc(dg, 0); if (!ctx) goto fail; ctx->priv->mpi_comm = MPI_COMM_NULL; ctx->domain_size = domain_size; ctx->local_start[0] = 0; ctx->local_start[1] = 0; ctx->local_size[0] = domain_size; ctx->local_size[1] = domain_size; return ctx; fail: mg2di_dg_free(&dg); return NULL; } MG2DContext *mg2d_solver_alloc_mpi(MPI_Comm comm, const size_t local_start[2], const size_t local_size[2]) { MG2DContext *ctx = NULL; DomainGeometry *dg = NULL; size_t *domainspec = NULL; int nb_processes, rank; MPI_Comm_size(comm, &nb_processes); MPI_Comm_rank(comm, &rank); dg = mg2di_dg_alloc(nb_processes); if (!dg) goto fail; domainspec = calloc(4 * nb_processes, sizeof(*domainspec)); if (!domainspec) goto fail; domainspec[4 * rank + 0] = local_start[0]; domainspec[4 * rank + 1] = local_start[1]; domainspec[4 * rank + 2] = local_size[0]; domainspec[4 * rank + 3] = local_size[1]; MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, 
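/* in-place allgather: each rank's own 4-element slot of domainspec (start[0], start[1], size[0], size[1]) acts as its send buffer, so afterwards every rank holds the full decomposition */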
domainspec, 4 * sizeof(*domainspec), MPI_BYTE, comm); for (unsigned int proc = 0; proc < nb_processes; proc++) { size_t *proc_start = domainspec + 4 * proc; size_t *proc_size = domainspec + 4 * proc + 2; dg->components[proc].interior.start[0] = proc_start[0]; dg->components[proc].interior.start[1] = proc_start[1]; dg->components[proc].interior.size[0] = proc_size[0]; dg->components[proc].interior.size[1] = proc_size[1]; dg->domain_size[0] = MAX(dg->domain_size[0], proc_start[0] + proc_size[0]); dg->domain_size[1] = MAX(dg->domain_size[1], proc_start[1] + proc_size[1]); } mg2di_assert(dg->domain_size[0] == dg->domain_size[1]); for (unsigned int proc = 0; proc < nb_processes; proc++) { DomainComponent *dc = &dg->components[proc]; dc->exterior = dc->interior; for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++) { const int ci = mg2d_bnd_coord_idx(bnd_idx); const int upper = mg2d_bnd_is_upper(bnd_idx); if (upper && dc->interior.start[ci] + dc->interior.size[ci] == dg->domain_size[ci]) dc->exterior.size[ci] += FD_STENCIL_MAX; if (!upper && dc->interior.start[ci] == 0) { dc->exterior.start[ci] -= FD_STENCIL_MAX; dc->exterior.size[ci] += FD_STENCIL_MAX; } } dc->bnd_is_outer[MG2D_BOUNDARY_0L] = dc->interior.start[0] == 0; dc->bnd_is_outer[MG2D_BOUNDARY_1L] = dc->interior.start[1] == 0; dc->bnd_is_outer[MG2D_BOUNDARY_0U] = (dc->interior.start[0] + dc->interior.size[0]) == dg->domain_size[0]; dc->bnd_is_outer[MG2D_BOUNDARY_1U] = (dc->interior.start[1] + dc->interior.size[1]) == dg->domain_size[1]; } free(domainspec); ctx = solver_alloc(dg, rank); if (!ctx) return NULL; ctx->priv->mpi_comm = comm; ctx->domain_size = dg->domain_size[0]; ctx->local_start[0] = local_start[0]; ctx->local_start[1] = local_start[1]; ctx->local_size[0] = local_size[0]; ctx->local_size[1] = local_size[1]; return ctx; fail: mg2d_solver_free(&ctx); mg2di_dg_free(&dg); free(domainspec); return NULL; } void mg2d_solver_free(MG2DContext **pctx) { MG2DContext *ctx = *pctx; if (!ctx) return; while (ctx->priv->root) { MG2DLevel *next = ctx->priv->root->child; mg_level_free(&ctx->priv->root); ctx->priv->root = next; } for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) mg2di_bc_free(&ctx->boundaries[i]); tp_free(&ctx->priv->tp); mg2di_gt_free(&ctx->priv->transfer_init); ndarray_free(&ctx->priv->u); ndarray_free(&ctx->priv->rhs); for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs_base); i++) { for (int j = 0; j < ARRAY_ELEMS(ctx->priv->dc_bnd_val[i]); j++) ndarray_free(&ctx->priv->dc_bnd_val[i][j]); free(ctx->diff_coeffs[i]); ndarray_free(&ctx->priv->diff_coeffs_tmp[i]); ndarray_free(&ctx->priv->diff_coeffs_base[i]); } mg2di_dg_free(&ctx->priv->dg); mg_dh_uninit(&ctx->priv->dh); free(ctx->priv); free(ctx); *pctx = NULL; } void mg2d_print_stats(MG2DContext *ctx, const char *prefix) { MG2DInternal *priv = ctx->priv; MG2DLevel *level = priv->root; int64_t other, levels_total = 0; if (!level) return; if (!prefix) prefix = ""; mg2di_log(&priv->logger, ctx->log_level, "%s%ld solves; %g s total time; %g ms avg per call; %g avg cycles per solve\n", prefix, priv->timer_solve.nb_runs, priv->timer_solve.time_nsec / 1e9, priv->timer_solve.time_nsec / 1e6 / priv->timer_solve.nb_runs, (double)level->count_cycles / priv->timer_solve.nb_runs); while (level) { char buf[1024], *p; int ret; EGSRelaxContext *r = level->solver->relax; EGSExactContext *e = level->solver->exact; int64_t level_total = level->timer_solve.time_nsec + level->timer_prolong.time_nsec + level->timer_restrict.time_nsec + level->timer_correct.time_nsec + 
level->timer_reinit.time_nsec + level->timer_mpi_sync.time_nsec; if (!level->count_cycles) { level = level->child; continue; } levels_total += level_total; p = buf; ret = snprintf(p, sizeof(buf) - (p - buf), "%2.2f%% level %d: %ld cycles %g s total time %g ms avg per call", level_total * 100.0 / priv->timer_solve.time_nsec, level->depth, level->count_cycles, level_total / 1e9, level_total / 1e6 / level->count_cycles); if (ret > 0) p += ret; ret = snprintf(p, sizeof(buf) - (p - buf), "||%2.2f%% solve %2.2f%% reinit ", level->timer_solve.time_nsec * 100.0 / level_total, level->timer_reinit.time_nsec * 100.0 / level_total); if (ret > 0) p += ret; if (level->timer_prolong.nb_runs) { ret = snprintf(p, sizeof(buf) - (p - buf), "%2.2f%% prolong %2.2f%% restrict %2.2f%% correct", level->timer_prolong.time_nsec * 100.0 / level_total, level->timer_restrict.time_nsec * 100.0 / level_total, level->timer_correct.time_nsec * 100.0 / level_total); if (ret > 0) p += ret; } if (level->timer_mpi_sync.nb_runs) { ret = snprintf(p, sizeof(buf) - (p - buf), " %2.2f%% sync", level->timer_mpi_sync.time_nsec * 100.0 / level_total); if (ret > 0) p += ret; } ret = snprintf(p, sizeof(buf) - (p - buf), "||%2.2f%% init %2.2f%% residual %2.2f%% boundaries (%2.2f%% fixval %2.2f%% reflect %2.2f%% falloff %2.2f%% corners)", level->solver->timer_init.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, level->solver->timer_res_calc.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, level->solver->timer_bnd.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, level->solver->timer_bnd_fixval.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, level->solver->timer_bnd_reflect.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, level->solver->timer_bnd_falloff.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, level->solver->timer_bnd_corners.time_nsec * 100.0 / level->solver->timer_solve.time_nsec); if (ret > 0) p += ret; if (r->timer_correct.nb_runs) { ret = snprintf(p, sizeof(buf) - (p - buf), " %2.2f%% correct", r->timer_correct.time_nsec * 100.0 / level->solver->timer_solve.time_nsec); if (ret > 0) p += ret; } else if (e->timer_mat_construct.nb_runs) { ret = snprintf(p, sizeof(buf) - (p - buf), " %2.2f%% const %2.2f%% bicgstab (%ld; %g it/slv) %2.2f%% lu (%ld) %2.2f%% export", e->timer_mat_construct.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, e->timer_bicgstab.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, e->timer_bicgstab.nb_runs, (double)e->bicgstab_iterations / e->timer_bicgstab.nb_runs, e->timer_lu_solve.time_nsec * 100.0 / level->solver->timer_solve.time_nsec, e->timer_lu_solve.nb_runs, e->timer_export.time_nsec * 100.0 / level->solver->timer_solve.time_nsec); if (ret > 0) p += ret; } if (level->solver->timer_mpi_sync.nb_runs) { ret = snprintf(p, sizeof(buf) - (p - buf), " %2.2f%% sync", level->solver->timer_mpi_sync.time_nsec * 100.0 / level->solver->timer_solve.time_nsec); if (ret > 0) p += ret; } mg2di_log(&priv->logger, ctx->log_level, "%s%s\n", prefix, buf); level = level->child; } mg2di_log(&priv->logger, ctx->log_level, "%s%2.2f%% levels init %g s total time %g ms avg per call\n", prefix, priv->timer_levels_init.time_nsec * 100.0 / priv->timer_solve.time_nsec, priv->timer_levels_init.time_nsec / 1e9, priv->timer_levels_init.time_nsec / 1e6 / priv->timer_levels_init.nb_runs); other = priv->timer_solve.time_nsec - levels_total - priv->timer_levels_init.time_nsec; mg2di_log(&priv->logger, ctx->log_level, "%s%2.2f%% other %g s total time %g ms avg per 
call\n", prefix, other * 100.0 / priv->timer_solve.time_nsec, other / 1e9, other / 1e6 / priv->timer_solve.nb_runs); } unsigned int mg2d_max_fd_stencil(void) { return FD_STENCIL_MAX; } int mg2d_init_guess(MG2DContext *ctx, const double *src, ptrdiff_t src_stride, const ptrdiff_t src_start[2], const size_t src_size[2], const double src_step[2]) { MG2DInternal *priv = ctx->priv; NDArray *a_src; int ret; if (!priv->tp) { ret = threadpool_init(ctx); if (ret < 0) return ret; } if (priv->transfer_init && (priv->transfer_init->src.size[0] != src_size[0] || priv->transfer_init->src.size[1] != src_size[1] || fabs(priv->transfer_init->src.step[0] - src_step[0]) > 1e-15 || fabs(priv->transfer_init->src.step[1] - src_step[1]) > 1e-15)) { mg2di_gt_free(&priv->transfer_init); } if (!priv->transfer_init) { priv->transfer_init = mg2di_gt_alloc(2, GRID_TRANSFER_LAGRANGE_3); if (!priv->transfer_init) return -ENOMEM; priv->transfer_init->tp = priv->tp; priv->transfer_init->cpuflags = priv->cpuflags; priv->transfer_init->src.start[0] = src_start[1]; priv->transfer_init->src.start[1] = src_start[0]; priv->transfer_init->src.size[0] = src_size[1]; priv->transfer_init->src.size[1] = src_size[0]; priv->transfer_init->src.step[0] = src_step[1]; priv->transfer_init->src.step[1] = src_step[0]; priv->transfer_init->dst.start[0] = ctx->local_start[1]; priv->transfer_init->dst.start[1] = ctx->local_start[0]; priv->transfer_init->dst.size[0] = ctx->local_size[1]; priv->transfer_init->dst.size[1] = ctx->local_size[0]; priv->transfer_init->dst.step[0] = ctx->step[1]; priv->transfer_init->dst.step[1] = ctx->step[0]; ret = mg2di_gt_init(priv->transfer_init); if (ret < 0) return ret; } ret = ndarray_wrap(&a_src, 2, (size_t [2]){ src_size[1], src_size[0] }, src, (ptrdiff_t [2]){ src_stride, 1 }); if (ret < 0) return ret; ret = mg2di_gt_transfer(priv->transfer_init, priv->u ? priv->u : priv->root->solver->u, a_src); ndarray_free(&a_src); return ret; }