From 2bd55ca0e3fb500bff5cf5ceb36a80a196f0f29d Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 18 Jun 2019 12:00:45 +0200 Subject: implement several relax steps per sync --- ell_grid_solve.c | 135 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 105 insertions(+), 30 deletions(-) diff --git a/ell_grid_solve.c b/ell_grid_solve.c index 996d450..97b8688 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -91,8 +91,10 @@ struct EGSInternal { NDArray *diff_coeffs_base; NDArray *diff_coeffs[MG2D_DIFF_COEFF_NB]; - ptrdiff_t residual_calc_offset[2]; - size_t residual_calc_size[2]; + ptrdiff_t (*residual_calc_offset)[2]; + size_t (*residual_calc_size)[2]; + ptrdiff_t bnd_calc_offset[2]; + size_t bnd_calc_size[2]; int bnd_zero[4]; int reflect_skip; @@ -110,6 +112,9 @@ struct EGSInternal { size_t fd_stencil; + unsigned int steps_per_sync; + unsigned int steps_since_sync; + int *sync_sendcounts; int *sync_senddispl; MPI_Datatype *sync_sendtypes; @@ -141,38 +146,40 @@ static void boundaries_sync(EGSContext *ctx, NDArray *a_dst) { EGSInternal *priv = ctx->priv; - mg2di_timer_start(&ctx->timer_mpi_sync); + if (priv->dg->nb_components > 1) { + mg2di_timer_start(&ctx->timer_mpi_sync); - MPI_Alltoallw(a_dst->data, priv->sync_sendcounts, priv->sync_senddispl, priv->sync_sendtypes, - a_dst->data, priv->sync_recvcounts, priv->sync_recvdispl, priv->sync_recvtypes, - priv->comm); + MPI_Alltoallw(a_dst->data, priv->sync_sendcounts, priv->sync_senddispl, priv->sync_sendtypes, + a_dst->data, priv->sync_recvcounts, priv->sync_recvdispl, priv->sync_recvtypes, + priv->comm); - mg2di_timer_stop(&ctx->timer_mpi_sync); + mg2di_timer_stop(&ctx->timer_mpi_sync); + } } static void residual_calc(EGSContext *ctx, int export_res) { EGSInternal *priv = ctx->priv; const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; + ptrdiff_t *offset = priv->residual_calc_offset[priv->steps_since_sync]; + size_t *size = priv->residual_calc_size[priv->steps_since_sync]; + NDArray *dst = export_res ? ctx->residual : priv->u_next; int reflect_flags = 0; mg2di_timer_start(&ctx->timer_res_calc); for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = NDPTR2D(priv->diff_coeffs[i], priv->residual_calc_offset[1], priv->residual_calc_offset[0]); + diff_coeffs[i] = NDPTR2D(priv->diff_coeffs[i], offset[0], offset[1]); for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++) if (ctx->boundaries[bnd_idx]->type == MG2D_BC_TYPE_REFLECT && priv->dg->components[priv->local_component].bnd_is_outer[bnd_idx]) reflect_flags |= 1 << bnd_idx; - mg2di_residual_calc(priv->rescalc, priv->residual_calc_size, &ctx->residual_max, - NDPTR2D(export_res ? ctx->residual : priv->u_next, priv->residual_calc_offset[1], priv->residual_calc_offset[0]), - export_res ? ctx->residual->stride[0] : priv->u_next->stride[0], - NDPTR2D(ctx->u, priv->residual_calc_offset[1], priv->residual_calc_offset[0]), - ctx->u->stride[0], - NDPTR2D(ctx->rhs, priv->residual_calc_offset[1], priv->residual_calc_offset[0]), - ctx->rhs->stride[0], + mg2di_residual_calc(priv->rescalc, size, &ctx->residual_max, + NDPTR2D(dst, offset[0], offset[1]), dst->stride[0], + NDPTR2D(ctx->u, offset[0], offset[1]), ctx->u->stride[0], + NDPTR2D(ctx->rhs, offset[0], offset[1]), ctx->rhs->stride[0], diff_coeffs, priv->diff_coeffs[0]->stride[0], export_res ? 0.0 : 1.0, export_res ? 1.0 : priv->r.relax_factor, reflect_flags, FD_STENCIL_MAX); @@ -257,9 +264,10 @@ static void boundaries_apply(EGSContext *ctx, NDArray *a_dst, int init) const ptrdiff_t stride_boundary = strides[!ci]; const ptrdiff_t stride_offset = strides[ci]; const size_t size_boundary = ctx->domain_size[!ci]; + //const size_t size_boundary = priv->bnd_calc_size[!ci]; const size_t size_offset = ctx->domain_size[ci]; - double *dst = a_dst->data + mg2d_bnd_is_upper(i) * ((size_offset - 1) * stride_offset); + double *dst = a_dst->data + mg2d_bnd_is_upper(i) * ((size_offset - 1) * stride_offset);// + priv->bnd_calc_offset[!ci] * stride_boundary; const ptrdiff_t dst_strides[] = { stride_boundary, mg2d_bnd_out_dir(i) * stride_offset }; if (bnd->type != bnd_type_order[order_idx] || @@ -335,9 +343,6 @@ static void boundaries_apply(EGSContext *ctx, NDArray *a_dst, int init) } mg2di_timer_stop(&ctx->timer_bnd_corners); - if (priv->dg->nb_components > 1) - boundaries_sync(ctx, a_dst); - mg2di_timer_stop(&ctx->timer_bnd); } @@ -399,15 +404,26 @@ static int solve_relax_step(EGSContext *ctx, int export_res) priv->reflect_skip = 0; boundaries_apply(ctx, ctx->u, 0); + + priv->steps_since_sync++; + if (priv->steps_since_sync >= priv->steps_per_sync) { + boundaries_sync(ctx, ctx->u); + priv->steps_since_sync = 0; + } } residual_calc(ctx, 1); - if (priv->dg->nb_components > 1) - boundaries_sync(ctx, ctx->residual); + boundaries_sync(ctx, ctx->residual); } else { mg2di_assert(u_next_valid); residual_calc(ctx, 0); boundaries_apply(ctx, priv->u_next, 0); + + priv->steps_since_sync++; + if (priv->steps_since_sync >= priv->steps_per_sync) { + boundaries_sync(ctx, priv->u_next); + priv->steps_since_sync = 0; + } priv->u_next_valid = 1; } @@ -863,7 +879,7 @@ static int init_mpi_sync(EGSContext *ctx) } ret = mg2di_dg_edge_overlaps(overlaps_recv, overlaps_send, - dg, priv->local_component, FD_STENCIL_MAX); + dg, priv->local_component, ctx->fd_stencil * priv->steps_per_sync); if (ret < 0) goto fail; @@ -1005,12 +1021,12 @@ int mg2di_egs_init(EGSContext *ctx, int flags) goto finish; } - priv->residual_calc_size[0] = ctx->domain_size[0]; - priv->residual_calc_size[1] = ctx->domain_size[1]; - priv->residual_calc_offset[0] = ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL && - dc->bnd_is_outer[MG2D_BOUNDARY_1L]; - priv->residual_calc_offset[1] = ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL && - dc->bnd_is_outer[MG2D_BOUNDARY_0L]; + priv->residual_calc_size[priv->steps_per_sync - 1][0] = ctx->domain_size[0]; + priv->residual_calc_size[priv->steps_per_sync - 1][1] = ctx->domain_size[1]; + priv->residual_calc_offset[priv->steps_per_sync - 1][0] = ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL && + dc->bnd_is_outer[MG2D_BOUNDARY_0L]; + priv->residual_calc_offset[priv->steps_per_sync - 1][1] = ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL && + dc->bnd_is_outer[MG2D_BOUNDARY_1L]; for (int bnd_idx = 0; bnd_idx < ARRAY_ELEMS(ctx->boundaries); bnd_idx++) { MG2DBoundary *bnd = ctx->boundaries[bnd_idx]; @@ -1031,7 +1047,27 @@ int mg2di_egs_init(EGSContext *ctx, int flags) priv->bnd_zero[bnd_idx] = maxval == 0.0; - priv->residual_calc_size[ci]--; + priv->residual_calc_size[priv->steps_per_sync - 1][ci]--; + } + } + + for (int step = priv->steps_per_sync - 2; step >= 0; step--) { + const int buffer_size = ctx->fd_stencil; + int ic_0l = !dc->bnd_is_outer[MG2D_BOUNDARY_0L]; + int ic_0u = !dc->bnd_is_outer[MG2D_BOUNDARY_0U]; + int ic_1l = !dc->bnd_is_outer[MG2D_BOUNDARY_1L]; + int ic_1u = !dc->bnd_is_outer[MG2D_BOUNDARY_1U]; + + priv->residual_calc_offset[step][0] = priv->residual_calc_offset[step + 1][0] - buffer_size * ic_0l; + priv->residual_calc_offset[step][1] = priv->residual_calc_offset[step + 1][1] - buffer_size * ic_1l; + priv->residual_calc_size[step][0] = priv->residual_calc_size[step + 1][0] + buffer_size * (ic_0l + ic_0u); + priv->residual_calc_size[step][1] = priv->residual_calc_size[step + 1][1] + buffer_size * (ic_1l + ic_1u); + + if (step == 0) { + priv->bnd_calc_offset[0] = -buffer_size * ic_0l; + priv->bnd_calc_offset[1] = -buffer_size * ic_1l; + priv->bnd_calc_size[0] = ctx->domain_size[0] + buffer_size * (ic_0l + ic_0u); + priv->bnd_calc_size[1] = ctx->domain_size[1] + buffer_size * (ic_1l + ic_1u); } } @@ -1049,11 +1085,27 @@ finish: if (ret >= 0) { priv->reflect_skip = 0; + if (!(flags & EGS_INIT_FLAG_SAME_DIFF_COEFFS)) { + for (int dc_idx = 0; dc_idx < ARRAY_ELEMS(priv->diff_coeffs); dc_idx++) + boundaries_sync(ctx, priv->diff_coeffs[dc_idx]); + } + boundaries_sync(ctx, ctx->rhs); + boundaries_apply(ctx, ctx->u, 1); + boundaries_sync(ctx, ctx->u); + priv->steps_since_sync = 0; + boundaries_apply(ctx, priv->u_next, 1); + boundaries_sync(ctx, priv->u_next); residual_calc(ctx, 0); boundaries_apply(ctx, priv->u_next, 0); + + priv->steps_since_sync++; + if (priv->steps_since_sync >= priv->steps_per_sync) { + boundaries_sync(ctx, priv->u_next); + priv->steps_since_sync = 0; + } priv->u_next_valid = 1; } @@ -1077,7 +1129,7 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2]) int ret; for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++) - extra_points[bnd_idx] = dc->bnd_is_outer[bnd_idx] ? 0 : FD_STENCIL_MAX; + extra_points[bnd_idx] = dc->bnd_is_outer[bnd_idx] ? 0 : FD_STENCIL_MAX * priv->steps_per_sync; size_alloc[0] = dc->exterior.size[1] + extra_points[MG2D_BOUNDARY_1L] + extra_points[MG2D_BOUNDARY_1U]; size_alloc[1] = dc->exterior.size[0] + extra_points[MG2D_BOUNDARY_0L] + extra_points[MG2D_BOUNDARY_0U]; @@ -1206,6 +1258,26 @@ static EGSContext *egs_alloc(const DomainGeometry *dg, unsigned int local_compon ctx->priv->local_component = local_component; ctx->priv->comm = MPI_COMM_NULL; + if (dg->nb_components > 1) { + char *steps = getenv("MG2D_STEPS_PER_SYNC"); + if (steps) + ctx->priv->steps_per_sync = strtol(steps, NULL, 0); + + if (ctx->priv->steps_per_sync <= 0) + ctx->priv->steps_per_sync = 2; + for (int i = 0; i < dg->nb_components; i++) { + if (dg->components[i].interior.size[0] < ctx->priv->steps_per_sync * FD_STENCIL_MAX || + dg->components[i].interior.size[1] < ctx->priv->steps_per_sync * FD_STENCIL_MAX) { + ctx->priv->steps_per_sync = 1; + break; + } + } + } else + ctx->priv->steps_per_sync = 1; + + ctx->priv->residual_calc_offset = calloc(ctx->priv->steps_per_sync, sizeof(*ctx->priv->residual_calc_offset)); + ctx->priv->residual_calc_size = calloc(ctx->priv->steps_per_sync, sizeof(*ctx->priv->residual_calc_size)); + mg2di_timer_init(&ctx->exact->timer_mat_construct); mg2di_timer_init(&ctx->exact->timer_bicgstab); mg2di_timer_init(&ctx->exact->timer_lu_solve); @@ -1351,6 +1423,9 @@ void mg2di_egs_free(EGSContext **pctx) free(ctx->priv->sync_recvcounts); free(ctx->priv->sync_recvdispl); + free(ctx->priv->residual_calc_offset); + free(ctx->priv->residual_calc_size); + mg2di_dg_free(&ctx->priv->dg); tp_free(&ctx->priv->tp_internal); -- cgit v1.2.3