/* * Multigrid solver for a 2nd order 2D linear PDE. * Copyright 2018 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include "common.h" #include "cpu.h" #include "ell_grid_solve.h" #include "log.h" #include "mg2d.h" #include "mg2d_boundary.h" #include "mg2d_boundary_internal.h" #include "mg2d_constants.h" #include "ndarray.h" typedef struct MG2DLevel { unsigned int depth; EGSContext *solver; double *prolong_tmp; ptrdiff_t prolong_tmp_stride; struct MG2DLevel *child; /* timings */ int64_t count_cycles; int64_t time_relax; int64_t time_prolong; int64_t time_restrict; int64_t time_correct; int64_t time_reinit; } MG2DLevel; struct MG2DInternal { MG2DLogger logger; TPContext *tp; MG2DLevel *root; int cpuflags; int64_t time_solve; int64_t count_solve; }; static void log_callback(MG2DLogger *log, int level, const char *fmt, va_list vl) { MG2DContext *ctx = log->opaque; ctx->log_callback(ctx, level, fmt, vl); } static void log_relax_step(MG2DContext *ctx, MG2DLevel *level, const char *step_desc, double res_old, double res_new) { char prefix[32] = { 0 }; for (int i = 0; i < MIN(sizeof(prefix) - 1, level->depth); i++) prefix[i] = ' '; mg2di_log(&ctx->priv->logger, MG2D_LOG_DEBUG, "%s[%d]%s %g->%g (%g)\n", prefix, level->depth, step_desc, res_old, res_new, res_old / res_new); } static void mg_restrict_inject(EGSContext *coarse, double *dst, ptrdiff_t dst_stride, EGSContext *fine, const double *src, ptrdiff_t src_stride) { for (size_t j = 0; j < coarse->domain_size[1]; j++) for (size_t i = 0; i < coarse->domain_size[0]; i++) { const ptrdiff_t idx_dst = j * dst_stride + i; const size_t fine_i = i * 2; const size_t fine_j = j * 2; const ptrdiff_t idx_src = fine_j * src_stride + fine_i; dst[idx_dst] = src[idx_src]; } } static void mg_restrict_fw(EGSContext *coarse, double *dst, ptrdiff_t dst_stride, EGSContext *fine, const double *src, ptrdiff_t src_stride) { for (size_t j = 0; j < coarse->domain_size[1]; j++) for (size_t i = 0; i < coarse->domain_size[0]; i++) { const ptrdiff_t idx_dst = j * dst_stride + i; const size_t fine_i = i * 2; const size_t fine_j = j * 2; const ptrdiff_t idx_src = fine_j * src_stride + fine_i; dst[idx_dst] = 0.25 * src[idx_src] + 0.125 * (src[idx_src + 1] + src[idx_src - 1] + src[idx_src + src_stride] + src[idx_src - src_stride]) + 0.0625 * (src[idx_src + 1 + src_stride] + src[idx_src - 1 + src_stride] + src[idx_src + 1 - src_stride] + src[idx_src - 1 - src_stride]); } } static void mg_prolongate(EGSContext *fine, double *dst, ptrdiff_t dst_stride, EGSContext *coarse, const double *src, ptrdiff_t src_stride) { // for simplicity, this code writes one point over the domain size; // this allows us to avoid treating the boundary layers specially // dst is allocated with extra ghost zones for this purpose for (size_t j = 0; j < coarse->domain_size[1]; j++) for (size_t i = 0; i < coarse->domain_size[0]; i++) { const ptrdiff_t idx_src = j * src_stride + i; const ptrdiff_t idx_dst = 2 * (j * dst_stride + i); const double src00 = src[idx_src]; const double src10 = src[idx_src + 1]; const double src01 = src[idx_src + src_stride]; const double src11 = src[idx_src + src_stride + 1]; dst[idx_dst] = src00; dst[idx_dst + 1] = 0.5 * (src00 + src10); dst[idx_dst + dst_stride] = 0.5 * (src00 + src01); dst[idx_dst + dst_stride + 1] = 0.25 * (src00 + src10 + src01 + src11); } } static void mg_prolongate_bicubic(EGSContext *fine, double *dst, ptrdiff_t dst_stride, EGSContext *coarse, const double *src, ptrdiff_t src_stride) { // for simplicity, this code writes one point over the domain size; // this allows us to avoid treating the boundary layers specially // dst is allocated with extra ghost zones for this purpose for (size_t j = 0; j < coarse->domain_size[1]; j++) for (size_t i = 0; i < coarse->domain_size[0]; i++) { const ptrdiff_t idx_src = j * src_stride + i; const ptrdiff_t idx_dst = 2 * (j * dst_stride + i); const double src00 = src[idx_src]; const double srcm10 = src[idx_src - 1]; const double srcp10 = src[idx_src + 1]; const double srcp20 = src[idx_src + 2]; const double srcm1m1 = src[idx_src - src_stride - 1]; const double src0m1 = src[idx_src - src_stride]; const double srcp1m1 = src[idx_src - src_stride + 1]; const double srcp2m1 = src[idx_src - src_stride + 2]; const double srcm1p1 = src[idx_src + src_stride - 1]; const double src0p1 = src[idx_src + src_stride]; const double srcp1p1 = src[idx_src + src_stride + 1]; const double srcp2p1 = src[idx_src + src_stride + 2]; const double srcm1p2 = src[idx_src + 2 * src_stride - 1]; const double src0p2 = src[idx_src + 2 * src_stride]; const double srcp1p2 = src[idx_src + 2 * src_stride + 1]; const double srcp2p2 = src[idx_src + 2 * src_stride + 2]; dst[idx_dst] = src00; dst[idx_dst + 1] = (-1.0 * srcm10 + 9.0 * src00 + 9.0 * srcp10 - 1.0 * srcp20) / 16.0; dst[idx_dst + dst_stride] = (-1.0 * src0m1 + 9.0 * src00 + 9.0 * src0p1 - 1.0 * src0p2) / 16.0; dst[idx_dst + dst_stride + 1] = (- 1.0 * (-1.0 * srcm1m1 + 9.0 * src0m1 + 9.0 * srcp1m1 - 1.0 * srcp2m1) + 9.0 * (-1.0 * srcm10 + 9.0 * src00 + 9.0 * srcp10 - 1.0 * srcp20) + 9.0 * (-1.0 * srcm1p1 + 9.0 * src0p1 + 9.0 * srcp1p1 - 1.0 * srcp2p1) - 1.0 * (-1.0 * srcm1p2 + 9.0 * src0p2 + 9.0 * srcp1p2 - 1.0 * srcp2p2)) / (16.0 * 16.0); } } typedef struct InterpParamsBilin { size_t dst_domain_size[2]; ptrdiff_t src_stride; ptrdiff_t dst_stride; const double *src; double *dst; double dx_dst; double dy_dst; double dx_src; double dy_src; double scalex; double scaley; } InterpParamsBilin; static int mg_interp_bilinear_kernel(void *arg, unsigned int job_idx, unsigned int thread_idx) { InterpParamsBilin *ip = arg; unsigned int idx1_dst = job_idx; double *dst = ip->dst; const double *src = ip->src; const double dx_dst = ip->dx_dst; const double dy_dst = ip->dy_dst; const double dx_src = ip->dx_src; const double dy_src = ip->dy_src; const double scalex = ip->scalex; const double scaley = ip->scaley; const ptrdiff_t src_stride = ip->src_stride; const ptrdiff_t dst_stride = ip->dst_stride; const double y = idx1_dst * dy_dst; const size_t idx1_src = idx1_dst * scaley; const double y0 = idx1_src * dy_src; const double y1 = y0 + dy_src; const double fact_y0 = (y1 - y) / dy_src; const double fact_y1 = (y - y0) / dy_src; for (size_t idx0_dst = 0; idx0_dst < ip->dst_domain_size[0]; idx0_dst++) { const double x = idx0_dst * dx_dst; const size_t idx0_src = idx0_dst * scalex; const size_t idx_src = idx1_src * src_stride + idx0_src; const double x0 = idx0_src * dx_src; const double x1 = x0 + dx_src; const double fact_x0 = (x1 - x) / dx_src; const double fact_x1 = (x - x0) / dx_src; const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) + fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]); dst[idx1_dst * dst_stride + idx0_dst] = val; } return 0; } static void mg_interp_bilinear(TPContext *tp, EGSContext *ctx_dst, double *dst, ptrdiff_t dst_stride, EGSContext *ctx_src, const double *src, ptrdiff_t src_stride) { InterpParamsBilin ip; ip.dst_domain_size[0] = ctx_dst->domain_size[0]; ip.dst_domain_size[1] = ctx_dst->domain_size[1]; ip.src_stride = src_stride; ip.dst_stride = dst_stride; ip.dx_dst = ctx_dst->step[0]; ip.dy_dst = ctx_dst->step[1]; ip.dx_src = ctx_src->step[0]; ip.dy_src = ctx_src->step[1]; ip.scalex = ip.dx_dst / ip.dx_src; ip.scaley = ip.dy_dst / ip.dy_src; ip.src = src; ip.dst = dst; tp_execute(tp, ctx_dst->domain_size[1], mg_interp_bilinear_kernel, &ip); } static int mg_interp_bicubic_kernel(void *arg, unsigned int job_idx, unsigned int thread_idx) { InterpParamsBilin *ip = arg; unsigned int idx1_dst = job_idx; double *dst = ip->dst; const double *src = ip->src; const double dx_dst = ip->dx_dst; const double dy_dst = ip->dy_dst; const double dx_src = ip->dx_src; const double dy_src = ip->dy_src; const double scalex = ip->scalex; const double scaley = ip->scaley; const ptrdiff_t src_stride = ip->src_stride; const ptrdiff_t dst_stride = ip->dst_stride; const double y = idx1_dst * dy_dst; const size_t idx1_src = idx1_dst * scaley; const double ym1 = idx1_src * dy_src; const double ym2 = ym1 - dy_src; const double yp1 = ym1 + dy_src; const double yp2 = yp1 + dy_src; const double fact_ym2 = (y - ym1) * (y - yp1) * (y - yp2) / ( (ym2 - ym1) * (ym2 - yp1) * (ym2 - yp2)); const double fact_ym1 = (y - ym2) * (y - yp1) * (y - yp2) / ((ym1 - ym2) * (ym1 - yp1) * (ym1 - yp2)); const double fact_yp1 = (y - ym2) * (y - ym1) * (y - yp2) / ((yp1 - ym2) * (yp1 - ym1) * (yp1 - yp2)); const double fact_yp2 = (y - ym2) * (y - ym1) * (y - yp1) / ((yp2 - ym2) * (yp2 - ym1) * (yp2 - yp1) ); const double fact_y[4] = { fact_ym2, fact_ym1, fact_yp1, fact_yp2 }; for (size_t idx0_dst = 0; idx0_dst < ip->dst_domain_size[0]; idx0_dst++) { const double x = idx0_dst * dx_dst; const size_t idx0_src = idx0_dst * scalex; const size_t idx_src = (idx1_src - 1) * src_stride + (idx0_src - 1); const double xm1 = idx0_src * dx_src; const double xm2 = xm1 - dx_src; const double xp1 = xm1 + dx_src; const double xp2 = xp1 + dx_src; const double fact_xm2 = (x - xm1) * (x - xp1) * (x - xp2) / ( (xm2 - xm1) * (xm2 - xp1) * (xm2 - xp2)); const double fact_xm1 = (x - xm2) * (x - xp1) * (x - xp2) / ((xm1 - xm2) * (xm1 - xp1) * (xm1 - xp2)); const double fact_xp1 = (x - xm2) * (x - xm1) * (x - xp2) / ((xp1 - xm2) * (xp1 - xm1) * (xp1 - xp2)); const double fact_xp2 = (x - xm2) * (x - xm1) * (x - xp1) / ((xp2 - xm2) * (xp2 - xm1) * (xp2 - xp1) ); const double fact_x[4] = { fact_xm2, fact_xm1, fact_xp1, fact_xp2 }; double val = 0.0; for (int idx_y = 0; idx_y < 4; idx_y++) { double tmp = 0.0; for (int idx_x = 0; idx_x < 4; idx_x++) tmp += fact_x[idx_x] * src[idx_src + idx_y * src_stride + idx_x]; val += fact_y[idx_y] * tmp; } dst[idx1_dst * dst_stride + idx0_dst] = val; } return 0; } static void mg_interp_bicubic(TPContext *tp, EGSContext *ctx_dst, double *dst, ptrdiff_t dst_stride, EGSContext *ctx_src, const double *src, ptrdiff_t src_stride) { InterpParamsBilin ip; ip.dst_domain_size[0] = ctx_dst->domain_size[0]; ip.dst_domain_size[1] = ctx_dst->domain_size[1]; ip.src_stride = src_stride; ip.dst_stride = dst_stride; ip.dx_dst = ctx_dst->step[0]; ip.dy_dst = ctx_dst->step[1]; ip.dx_src = ctx_src->step[0]; ip.dy_src = ctx_src->step[1]; ip.scalex = ip.dx_dst / ip.dx_src; ip.scaley = ip.dy_dst / ip.dy_src; ip.src = src; ip.dst = dst; tp_execute(tp, ctx_dst->domain_size[1], mg_interp_bicubic_kernel, &ip); } static int coarse_correct_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { MG2DLevel *level = arg; for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) { const ptrdiff_t idx_dst = job_idx * level->solver->u->stride[0] + idx0; const ptrdiff_t idx_src = job_idx * level->prolong_tmp_stride + idx0; level->solver->u->data[idx_dst] -= level->prolong_tmp[idx_src]; } return 0; } static void restrict_residual(MG2DContext *ctx, MG2DLevel *dst, MG2DLevel *src) { EGSContext *s_src = src->solver; EGSContext *s_dst = dst->solver; if (s_src->domain_size[0] == 2 * (s_dst->domain_size[0] - 1) + 1) { mg_restrict_fw(s_dst, s_dst->rhs->data, s_dst->rhs->stride[0], s_src, s_src->residual->data, s_src->residual->stride[0]); } else { if (ctx->fd_stencil == 1) mg_interp_bilinear(ctx->priv->tp, s_dst, s_dst->rhs->data, s_dst->rhs->stride[0], s_src, s_src->residual->data, s_src->residual->stride[0]); else mg_interp_bicubic(ctx->priv->tp, s_dst, s_dst->rhs->data, s_dst->rhs->stride[0], s_src, s_src->residual->data, s_src->residual->stride[0]); } } static void prolong_solution(MG2DContext *ctx, MG2DLevel *dst, MG2DLevel *src) { EGSContext *s_src = src->solver; EGSContext *s_dst = dst->solver; if (s_dst->domain_size[0] == 2 * (s_src->domain_size[0] - 1) + 1) { if (ctx->fd_stencil == 1) mg_prolongate(s_dst, dst->prolong_tmp, dst->prolong_tmp_stride, s_src, s_src->u->data, s_src->u->stride[0]); else mg_prolongate_bicubic(s_dst, dst->prolong_tmp, dst->prolong_tmp_stride, s_src, s_src->u->data, s_src->u->stride[0]); } else { if (ctx->fd_stencil == 1) mg_interp_bilinear(ctx->priv->tp, s_dst, dst->prolong_tmp, dst->prolong_tmp_stride, s_src, s_src->u->data, s_src->u->stride[0]); else mg_interp_bicubic(ctx->priv->tp, s_dst, dst->prolong_tmp, dst->prolong_tmp_stride, s_src, s_src->u->data, s_src->u->stride[0]); } } static int mg_relax_step(MG2DContext *ctx, MG2DLevel *level, const char *step_desc) { double res_old; int64_t start; int ret; res_old = level->solver->residual_max; start = gettime(); ret = mg2di_egs_solve(level->solver); level->time_relax += gettime() - start; if (ret < 0) return ret; log_relax_step(ctx, level, step_desc, res_old, level->solver->residual_max); return 0; } static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) { double res_old, res_new; int ret; /* on the refined levels, use zero as the initial guess for the * solution (correction for the upper level) */ if (level->depth > 0) { memset(level->solver->u->data, 0, sizeof(*level->solver->u->data) * level->solver->u->stride[0] * level->solver->domain_size[1]); /* re-init the current-level solver (re-calc the residual) */ ret = mg2di_egs_init(level->solver); if (ret < 0) return ret; } res_old = level->solver->residual_max; /* handle coarsest grid */ if (!level->child) { ret = mg_relax_step(ctx, level, "coarse-step"); if (ret < 0) return ret; level->count_cycles++; goto finish; } for (int i = 0; i < ctx->nb_cycles; i++) { double res_prev; int64_t start; /* pre-restrict relaxation */ for (int j = 0; j < ctx->nb_relax_pre; j++) { ret = mg_relax_step(ctx, level, "pre-relax"); if (ret < 0) return ret; } /* restrict the residual as to the coarser-level rhs */ start = gettime(); restrict_residual(ctx, level->child, level); level->time_restrict += gettime() - start; /* solve on the coarser level */ ret = mg_solve_subgrid(ctx, level->child); if (ret < 0) return ret; /* prolongate the coarser-level correction */ start = gettime(); prolong_solution(ctx, level, level->child); level->time_prolong += gettime() - start; /* apply the correction */ start = gettime(); tp_execute(ctx->priv->tp, level->solver->domain_size[1], coarse_correct_task, level); level->time_correct += gettime() - start; /* re-init the current-level solver (re-calc the residual) */ res_prev = level->solver->residual_max; start = gettime(); ret = mg2di_egs_init(level->solver); if (ret < 0) return ret; level->time_reinit += gettime() - start; log_relax_step(ctx, level, "correct", res_prev, level->solver->residual_max); /* post-correct relaxation */ for (int j = 0; j < ctx->nb_relax_post; j++) { ret = mg_relax_step(ctx, level, "post-relax"); if (ret < 0) return ret; } level->count_cycles++; } finish: res_new = level->solver->residual_max; if (!isfinite(res_new) || (res_new > 1e2 * DBL_EPSILON && res_old / res_new <= 1e-1)) { mg2di_log(&ctx->priv->logger, MG2D_LOG_ERROR, "The relaxation step at level %d has diverged: %g -> %g\n", level->depth, res_old, res_new); return MG2D_ERR_DIVERGE; } return 0; } static void array_copy(double *dst, ptrdiff_t dst_stride, const double *src, ptrdiff_t src_stride, size_t linesize, size_t nb_lines) { for (size_t i = 0; i < nb_lines; i++) memcpy(dst + i * dst_stride, src + i * src_stride, linesize * sizeof(*dst)); } static void bnd_zero(MG2DBoundary *bdst, size_t nb_rows, size_t domain_size) { for (size_t i = 0; i < nb_rows; i++) { memset(bdst->val + i * bdst->val_stride - i, 0, sizeof(*bdst->val) * (domain_size + 2 * i)); } } static void bnd_copy(MG2DBoundary *bdst, const double *src, ptrdiff_t src_stride, size_t nb_rows, size_t domain_size) { for (size_t i = 0; i < nb_rows; i++) { memcpy(bdst->val + i * bdst->val_stride - i, src + i * src_stride - i, sizeof(*bdst->val) * (domain_size + 2 * i)); } } static void restrict_diff_coeffs(MG2DContext *ctx, MG2DLevel *dst, MG2DLevel *src) { EGSContext *s_src = src->solver; EGSContext *s_dst = dst->solver; if (s_src->domain_size[0] == 2 * (s_dst->domain_size[0] - 1) + 1) { for (int i = 0; i < ARRAY_ELEMS(s_src->diff_coeffs); i++) { mg_restrict_inject(s_dst, s_dst->diff_coeffs[i]->data, s_dst->diff_coeffs[i]->stride[0], s_src, s_src->diff_coeffs[i]->data, s_src->diff_coeffs[i]->stride[0]); } } else { for (int i = 0; i < ARRAY_ELEMS(s_src->diff_coeffs); i++) { mg_interp_bilinear(ctx->priv->tp, s_dst, s_dst->diff_coeffs[i]->data, s_dst->diff_coeffs[i]->stride[0], s_src, s_src->diff_coeffs[i]->data, s_src->diff_coeffs[i]->stride[0]); } } } static double findmax(double *arr, size_t size[2], ptrdiff_t stride) { double ret = 0.0; for (size_t y = 0; y < size[1]; y++) for (size_t x = 0; x < size[0]; x++) { double val = fabs(arr[y * stride + x]); if (val > ret) ret = val; } return ret; } static int mg_levels_init(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; MG2DLevel *cur, *prev; double diff0_max, diff2_max, tmp; diff0_max = findmax(ctx->diff_coeffs[MG2D_DIFF_COEFF_00], priv->root->solver->domain_size, ctx->diff_coeffs_stride); diff2_max = findmax(ctx->diff_coeffs[MG2D_DIFF_COEFF_20], priv->root->solver->domain_size, ctx->diff_coeffs_stride); tmp = findmax(ctx->diff_coeffs[MG2D_DIFF_COEFF_02], priv->root->solver->domain_size, ctx->diff_coeffs_stride); diff2_max = MAX(diff2_max, tmp); cur = priv->root; prev = NULL; array_copy(cur->solver->u->data, cur->solver->u->stride[0], ctx->u, ctx->u_stride, ctx->domain_size, ctx->domain_size); array_copy(cur->solver->rhs->data, cur->solver->rhs->stride[0], ctx->rhs, ctx->rhs_stride, ctx->domain_size, ctx->domain_size); for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { array_copy(cur->solver->diff_coeffs[i]->data, cur->solver->diff_coeffs[i]->stride[0], ctx->diff_coeffs[i], ctx->diff_coeffs_stride, ctx->domain_size, ctx->domain_size); } while (cur) { if (!prev) { cur->solver->step[0] = ctx->step[0]; cur->solver->step[1] = ctx->step[1]; } else { cur->solver->step[0] = prev->solver->step[0] * ((double)(prev->solver->domain_size[0] - 1) / (cur->solver->domain_size[0] - 1)); cur->solver->step[1] = prev->solver->step[1] * ((double)(prev->solver->domain_size[1] - 1) / (cur->solver->domain_size[1] - 1)); } /* Set the equation coefficients. */ if (prev) restrict_diff_coeffs(ctx, cur, prev); for (int i = 0; i < ARRAY_ELEMS(cur->solver->boundaries); i++) { MG2DBoundary *bsrc = ctx->boundaries[i]; MG2DBoundary *bdst = cur->solver->boundaries[i]; bdst->type = bsrc->type; switch (bsrc->type) { case MG2D_BC_TYPE_FIXVAL: if (!prev) bnd_copy(bdst, bsrc->val, bsrc->val_stride, ctx->fd_stencil, cur->solver->domain_size[0]); else bnd_zero(bdst, ctx->fd_stencil, cur->solver->domain_size[0]); break; case MG2D_BC_TYPE_REFLECT: case MG2D_BC_TYPE_FALLOFF: bnd_zero(bdst, ctx->fd_stencil, cur->solver->domain_size[0]); break; default: return -ENOSYS; } } cur->solver->logger = priv->logger; cur->solver->cpuflags = priv->cpuflags; cur->solver->fd_stencil = ctx->fd_stencil; if (cur->solver->solver_type == EGS_SOLVER_RELAXATION) { EGSRelaxContext *r = cur->solver->solver_data; r->relax_factor = ctx->cfl_factor; r->relax_multiplier = 1.0 / (diff2_max + cur->solver->step[0] * cur->solver->step[1] * diff0_max / 8.0); } prev = cur; cur = cur->child; } return 0; } static int threadpool_init(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; MG2DLevel *level = priv->root; int ret; ret = tp_init(&priv->tp, ctx->nb_threads); if (ret < 0) return ret; while (level) { level->solver->tp = priv->tp; level = level->child; } return 0; } static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size); int mg2d_solve(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; MG2DLevel *root; EGSContext *s_root; int64_t time_start; double res_orig, res_prev, res_cur; int ret; if (!priv->root) { ret = mg_levels_alloc(ctx, ctx->domain_size); if (ret < 0) return ret; } if (!priv->tp) { ret = threadpool_init(ctx); if (ret < 0) return ret; } root = priv->root; s_root = root->solver; time_start = gettime(); ret = mg_levels_init(ctx); if (ret < 0) return ret; ret = mg2di_egs_init(s_root); if (ret < 0) return ret; res_orig = s_root->residual_max; res_prev = res_orig; for (int i = 0; i < ctx->maxiter; i++) { ret = mg_solve_subgrid(ctx, root); if (ret < 0) goto fail; res_cur = s_root->residual_max; if (res_cur < ctx->tol) { mg2di_log(&priv->logger, MG2D_LOG_INFO, "converged on iteration %d, residual %g\n", i, res_cur); array_copy(ctx->u, ctx->u_stride, s_root->u->data, s_root->u->stride[0], ctx->domain_size, ctx->domain_size); priv->time_solve += gettime() - time_start; priv->count_solve++; return 0; } mg2di_log(&priv->logger, MG2D_LOG_VERBOSE, "finished toplevel iteration %d, residual %g -> %g (%g)\n", i, res_prev, res_cur, res_prev / res_cur); if (res_cur / res_prev > 1e1 || res_cur / res_orig > 1e3) { mg2di_log(&priv->logger, MG2D_LOG_ERROR, "A multigrid iteration diverged\n"); ret = MG2D_ERR_DIVERGE; goto fail; } res_prev = res_cur; } ret = -EDOM; mg2di_log(&priv->logger, MG2D_LOG_ERROR, "Maximum number of iterations (%d) reached\n", ctx->maxiter); fail: mg2di_log(&priv->logger, MG2D_LOG_ERROR, "The solver failed to converge\n"); return ret; } static void mg_level_free(MG2DLevel **plevel) { MG2DLevel *level = *plevel; if (!level) return; free(level->prolong_tmp); mg2di_egs_free(&level->solver); free(level); *plevel = NULL; } static MG2DLevel *mg_level_alloc(enum EGSType type, const size_t domain_size) { MG2DLevel *level; int ret; level = calloc(1, sizeof(*level)); if (!level) return NULL; ret = posix_memalign((void**)&level->prolong_tmp, 32, sizeof(*level->prolong_tmp) * SQR(domain_size + 1)); if (ret != 0) goto fail; level->prolong_tmp_stride = domain_size + 1; level->solver = mg2di_egs_alloc(type, (size_t [2]){domain_size, domain_size}); if (!level->solver) goto fail; return level; fail: mg_level_free(&level); return NULL; } static int log2i(int n) { int ret = 0; while (n) { ret++; n >>= 1; } return ret - 1; } static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size) { MG2DInternal *priv = ctx->priv; MG2DLevel **dst; size_t next_size; dst = &priv->root; next_size = domain_size; for (int depth = 0; depth < ctx->max_levels; depth++) { enum EGSType cur_type; size_t cur_size = next_size; // choose the domain size for the next child // the children all have sizes 2**n + 1 // but on the top level we skip the closest lower one if it is too close if (depth == 0) { next_size = (1 << log2i(cur_size - 2)) + 1; if ((double)cur_size / next_size < 1.5) next_size = (next_size >> 1) + 1; } else next_size = (cur_size >> 1) + 1; cur_type = (cur_size <= ctx->max_exact_size) ? EGS_SOLVER_EXACT : EGS_SOLVER_RELAXATION; *dst = mg_level_alloc(cur_type, cur_size); if (!*dst) return -ENOMEM; (*dst)->depth = depth; if (cur_type == EGS_SOLVER_EXACT) break; if (next_size <= 4) break; dst = &(*dst)->child; } return 0; } static void log_default_callback(const MG2DContext *ctx, int level, const char *fmt, va_list vl) { if (level > ctx->log_level) return; vfprintf(stderr, fmt, vl); } MG2DContext *mg2d_solver_alloc(size_t domain_size) { MG2DContext *ctx; MG2DInternal *priv; if (domain_size < 3 || SIZE_MAX / domain_size < domain_size) return NULL; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return NULL; ctx->priv = calloc(1, sizeof(*ctx->priv)); if (!ctx->priv) goto fail; priv = ctx->priv; priv->logger.log = log_callback; priv->logger.opaque = ctx; priv->cpuflags = mg2di_cpu_flags_get(); for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) { ctx->boundaries[i] = mg2di_bc_alloc(domain_size); if (!ctx->boundaries[i]) { goto fail; } } ctx->u = calloc(SQR(domain_size), sizeof(*ctx->u)); if (!ctx->u) goto fail; ctx->u_stride = domain_size; ctx->rhs = calloc(SQR(domain_size), sizeof(*ctx->rhs)); if (!ctx->rhs) goto fail; ctx->rhs_stride = domain_size; for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { ctx->diff_coeffs[i] = calloc(SQR(domain_size), sizeof(*ctx->diff_coeffs[i])); if (!ctx->diff_coeffs[i]) goto fail; } ctx->diff_coeffs_stride = domain_size; ctx->domain_size = domain_size; ctx->max_levels = 16; ctx->max_exact_size = 5; ctx->maxiter = 16; ctx->tol = 1e-12; ctx->nb_cycles = 1; ctx->nb_relax_pre = 1; ctx->nb_relax_post = 1; ctx->log_callback = log_default_callback; ctx->log_level = MG2D_LOG_INFO; ctx->nb_threads = 1; return ctx; fail: mg2d_solver_free(&ctx); return NULL; } void mg2d_solver_free(MG2DContext **pctx) { MG2DContext *ctx = *pctx; if (!ctx) return; while (ctx->priv->root) { MG2DLevel *next = ctx->priv->root->child; mg_level_free(&ctx->priv->root); ctx->priv->root = next; } for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) mg2di_bc_free(&ctx->boundaries[i]); tp_free(&ctx->priv->tp); free(ctx->priv); free(ctx->u); free(ctx->rhs); for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) free(ctx->diff_coeffs[i]); free(ctx); *pctx = NULL; } void mg2d_print_stats(MG2DContext *ctx, const char *prefix) { MG2DInternal *priv = ctx->priv; MG2DLevel *level = priv->root; int64_t other, levels_total = 0; if (!prefix) prefix = ""; mg2di_log(&priv->logger, MG2D_LOG_VERBOSE, "%s%ld solves; %g s total time; %g ms avg per call\n", prefix, priv->count_solve, priv->time_solve / 1e6, priv->time_solve / 1e3 / priv->count_solve); while (level) { char buf[1024], *p; int ret; EGSRelaxContext *r = NULL; EGSExactContext *e = NULL; int64_t level_total = level->time_relax + level->time_prolong + level->time_restrict + level->time_correct + level->time_reinit; if (level->solver->solver_type == EGS_SOLVER_RELAXATION) r = level->solver->solver_data; else if (level->solver->solver_type == EGS_SOLVER_EXACT) e = level->solver->solver_data; level_total = level->time_relax + level->time_prolong + level->time_restrict + level->time_correct + level->time_reinit; levels_total += level_total; p = buf; ret = snprintf(p, sizeof(buf) - (p - buf), "%2.2f%% level %d: %ld cycles %g s total time %g ms avg per call", level_total * 100.0 / priv->time_solve, level->depth, level->count_cycles, level_total / 1e6, level_total / 1e3 / level->count_cycles); if (ret > 0) p += ret; if (level->child) { ret = snprintf(p, sizeof(buf) - (p - buf), "||%2.2f%% relax %2.2f%% prolong %2.2f%% restrict %2.2f%% correct %2.2f%% reinit", level->time_relax * 100.0 / level_total, level->time_prolong * 100.0 / level_total, level->time_restrict * 100.0 / level_total, level->time_correct * 100.0 / level_total, level->time_reinit * 100.0 / level_total); if (ret > 0) p += ret; } ret = snprintf(p, sizeof(buf) - (p - buf), "||%2.2f%% residual %2.2f%% boundaries", level->solver->time_res_calc * 100.0 / level->solver->time_total, level->solver->time_boundaries * 100.0 / level->solver->time_total); if (ret > 0) p += ret; if (r) { ret = snprintf(p, sizeof(buf) - (p - buf), " %2.2f%% correct", r->time_correct * 100.0 / level->solver->time_total); if (ret > 0) p += ret; } else if (e) { ret = snprintf(p, sizeof(buf) - (p - buf), " %2.2f%% matrix construct %2.2f%% linear solve", e->time_mat_construct * 100.0 / level->solver->time_total, e->time_lin_solve * 100.0 / level->solver->time_total); if (ret > 0) p += ret; } mg2di_log(&priv->logger, MG2D_LOG_VERBOSE, "%s%s\n", prefix, buf); level = level->child; } other = priv->time_solve - levels_total; mg2di_log(&priv->logger, MG2D_LOG_VERBOSE, "%s%2.2f%% other %g s total time %g ms avg per call\n", prefix, other * 100.0 / priv->time_solve, other / 1e6, other / 1e3 / priv->count_solve); } unsigned int mg2d_max_fd_stencil(void) { return FD_STENCIL_MAX; }