#include #include #include #include #include #include #include #include #include #include #include #include #include "cctk.h" #include "cctk_Arguments.h" #include "cctk_Parameters.h" #include "cctk_Timers.h" #include "util_Table.h" #include #define SQR(x) ((x) * (x)) #define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) #define MAX(x, y) ((x) > (y) ? (x) : (y)) #define MIN(x, y) ((x) > (y) ? (y) : (x)) #define ABS(x) ((x >= 0) ? (x) : -(x)) #define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) #define CPINDEX(cp, i, j, k) ((k * cp->grid_size[1] + j) * cp->grid_size[0] + i) #define qms_assert(x) \ do { \ if (!(x)) { \ fprintf(stderr, "%s:%d assertion " #x " failed\n", \ __FILE__, __LINE__); \ abort(); \ } \ } while (0) /* precomputed values for a given refined grid */ typedef struct CoordPatch { int level; ptrdiff_t grid_size[3]; MG2DContext *solver; MPI_Comm solver_comm; int y_idx; ptrdiff_t offset_left[2]; int bnd_intercomp[2][2]; int level_size[2]; size_t *extents; int nb_extents; double *interp_tmp0; double *interp_tmp1; /* MPI sync for the initial guess */ int nb_components; ptrdiff_t (*solver_start)[2]; size_t (*solver_size)[2]; double *u_guess; ptrdiff_t u_guess_stride; ptrdiff_t u_guess_start[2]; size_t u_guess_size[2]; int *u_guess_sendcounts; int *u_guess_senddispl; MPI_Datatype *u_guess_sendtypes; int *u_guess_recvcounts; int *u_guess_recvdispl; MPI_Datatype *u_guess_recvtypes; /* MPI sync for the coarser-level prediction */ double *interp_tmp_sync; ptrdiff_t interp_tmp_stride; ptrdiff_t interp_tmp_start[2]; size_t interp_tmp_size[2]; int *interp_tmp_sendcounts; int *interp_tmp_senddispl; MPI_Datatype *interp_tmp_sendtypes; int *interp_tmp_recvcounts; int *interp_tmp_recvdispl; MPI_Datatype *interp_tmp_recvtypes; } CoordPatch; typedef struct QMSMGContext { cGH *gh; int log_level; int fd_stencil; int maxiter; int max_exact_size; int nb_cycles; int nb_relax_post; int nb_relax_pre; double tol_residual_base; double cfl_factor; double outer_smooth_fact; int adaptive_step; int solve_level; int solve_level_max; int boundary_offset; CoordPatch **patches; int nb_patches; /* timings */ int64_t time_solve; int64_t count_solve; int64_t time_export; int64_t count_export; int64_t time_mg2d; int64_t count_mg2d; int64_t time_fill; int64_t count_fill; int64_t time_bnd; int64_t count_bnd; int64_t time_eval; int64_t count_eval; } QMSMGContext; static const struct { const char *str; enum MG2DLogLevel level; } log_levels[] = { { "fatal", MG2D_LOG_FATAL }, { "error", MG2D_LOG_ERROR }, { "warning", MG2D_LOG_WARNING }, { "info", MG2D_LOG_INFO }, { "verbose", MG2D_LOG_VERBOSE }, { "debug", MG2D_LOG_DEBUG }, }; #include static inline int64_t gettime(void) { struct timeval tv; gettimeofday(&tv, NULL); return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; } static QMSMGContext *qms_context; static int ctz(int a) { int ret = 0; if (!a) return INT_MAX; while (!(a & 1)) { a >>= 1; ret++; } return ret; } static void log_callback(const MG2DContext *ctx, int level, const char *fmt, va_list vl) { QMSMGContext *ms = ctx->opaque; int target_level = ms->log_level; uint8_t buf[1024]; int ret; if (level > target_level) return; ret = snprintf(buf, sizeof(buf), "[%d] t=%g ", ctz(ms->gh->cctk_levfac[0]), ms->gh->cctk_time); if (ret >= sizeof(buf)) return; vsnprintf(buf + ret, sizeof(buf) - ret, fmt, vl); fputs(buf, stderr); } /** * Compute the refinement level geometry. * * The level is assumed to be a single square patch that is composed of * nProcs components. The lower x and z boundaries are assumed to be reflection. * For our purposes, both the physical outer boundaries and the refinement * boundaries are equivalent, while the intercomponent boundaries are treated * differently. Since Cactus only allows distinguishing between "physical" and * "all other" boundaries, we need to compute the extents manually. * * @param pextents On return will contain a nProcs*4-sized array, where for each * i from 0 to nProcs, elements [i * 4 + 0/1] contain the index of the x/z * component origin, while elements [i * 4 + 2/3] contain the number of points * owned by that component along x/z dimension. * * @param level_size will contain the number of points in x/z directions on the * complete refinement level, including all the ghost zones up to the outer * level boundary. */ static void get_extents(size_t **pextents, int *level_size, cGH *gh) { int nb_procs = CCTK_nProcs(gh); int local_proc = CCTK_MyProc(gh); size_t *extents; extents = calloc(nb_procs, 4 * sizeof(*extents)); if (!extents) CCTK_WARN(0, "Error allocating the extents"); for (int dir = 0; dir < 2; dir++) { extents[4 * local_proc + 0 + dir] = gh->cctk_lbnd[2 * dir]; extents[4 * local_proc + 2 + dir] = gh->cctk_lsh[2 * dir] - gh->cctk_nghostzones[2 * dir]; } if (nb_procs > 1) MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, extents, 4 * sizeof(*extents), MPI_BYTE, MPI_COMM_WORLD); level_size[0] = 0; level_size[1] = 0; for (int dir = 0; dir < 2; dir++) { for (int comp = 0; comp < nb_procs; comp++) { size_t start = extents[4 * comp + 0 + dir]; size_t size = extents[4 * comp + 2 + dir]; if (start + size > level_size[dir]) level_size[dir] = start + size; } // if the upper boundary is an inter-component boundary, substract the ghost zones for (int comp = 0; comp < nb_procs; comp++) { size_t start = extents[4 * comp + 0 + dir]; size_t size = extents[4 * comp + 2 + dir]; if (start + size < level_size[dir]) extents[4 * comp + 2 + dir] -= gh->cctk_nghostzones[2 * dir]; } } *pextents = extents; } static CoordPatch *get_coord_patch(QMSMGContext *ms, int level) { for (int i = 0; i < ms->nb_patches; i++) if (ms->patches[i]->level == level) return ms->patches[i]; return NULL; } static int alloc_coord_patch(QMSMGContext *ms, int level) { cGH *gh = ms->gh; const char *omp_threads = getenv("OMP_NUM_THREADS"); const int integrator_substeps = MoLNumIntegratorSubsteps(); const int nb_procs = CCTK_nProcs(gh); const int local_proc = CCTK_MyProc(gh); const double *a_x = CCTK_VarDataPtr(gh, 0, "grid::x"); const double *a_y = CCTK_VarDataPtr(gh, 0, "grid::y"); const double *a_z = CCTK_VarDataPtr(gh, 0, "grid::z"); size_t *extents; size_t component_start[2], component_size[2]; int *solver_ranks; int nb_solver_ranks; CoordPatch *cp; MG2DContext *solver; int i, ret; cp = calloc(1, sizeof(*cp)); if (!cp) CCTK_WARN(0, "Error allocating patch"); ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1)); ms->patches[ms->nb_patches] = cp; memset(cp, 0, sizeof(*cp)); cp->level = level; cp->grid_size[0] = gh->cctk_lsh[0]; cp->grid_size[1] = gh->cctk_lsh[1]; cp->grid_size[2] = gh->cctk_lsh[2]; for (i = 0; i < gh->cctk_lsh[1]; i++) if (fabs(a_y[CCTK_GFINDEX3D(gh, 0, i, 0)]) < 1e-8) { cp->y_idx = i; break; } if (i == gh->cctk_lsh[1]) CCTK_WARN(0, "The grid does not include y==0"); get_extents(&extents, cp->level_size, gh); component_start[0] = extents[4 * local_proc + 0 + 0]; component_start[1] = extents[4 * local_proc + 0 + 1]; component_size[0] = extents[4 * local_proc + 2 + 0]; component_size[1] = extents[4 * local_proc + 2 + 1]; for (int dir = 0; dir < 2; dir++) { cp->bnd_intercomp[dir][0] = component_start[dir] > 0; cp->bnd_intercomp[dir][1] = (component_start[dir] + component_size[dir]) < cp->level_size[dir]; cp->offset_left[dir] = gh->cctk_nghostzones[2 * dir]; } cp->extents = extents; if (cp->level_size[0] != cp->level_size[1]) CCTK_WARN(0, "Non-square domain"); solver_ranks = calloc(nb_procs, sizeof(*solver_ranks)); if (!solver_ranks) CCTK_WARN(0, "Error allocating solver ranks"); cp->nb_components = nb_procs; cp->solver_size = calloc(nb_procs, sizeof(*cp->solver_size)); cp->solver_start = calloc(nb_procs, sizeof(*cp->solver_start)); if (!cp->solver_size || !cp->solver_start) CCTK_WARN(0, "Error allocating solver sizes/starts"); nb_solver_ranks = 0; for (int proc = 0; proc < nb_procs; proc++) { for (int dir = 0; dir < 2; dir++) { size_t start = extents[4 * proc + 0 + dir]; size_t end = start + extents[4 * proc + 2 + dir]; ptrdiff_t offset_right; offset_right = gh->cctk_nghostzones[2 * dir]; if (cp->level > 0) { offset_right *= integrator_substeps * 2; offset_right -= ms->boundary_offset; } end = MIN(end, cp->level_size[dir] - offset_right); if (end > start) { cp->solver_start[proc][dir] = start; cp->solver_size[proc][dir] = end - start; } else { cp->solver_start[proc][dir] = 0; cp->solver_size[proc][dir] = 0; } } if (cp->solver_size[proc][0] > 0 && cp->solver_size[proc][1] > 0) solver_ranks[nb_solver_ranks++] = proc; } if (nb_solver_ranks > 1) { MPI_Group grp_world, grp_solver; MPI_Comm_group(MPI_COMM_WORLD, &grp_world); MPI_Group_incl(grp_world, nb_solver_ranks, solver_ranks, &grp_solver); MPI_Comm_create(MPI_COMM_WORLD, grp_solver, &cp->solver_comm); MPI_Group_free(&grp_solver); MPI_Group_free(&grp_world); } else { cp->solver_comm = MPI_COMM_NULL; } if (!(cp->solver_size[local_proc][0] > 0 && cp->solver_size[local_proc][1] > 0)) goto finish; if (cp->solver_comm == MPI_COMM_NULL) solver = mg2d_solver_alloc(cp->solver_size[local_proc][0]); else solver = mg2d_solver_alloc_mpi(cp->solver_comm, component_start, cp->solver_size[local_proc]); if (!solver) CCTK_WARN(0, "Error allocating the solver"); cp->solver = solver; solver->step[0] = a_x[1] - a_x[0]; solver->step[1] = a_z[CCTK_GFINDEX3D(gh, 0, 0, 1)] - a_z[0]; solver->fd_stencil = ms->fd_stencil; solver->boundaries[MG2D_BOUNDARY_0L]->type = MG2D_BC_TYPE_REFLECT; solver->boundaries[MG2D_BOUNDARY_1L]->type = MG2D_BC_TYPE_REFLECT; solver->boundaries[MG2D_BOUNDARY_0U]->type = MG2D_BC_TYPE_FIXVAL; solver->boundaries[MG2D_BOUNDARY_1U]->type = MG2D_BC_TYPE_FIXVAL; solver->maxiter = ms->maxiter; solver->tol = ms->tol_residual_base / (solver->step[0] * solver->step[1]); solver->nb_cycles = ms->nb_cycles; solver->nb_relax_pre = ms->nb_relax_pre; solver->nb_relax_post = ms->nb_relax_post; solver->cfl_factor = ms->cfl_factor; solver->adaptive_step = ms->adaptive_step; solver->max_exact_size = ms->max_exact_size; solver->opaque = ms; solver->log_callback = log_callback; if (omp_threads) solver->nb_threads = strtol(omp_threads, NULL, 0); if (solver->nb_threads <= 0) solver->nb_threads = 1; /* initialize boundary values to zero, * * non-zero values on the outer boundaries of refined levels are filled in elsewhere */ for (int i = 0; i < 4; i++) { int ci = mg2d_bnd_coord_idx(i); for (size_t j = 0; j < solver->fd_stencil; j++) { memset(solver->boundaries[i]->val + j * solver->boundaries[i]->val_stride - j, 0, sizeof(*solver->boundaries[i]->val) * (solver->local_size[!ci] + 2 * j)); } } if (level > 0) { cp->interp_tmp0 = calloc(cp->grid_size[0] * cp->grid_size[2], sizeof(*cp->interp_tmp0)); cp->interp_tmp1 = calloc(cp->grid_size[0] * cp->grid_size[2], sizeof(*cp->interp_tmp1)); if (!cp->interp_tmp0 || !cp->interp_tmp1) CCTK_WARN(0, "Error allocating arrays"); } finish: free(solver_ranks); ms->nb_patches++; return 0; } static void coord_patch_free(CoordPatch **pcp) { CoordPatch *cp = *pcp; if (!cp) return; mg2d_solver_free(&cp->solver); if (cp->solver_comm != MPI_COMM_NULL) MPI_Comm_free(&cp->solver_comm); free(cp->interp_tmp0); free(cp->interp_tmp1); free(cp->interp_tmp_sync); free(cp->interp_tmp_sendcounts); free(cp->interp_tmp_senddispl); free(cp->interp_tmp_sendtypes); free(cp->interp_tmp_recvcounts); free(cp->interp_tmp_recvdispl); free(cp->interp_tmp_recvtypes); free(cp->extents); free(cp); *pcp = NULL; } static void print_stats(QMSMGContext *ms) { int orig_log_level; int64_t total = ms->time_solve + ms->time_eval; fprintf(stderr, "%2.2f%% eval: %ld runs; %g s total; %g ms avg per run\n", (double)ms->time_eval * 100.0 / total, ms->count_eval, ms->time_eval / 1e6, ms->time_eval / 1e3 / ms->count_eval); fprintf(stderr, "%2.2f%% solve: %ld runs; %g s total; %g ms avg per run\n", (double)ms->time_solve * 100.0 / total, ms->count_solve, ms->time_solve / 1e6, ms->time_solve / 1e3 / ms->count_solve); fprintf(stderr, " %2.2f%% fill coeffs %2.2f%% boundaries %2.2f%% mg2d %2.2f%% export\n", (double)ms->time_fill * 100.0 / ms->time_solve, (double)ms->time_bnd * 100.0 / ms->time_solve, (double)ms->time_mg2d * 100.0 / ms->time_solve, (double)ms->time_export * 100.0 / ms->time_solve); orig_log_level = ms->log_level; ms->log_level = MG2D_LOG_VERBOSE; for (int i = 0; i < ms->nb_patches; i++) { CoordPatch *cp = get_coord_patch(ms, i); char indent_buf[64]; snprintf(indent_buf, sizeof(indent_buf), " [%d] ", cp->level); if (cp->solver) mg2d_print_stats(cp->solver, indent_buf); } ms->log_level = orig_log_level; } enum RHSVar { RHS_VAR_GTXX, RHS_VAR_GTYY, RHS_VAR_GTZZ, RHS_VAR_GTXY, RHS_VAR_GTXZ, RHS_VAR_GTYZ, RHS_VAR_ATXX, RHS_VAR_ATYY, RHS_VAR_ATZZ, RHS_VAR_ATXY, RHS_VAR_ATXZ, RHS_VAR_ATYZ, RHS_VAR_BETAX, RHS_VAR_BETAY, RHS_VAR_BETAZ, RHS_VAR_XTX, RHS_VAR_XTY, RHS_VAR_XTZ, RHS_VAR_TRK, RHS_VAR_PHI, RHS_VAR_ALPHA, RHS_VAR_X, RHS_VAR_Y, RHS_VAR_Z, RHS_VAR_NB, }; typedef void (*FECLine)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver, const double * const vars[], const int idx_x_start, const int idx_x_end, const int idx_z, const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal); // computing QMS equation coefficients is resource-intensive, so we generate // vectorized versions when possible // // first, instantiate the plain scalar version #define ELEM_SCALAR #include "fill_eq_coeffs_template.c" #undef ELEM_SCALAR #if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__) // ICC miscompiles the vectorized versions of fill_eq_coeffs_line() with higher // optimization levels # pragma GCC push_options # pragma GCC optimize("O1") #endif #ifdef __AVX512F__ #define ELEM_AVX512 #include "fill_eq_coeffs_template.c" #undef ELEM_AVX512 #define FILL_EQ_COEFFS_BLOCKSIZE 8 #define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx512 #elif defined(__AVX2__) #define ELEM_AVX2 #include "fill_eq_coeffs_template.c" #undef ELEM_AVX2 #define FILL_EQ_COEFFS_BLOCKSIZE 4 #define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx2 #else #define FILL_EQ_COEFFS_BLOCKSIZE 1 #define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_scalar #endif #if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__) # pragma GCC pop_options #endif static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solver) { FECLine fill_eq_coeffs_line; cGH *gh = ctx->gh; int ret; const ptrdiff_t stride_z = CCTK_GFINDEX3D(gh, 0, 0, 1); const double * const vars[] = { [RHS_VAR_GTXX] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::gt11"), [RHS_VAR_GTYY] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::gt22"), [RHS_VAR_GTZZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::gt33"), [RHS_VAR_GTXY] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::gt12"), [RHS_VAR_GTXZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::gt13"), [RHS_VAR_GTYZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::gt23"), [RHS_VAR_ATXX] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::At11"), [RHS_VAR_ATYY] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::At22"), [RHS_VAR_ATZZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::At33"), [RHS_VAR_ATXY] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::At12"), [RHS_VAR_ATXZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::At13"), [RHS_VAR_ATYZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::At23"), [RHS_VAR_TRK] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::trK"), [RHS_VAR_PHI] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::phi"), [RHS_VAR_ALPHA] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::alpha"), [RHS_VAR_XTX] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::Xt1"), [RHS_VAR_XTZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::Xt3"), [RHS_VAR_BETAX] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::beta1"), [RHS_VAR_BETAZ] = CCTK_VarDataPtr(gh, 0, "ML_BSSN::beta3"), [RHS_VAR_X] = CCTK_VarDataPtr(gh, 0, "grid::x"), [RHS_VAR_Z] = CCTK_VarDataPtr(gh, 0, "grid::z"), }; const double dx_inv = 1.0 / (vars[RHS_VAR_X][1] - vars[RHS_VAR_X][0]); const double dz_inv = 1.0 / (vars[RHS_VAR_Z][stride_z] - vars[RHS_VAR_Z][0]); const int have_axis = fabs(vars[RHS_VAR_X][cp->offset_left[0]]) < 1e-13; int idx_x_start, idx_x_end; solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_DISCONT; solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_POLE; // do not execute the vectorized version on the axis, // as it does not support special handling of the values there idx_x_start = FILL_EQ_COEFFS_BLOCKSIZE > 1 && !!have_axis; idx_x_end = idx_x_start + (solver->local_size[0] - idx_x_start) / FILL_EQ_COEFFS_BLOCKSIZE * FILL_EQ_COEFFS_BLOCKSIZE; #pragma omp parallel for for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) { if (idx_x_start) fill_eq_coeffs_line_scalar(gh, cp, solver, vars, 0, idx_x_start, idx_z, stride_z, dx_inv, dz_inv); FILL_EQ_COEFFS_VECTOR(gh, cp, solver, vars, idx_x_start, idx_x_end, idx_z, stride_z, dx_inv, dz_inv); if (idx_x_end < solver->local_size[0]) fill_eq_coeffs_line_scalar(gh, cp, solver, vars, idx_x_end, solver->local_size[0], idx_z, stride_z, dx_inv, dz_inv); } } static void mirror(CoordPatch *cp, double *dst) { if (!cp->bnd_intercomp[0][0]) { #pragma omp parallel for for (int idx_z = cp->offset_left[1]; idx_z < cp->grid_size[2]; idx_z++) for (int idx_x = 0; idx_x < cp->offset_left[0]; idx_x++) { const ptrdiff_t idx_dst = CPINDEX(cp, idx_x, cp->y_idx, idx_z); const ptrdiff_t idx_src = CPINDEX(cp, 2 * cp->offset_left[0] - idx_x, cp->y_idx, idx_z); dst[idx_dst] = dst[idx_src]; } } if (!cp->bnd_intercomp[1][0]) { #pragma omp parallel for for (int idx_z = 0; idx_z < cp->offset_left[1]; idx_z++) for (int idx_x = 0; idx_x < cp->grid_size[0]; idx_x++) { const ptrdiff_t idx_dst = CPINDEX(cp, idx_x, cp->y_idx, idx_z); const ptrdiff_t idx_src = CPINDEX(cp, idx_x, cp->y_idx, 2 * cp->offset_left[1] - idx_z); dst[idx_dst] = dst[idx_src]; } } } static double smooth_step(double x) { if (x <= 0.0) return 0.0; else if (x >= 1.0) return 1.0; return 1.0 - exp(-36.0 * pow(x, 6.0)); } static void solution_to_grid(QMSMGContext *ms, CoordPatch *cp, MG2DContext *solver, double *dst) { /* on the coarsest level, smooth the solution at the outer boundary */ if (cp->level == ms->solve_level_max) { const double bnd_loc = (solver->domain_size - 1) * solver->step[0]; const double transition_start = bnd_loc * ms->outer_smooth_fact; #pragma omp parallel for for (int j = 0; j < solver->local_size[1]; j++) { const double z_val = (j + solver->local_start[1]) * solver->step[1]; for (int i = 0; i < solver->local_size[0]; i++) { const double x_val = (i + solver->local_start[0]) * solver->step[0]; const double r = sqrt(SQR(x_val) + SQR(z_val)); const double dist = (r - transition_start) / fabs(bnd_loc - transition_start); const double smooth_fact = 1.0 - smooth_step(dist); const ptrdiff_t idx_dst = CPINDEX(cp, i + cp->offset_left[0], cp->y_idx, j + cp->offset_left[1]); const int idx_src = j * solver->u_stride + i; dst[idx_dst] = solver->u[idx_src] * smooth_fact; } } } else { #pragma omp parallel for for (int j = 0; j < solver->local_size[1]; j++) for (int i = 0; i < solver->local_size[0]; i++) { const ptrdiff_t idx_dst = CPINDEX(cp, i + cp->offset_left[0], cp->y_idx, j + cp->offset_left[1]); const int idx_src = j * solver->u_stride + i; dst[idx_dst] = solver->u[idx_src]; } } /* fill in the axial symmetry ghostpoints by mirroring */ mirror(cp, dst); } typedef struct Rect { ptrdiff_t start[2]; size_t size[2]; } Rect; static int rect_intersect(Rect *dst, const Rect *src1, const Rect *src2) { ptrdiff_t intersect_start0 = MAX(src1->start[0], src2->start[0]); ptrdiff_t intersect_start1 = MAX(src1->start[1], src2->start[1]); ptrdiff_t intersect_end0 = MIN(src1->start[0] + src1->size[0], src2->start[0] + src2->size[0]); ptrdiff_t intersect_end1 = MIN(src1->start[1] + src1->size[1], src2->start[1] + src2->size[1]); if (intersect_start0 < intersect_end0 && intersect_start1 < intersect_end1) { dst->start[0] = intersect_start0; dst->start[1] = intersect_start1; dst->size[0] = intersect_end0 - intersect_start0; dst->size[1] = intersect_end1 - intersect_start1; return 1; } dst->size[0] = 0; dst->size[1] = 0; return 0; } static int guess_from_coarser(QMSMGContext *ms, CoordPatch *cp, CoordPatch *cp_coarse, double *dst) { int ret; if (cp->nb_components > 1) { int comp_fine, comp_coarse = 0; // FIXME skip levels with skipped components, // those require somewhat more complex logic to implement MPI_Comm_size(cp->solver_comm, &comp_fine); if (cp_coarse->solver_comm != MPI_COMM_NULL) MPI_Comm_size(cp_coarse->solver_comm, &comp_coarse); if (comp_fine != cp->nb_components || comp_coarse != cp->nb_components) return 0; if (!cp->u_guess) { const int ghosts = cp->solver->fd_stencil + 1; int local_proc = CCTK_MyProc(ms->gh); cp->u_guess_sendcounts = calloc(cp->nb_components, sizeof(*cp->u_guess_sendcounts)); cp->u_guess_senddispl = calloc(cp->nb_components, sizeof(*cp->u_guess_senddispl)); cp->u_guess_sendtypes = calloc(cp->nb_components, sizeof(*cp->u_guess_sendtypes)); cp->u_guess_recvcounts = calloc(cp->nb_components, sizeof(*cp->u_guess_recvcounts)); cp->u_guess_recvdispl = calloc(cp->nb_components, sizeof(*cp->u_guess_recvdispl)); cp->u_guess_recvtypes = calloc(cp->nb_components, sizeof(*cp->u_guess_recvtypes)); if (!cp->u_guess_sendcounts || !cp->u_guess_senddispl || !cp->u_guess_sendtypes || !cp->u_guess_recvcounts || !cp->u_guess_recvdispl || !cp->u_guess_recvtypes) return -ENOMEM; cp->u_guess_size[0] = cp->solver_size[local_proc][0] / 2 + 2 * ghosts; cp->u_guess_size[1] = cp->solver_size[local_proc][1] / 2 + 2 * ghosts; cp->u_guess_start[0] = cp->solver_start[local_proc][0] / 2 - ghosts; cp->u_guess_start[1] = cp->solver_start[local_proc][1] / 2 - ghosts; cp->u_guess_stride = cp->u_guess_size[0]; cp->u_guess = calloc(cp->u_guess_stride * cp->u_guess_size[1], sizeof(*cp->u_guess)); if (!cp->u_guess) return -ENOMEM; for (int comp_fine = 0; comp_fine < cp->nb_components; comp_fine++) { Rect dst_fine = { .start = { cp->solver_start[comp_fine][0], cp->solver_start[comp_fine][1]}, .size = { cp->solver_size[comp_fine][0], cp->solver_size[comp_fine][1]}, }; Rect dst_coarse = { .start = { dst_fine.start[0] / 2 - ghosts, dst_fine.start[1] / 2 - ghosts }, .size = { dst_fine.size[0] / 2 + ghosts * 2, dst_fine.size[1] / 2 + ghosts * 2 }, }; for (int comp_coarse = 0; comp_coarse < cp->nb_components; comp_coarse++) { Rect overlap; Rect src = { .start = { cp_coarse->solver_start[comp_coarse][0], cp_coarse->solver_start[comp_coarse][1]}, .size = { cp_coarse->solver_size[comp_coarse][0], cp_coarse->solver_size[comp_coarse][1]}, }; for (int dim = 0; dim < 2; dim++) { if (src.start[dim] == 0) { src.start[dim] -= ghosts; src.size[dim] += ghosts; } } rect_intersect(&overlap, &dst_coarse, &src); if (comp_fine == local_proc) { if (overlap.size[0] > 0 && overlap.size[1] > 0) { cp->u_guess_recvcounts[comp_coarse] = 1; cp->u_guess_recvdispl[comp_coarse] = ((overlap.start[1] - dst_coarse.start[1]) * cp->u_guess_stride + (overlap.start[0] - dst_coarse.start[0])) * sizeof(double); MPI_Type_vector(overlap.size[1], overlap.size[0], cp->u_guess_stride, MPI_DOUBLE, &cp->u_guess_recvtypes[comp_coarse]); MPI_Type_commit(&cp->u_guess_recvtypes[comp_coarse]); } else { cp->u_guess_recvcounts[comp_coarse] = 0; cp->u_guess_recvdispl[comp_coarse] = 0; MPI_Type_dup(MPI_BYTE, &cp->u_guess_recvtypes[comp_coarse]); } } if (comp_coarse == local_proc) { if (overlap.size[0] > 0 && overlap.size[1] > 0) { ptrdiff_t src_origin[2] = { cp_coarse->solver_start[comp_coarse][0], cp_coarse->solver_start[comp_coarse][1]}; cp->u_guess_sendcounts[comp_fine] = 1; cp->u_guess_senddispl[comp_fine] = ((overlap.start[1] - src_origin[1]) * cp_coarse->solver->u_stride + (overlap.start[0] - src_origin[0])) * sizeof(double); MPI_Type_vector(overlap.size[1], overlap.size[0], cp_coarse->solver->u_stride, MPI_DOUBLE, &cp->u_guess_sendtypes[comp_fine]); MPI_Type_commit(&cp->u_guess_sendtypes[comp_fine]); } else { cp->u_guess_sendcounts[comp_fine] = 0; cp->u_guess_senddispl[comp_fine] = 0; MPI_Type_dup(MPI_BYTE, &cp->u_guess_sendtypes[comp_fine]); } } } } } MPI_Alltoallw(cp_coarse->solver->u, cp->u_guess_sendcounts, cp->u_guess_senddispl, cp->u_guess_sendtypes, cp->u_guess, cp->u_guess_recvcounts, cp->u_guess_recvdispl, cp->u_guess_recvtypes, MPI_COMM_WORLD); ret = mg2d_init_guess(cp->solver, cp->u_guess, cp->u_guess_stride, cp->u_guess_start, cp->u_guess_size, cp_coarse->solver->step); } else { ret = mg2d_init_guess(cp->solver, cp_coarse->solver->u, cp_coarse->solver->u_stride, cp_coarse->solver->local_start, cp_coarse->solver->local_size, cp_coarse->solver->step); } return ret; } static int interp_coarse(QMSMGContext *ms, CoordPatch *cp, double *dst, CoordPatch *cp_coarse, const double *src) { int ret; if (cp->nb_components > 1) { const int local_proc = CCTK_MyProc(ms->gh); int comp_fine, comp_coarse = 1; MPI_Comm_size(cp->solver_comm, &comp_fine); if (cp_coarse->solver_comm != MPI_COMM_NULL) MPI_Comm_size(cp_coarse->solver_comm, &comp_coarse); qms_assert(comp_fine == cp->nb_components && comp_coarse == cp->nb_components); if (!cp->interp_tmp_sync) { const int ghosts = ms->gh->cctk_nghostzones[0]; cp->interp_tmp_sendcounts = calloc(cp->nb_components, sizeof(*cp->interp_tmp_sendcounts)); cp->interp_tmp_senddispl = calloc(cp->nb_components, sizeof(*cp->interp_tmp_senddispl)); cp->interp_tmp_sendtypes = calloc(cp->nb_components, sizeof(*cp->interp_tmp_sendtypes)); cp->interp_tmp_recvcounts = calloc(cp->nb_components, sizeof(*cp->interp_tmp_recvcounts)); cp->interp_tmp_recvdispl = calloc(cp->nb_components, sizeof(*cp->interp_tmp_recvdispl)); cp->interp_tmp_recvtypes = calloc(cp->nb_components, sizeof(*cp->interp_tmp_recvtypes)); if (!cp->interp_tmp_sendcounts || !cp->interp_tmp_senddispl || !cp->interp_tmp_sendtypes || !cp->interp_tmp_recvcounts || !cp->interp_tmp_recvdispl || !cp->interp_tmp_recvtypes) return -ENOMEM; cp->interp_tmp_start[0] = (ptrdiff_t)cp->extents[4 * local_proc + 0] / 2 - ghosts; cp->interp_tmp_start[1] = (ptrdiff_t)cp->extents[4 * local_proc + 1] / 2 - ghosts; cp->interp_tmp_size[0] = cp->extents[4 * local_proc + 2] / 2 + 2 * ghosts; cp->interp_tmp_size[1] = cp->extents[4 * local_proc + 3] / 2 + 2 * ghosts; cp->interp_tmp_stride = cp->interp_tmp_size[0]; cp->interp_tmp_sync = calloc(cp->interp_tmp_stride * cp->interp_tmp_size[1], sizeof(*cp->interp_tmp_sync)); if (!cp->interp_tmp_sync) return -ENOMEM; for (int comp_fine = 0; comp_fine < cp->nb_components; comp_fine++) { Rect dst_fine = { .start = { cp->extents[4 * comp_fine + 0], cp->extents[4 * comp_fine + 1]}, .size = { cp->extents[4 * comp_fine + 2], cp->extents[4 * comp_fine + 3]}, }; Rect dst_coarse = { .start = { dst_fine.start[0] / 2 - ghosts, dst_fine.start[1] / 2 - ghosts }, .size = { dst_fine.size[0] / 2 + ghosts * 2, dst_fine.size[1] / 2 + ghosts * 2 }, }; for (int comp_coarse = 0; comp_coarse < cp_coarse->nb_components; comp_coarse++) { Rect overlap; Rect src = { .start = { cp_coarse->extents[4 * comp_coarse + 0], cp_coarse->extents[4 * comp_coarse + 1]}, .size = { cp_coarse->extents[4 * comp_coarse + 2], cp_coarse->extents[4 * comp_coarse + 3]}, }; for (int dim = 0; dim < 2; dim++) { if (src.start[dim] == 0) { src.start[dim] -= ghosts; src.size[dim] += ghosts; } } rect_intersect(&overlap, &dst_coarse, &src); if (comp_fine == local_proc) { if (overlap.size[0] > 0 && overlap.size[1] > 0) { cp->interp_tmp_recvcounts[comp_coarse] = 1; cp->interp_tmp_recvdispl[comp_coarse] = ((overlap.start[1] - dst_coarse.start[1]) * cp->interp_tmp_stride + (overlap.start[0] - dst_coarse.start[0])) * sizeof(double); MPI_Type_vector(overlap.size[1], overlap.size[0], cp->interp_tmp_stride, MPI_DOUBLE, &cp->interp_tmp_recvtypes[comp_coarse]); MPI_Type_commit(&cp->interp_tmp_recvtypes[comp_coarse]); } else { cp->interp_tmp_recvcounts[comp_coarse] = 0; cp->interp_tmp_recvdispl[comp_coarse] = 0; MPI_Type_dup(MPI_BYTE, &cp->interp_tmp_recvtypes[comp_coarse]); } } if (comp_coarse == local_proc) { if (overlap.size[0] > 0 && overlap.size[1] > 0) { ptrdiff_t src_origin[2] = { cp_coarse->extents[4 * comp_coarse + 0] - cp_coarse->offset_left[0], cp_coarse->extents[4 * comp_coarse + 1] - cp_coarse->offset_left[1] }; cp->interp_tmp_sendcounts[comp_fine] = 1; cp->interp_tmp_senddispl[comp_fine] = ((overlap.start[1] - src_origin[1]) * cp_coarse->grid_size[0] + (overlap.start[0] - src_origin[0])) * sizeof(double); MPI_Type_vector(overlap.size[1], overlap.size[0], cp_coarse->grid_size[0], MPI_DOUBLE, &cp->interp_tmp_sendtypes[comp_fine]); MPI_Type_commit(&cp->interp_tmp_sendtypes[comp_fine]); } else { cp->interp_tmp_sendcounts[comp_fine] = 0; cp->interp_tmp_senddispl[comp_fine] = 0; MPI_Type_dup(MPI_BYTE, &cp->interp_tmp_sendtypes[comp_fine]); } } } } } MPI_Alltoallw(src, cp->interp_tmp_sendcounts, cp->interp_tmp_senddispl, cp->interp_tmp_sendtypes, cp->interp_tmp_sync, cp->interp_tmp_recvcounts, cp->interp_tmp_recvdispl, cp->interp_tmp_recvtypes, MPI_COMM_WORLD); ret = mg2d_interp(cp->solver, dst + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), cp->grid_size[0], (ptrdiff_t [2]){ cp->extents[4 * local_proc + 0], cp->extents[4 * local_proc + 1] }, (size_t [2]){cp->extents[4 * local_proc + 2], cp->extents[4 * local_proc + 3] }, cp->solver->step, cp->interp_tmp_sync, cp->interp_tmp_stride, cp->interp_tmp_start, cp->interp_tmp_size, cp_coarse->solver->step); } else { ret = mg2d_interp(cp->solver, dst + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), cp->grid_size[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cp->grid_size[0] - cp->offset_left[0], cp->grid_size[2] - cp->offset_left[1] }, cp->solver->step, src, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); } return ret; } void qms_mg_solve(CCTK_ARGUMENTS) { QMSMGContext *ms = qms_context; CoordPatch *cp; DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; const int reflevel = ctz(cctkGH->cctk_levfac[0]); const double time = ms->gh->cctk_time; const int timestep = lrint(time * cctkGH->cctk_levfac[0] / cctkGH->cctk_delta_time); int reflevel_top, ts_tmp; int64_t grid_size; int64_t total_start, start; int ret; const double dt = W_val1_time[reflevel] - W_val0_time[reflevel]; if (cctkGH->cctk_time >= switchoff_time + 1.0) return; /* skip the fine time steps */ if (reflevel > ms->solve_level && fabs(time - W_val1_time[ms->solve_level]) > 1e-13) return; total_start = gettime(); cp = get_coord_patch(ms, reflevel); grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2]; reflevel_top = reflevel; ts_tmp = timestep; while (reflevel_top > 0 && !(ts_tmp % 2)) { ts_tmp /= 2; reflevel_top--; } reflevel_top = MAX(reflevel_top, ms->solve_level_max); if (reflevel < ms->solve_level_max) goto skip_solve; start = gettime(); fill_eq_coeffs(ms, cp, cp->solver); ms->time_fill += gettime() - start; ms->count_fill++; start = gettime(); if (reflevel > ms->solve_level_max) { /* outer-most level for this solve, use extrapolated values as boundary condition */ if (fabs(time - W_val1_time[reflevel - 1]) > 1e-13) { const double fact0 = (W_val1_time[reflevel] - time) / dt; const double fact1 = (time - W_val0_time[reflevel]) / dt; #pragma omp parallel for for (size_t i = 0; i < grid_size; i++) W_val[i] = fact0 * W_val0[i] + fact1 * W_val1[i]; } else { /* use the solution from the coarser level as the initial guess */ CoordPatch *cp_coarse = get_coord_patch(ms, reflevel - 1); ret = guess_from_coarser(ms, cp, cp_coarse, W_val); if (ret < 0) CCTK_WARN(0, "Error setting the initial guess"); } /* fill the solver boundary conditions */ if (cp->solver->local_start[1] + cp->solver->local_size[1] == cp->solver->domain_size) { #pragma omp parallel for for (int j = 0; j < cp->solver->fd_stencil; j++) { MG2DBoundary *bnd = cp->solver->boundaries[MG2D_BOUNDARY_1U]; double *dst = bnd->val + j * bnd->val_stride; for (ptrdiff_t i = -j; i < (ptrdiff_t)cp->solver->local_size[0] + j; i++) { const ptrdiff_t idx = CCTK_GFINDEX3D(cctkGH, ABS(i) + cp->offset_left[0], cp->y_idx, cp->offset_left[1] + cp->solver->local_size[1] - 1 + j); dst[i] = W_val[idx]; } } } if (cp->solver->local_start[0] + cp->solver->local_size[0] == cp->solver->domain_size) { #pragma omp parallel for for (int j = 0; j < cp->solver->fd_stencil; j++) { MG2DBoundary *bnd = cp->solver->boundaries[MG2D_BOUNDARY_0U]; double *dst = bnd->val + j * bnd->val_stride; for (ptrdiff_t i = -j; i < (ptrdiff_t)cp->solver->local_size[1] + j; i++) { const ptrdiff_t idx = CCTK_GFINDEX3D(cctkGH, cp->offset_left[1] + cp->solver->local_size[0] - 1 + j, cp->y_idx, cp->offset_left[1] + ABS(i)); dst[i] = W_val[idx]; } } } } ms->time_bnd += gettime() - start; ms->count_bnd++; fprintf(stderr, "%d qms solve: time %g step %d\n", reflevel, time, timestep); start = gettime(); ret = mg2d_solve(cp->solver); ms->time_mg2d += gettime() - start; ms->count_mg2d++; if (ret < 0) CCTK_WARN(0, "Error solving the quasi-maximal slicing equation"); skip_solve: start = gettime(); if (reflevel >= ms->solve_level_max) { double *W_val_tl1 = CCTK_VarDataPtr(cctkGH, 1, "QuasiMaximalSlicingMG::W_val"); solution_to_grid(ms, cp, cp->solver, W_val); memcpy(W_val_tl1, W_val, grid_size * sizeof(*W_val_tl1)); } if (timestep % 2) goto skip_export; fprintf(stderr, "%d qms export: time %g step %d\n", reflevel, time, timestep); /* add the solution to the list of past solutions */ { #if 0 memcpy(W_val0, W_val1, sizeof(*W_val0) * grid_size); memcpy(W_val1, W_val, sizeof(*W_val1) * grid_size); #else const double vel_fact = 1.0 / (1 << (MIN(reflevel, ms->solve_level) - reflevel_top)); #pragma omp parallel for for (int i = 0; i < grid_size; i++) { const double sol_new = W_val[i]; const double delta = sol_new - W_eval[i]; W_val0[i] = W_val1[i] + delta - delta * vel_fact; W_val1[i] = sol_new; W_eval[i] = 2 * W_val1[i] - W_val0[i]; } W_val0_time[reflevel] = W_val1_time[reflevel]; W_val1_time[reflevel] = time; #endif } /* linearly extrapolate the past two solution to predict the next one */ { const double time_src_0 = W_val0_time[reflevel]; const double time_src_1 = W_val1_time[reflevel]; const double pred_time = time_src_1 + dt; const double fact0 = (pred_time - time_src_1) / (time_src_0 - time_src_1); const double fact1 = (pred_time - time_src_0) / (time_src_1 - time_src_0); memcpy(W_pred0, W_pred1, sizeof(*W_pred0) * grid_size); W_pred0_time[reflevel] = W_pred1_time[reflevel]; if (reflevel > 0 /*&& reflevel <= ms->solve_level */) { CoordPatch *cp_coarse = get_coord_patch(ms, reflevel - 1); const int integrator_substeps = MoLNumIntegratorSubsteps(); const int local_proc = CCTK_MyProc(ms->gh); const ptrdiff_t bnd_idx = cp->level_size[0] - cctk_nghostzones[0] * integrator_substeps * 2 - 1; const double bnd_loc = bnd_idx * cp->solver->step[0]; const double transition_start = bnd_loc * 0.7; const ptrdiff_t smooth_end[2] = { MIN(cp->extents[local_proc * 4 + 0] + cp->extents[local_proc * 4 + 2], bnd_idx), MIN(cp->extents[local_proc * 4 + 1] + cp->extents[local_proc * 4 + 3], bnd_idx) }; const ptrdiff_t smooth_size[2] = { smooth_end[0] - cp->extents[local_proc * 4 + 0], smooth_end[1] - cp->extents[local_proc * 4 + 1] }; double tp0 = W_pred0_time[reflevel - 1]; double tp1 = W_pred1_time[reflevel - 1]; const double factp0 = (pred_time - tp1) / (tp0 - tp1); const double factp1 = (pred_time - tp0) / (tp1 - tp0); qms_assert(pred_time >= tp0 && pred_time <= tp1); //mg2d_interp(cp->solver, cp->interp_tmp0 + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), // cctk_lsh[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cctk_lsh[0] - cp->offset_left[0], cctk_lsh[2] - cp->offset_left[1] }, // cp->solver->step, // W_pred0_coarse, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, // (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); //mg2d_interp(cp->solver, cp->interp_tmp1 + CCTK_GFINDEX3D(ms->gh, cp->offset_left[0], 0, cp->offset_left[1]), // cctk_lsh[0], (ptrdiff_t [2]){ 0, 0 }, (size_t [2]){cctk_lsh[0] - cp->offset_left[0], cctk_lsh[2] - cp->offset_left[1] }, // cp->solver->step, // W_pred1_coarse, cp_coarse->grid_size[0], (ptrdiff_t [2]){ -cp_coarse->offset_left[0], -cp_coarse->offset_left[1] }, // (size_t [2]){cp_coarse->grid_size[0], cp_coarse->grid_size[2] }, cp_coarse->solver->step); interp_coarse(ms, cp, cp->interp_tmp0, cp_coarse, VarDataPtrI(ms->gh, 0, reflevel - 1, local_proc, 0, CCTK_VarIndex("QuasiMaximalSlicingMG::W_pred0"))); interp_coarse(ms, cp, cp->interp_tmp1, cp_coarse, VarDataPtrI(ms->gh, 0, reflevel - 1, local_proc, 0, CCTK_VarIndex("QuasiMaximalSlicingMG::W_pred1"))); #pragma omp parallel for for (int i = 0; i < grid_size; i++) W_pred1[i] = factp0 * cp->interp_tmp0[i] + factp1 * cp->interp_tmp1[i]; #pragma omp parallel for for (int idx_z = 0; idx_z < smooth_size[1]; idx_z++) for (int idx_x = 0; idx_x < smooth_size[0]; idx_x++) { const ptrdiff_t idx = CCTK_GFINDEX3D(ms->gh, cp->offset_left[0] + idx_x, 0, cp->offset_left[1] + idx_z); const double x_val = x[idx]; const double z_val = z[idx]; const double coarse_val = W_pred1[idx]; const double r = sqrt(SQR(x_val) + SQR(z_val)); const double dist = (r - transition_start) / (bnd_loc - transition_start); const double bnd_fact = smooth_step(dist); W_pred1[idx] = (W_val1[idx] * fact1 + W_val0[idx] * fact0) * (1.0 - bnd_fact) + coarse_val * bnd_fact; } mirror(cp, W_pred1); } else if (reflevel == 0) { const double bnd_loc = (cp->level_size[0] - cctk_nghostzones[0]) * cp->solver->step[0]; const double transition_start = bnd_loc * 0.7; #pragma omp parallel for for (int idx = 0; idx < grid_size; idx++) { const double x_val = x[idx]; const double z_val = z[idx]; const double coarse_val = W_pred1[idx]; const double r = sqrt(SQR(x_val) + SQR(z_val)); const double dist = (r - transition_start) / fabs(bnd_loc - transition_start); const double bnd_fact = 1.0 - smooth_step(dist); W_pred1[idx] = (W_val1[idx] * fact1 + W_val0[idx] * fact0) * bnd_fact; } } else { #pragma omp parallel for for (int i = 0; i < grid_size; i++) W_pred1[i] = (W_val1[i] * fact1 + W_val0[i] * fact0); } W_pred1_time[reflevel] = pred_time; } ms->time_export += gettime() - start; ms->count_export++; skip_export: ms->time_solve += gettime() - total_start; ms->count_solve++; if (stats_every > 0 && reflevel == ms->solve_level && ms->count_solve > 0 && ms->count_eval > 0 && !(ms->count_solve % stats_every)) print_stats(ms); } void qms_mg_sync(CCTK_ARGUMENTS) { } void qms_mg_eval(CCTK_ARGUMENTS) { QMSMGContext *ms; DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; const int reflevel = ctz(cctkGH->cctk_levfac[0]); const double time = cctkGH->cctk_time; const int64_t grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2]; int64_t start; int i, ret; ms = qms_context; if (time >= switchoff_time + 1.0) return; start = gettime(); { double *u0 = W_pred0; double *u1 = W_pred1; double time0 = W_pred0_time[reflevel]; double time1 = W_pred1_time[reflevel]; double fact, fact0, fact1; if (time0 == DBL_MAX && time1 == DBL_MAX) { fact0 = 0.0; fact1 = 0.0; } else if (time0 == DBL_MAX) { fact0 = 0.0; fact1 = 1.0; } else { fact0 = (time - time1) / (time0 - time1); fact1 = (time - time0) / (time1 - time0); } if (time < switchoff_time) fact = 1.0; else fact = exp(-36.0 * pow((time - switchoff_time), 4.0)); #pragma omp parallel for for (int i = 0; i < grid_size; i++) W[i] = (u1[i] * fact1 + u0[i] * fact0) * fact; } ms->time_eval += gettime() - start; ms->count_eval++; } static int context_init(cGH *gh, int solve_level, int solve_level_max, int fd_stencil, int maxiter, int exact_size, int nb_cycles, int nb_relax_pre, int nb_relax_post, double tol_residual_base, double cfl_factor, double outer_smooth_fact, int adaptive_step, const char *loglevel_str, int boundary_offset, QMSMGContext **ctx) { QMSMGContext *qms; int loglevel = -1; int ret; qms = calloc(1, sizeof(*qms)); if (!qms) return -ENOMEM; qms->gh = gh; qms->solve_level = solve_level; qms->solve_level_max = solve_level_max; qms->fd_stencil = fd_stencil; qms->maxiter = maxiter; qms->max_exact_size = exact_size; qms->nb_cycles = nb_cycles; qms->nb_relax_pre = nb_relax_pre; qms->nb_relax_post = nb_relax_post; qms->tol_residual_base = tol_residual_base; qms->cfl_factor = cfl_factor; qms->adaptive_step = adaptive_step; qms->boundary_offset = boundary_offset; qms->outer_smooth_fact = outer_smooth_fact; for (int i = 0; i < ARRAY_ELEMS(log_levels); i++) { if (!strcmp(loglevel_str, log_levels[i].str)) { loglevel = log_levels[i].level; break; } } if (loglevel < 0) { fprintf(stderr, "Invalid loglevel: %s\n", loglevel_str); return -EINVAL; } qms->log_level = loglevel; *ctx = qms; return 0; } static void context_free(QMSMGContext **pms) { QMSMGContext *ms = *pms; if (!ms) return; for (int i = 0; i < ms->nb_patches; i++) coord_patch_free(&ms->patches[i]); free(ms->patches); free(ms); *pms = NULL; } void qms_mg_init(CCTK_ARGUMENTS) { DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; const int reflevel = ctz(cctk_levfac[0]); int use_tapered_grids = *(int*)CCTK_ParameterGet("use_tapered_grids", "Carpet", &use_tapered_grids); int ret; fprintf(stderr, "%d init\n", reflevel); if (!use_tapered_grids) CCTK_WARN(0, "MaximalSlicingAxiMG only works with use_tapered_grids=1"); if (!qms_context) { ret = context_init(cctkGH, solve_level, solve_level_max, fd_stencil, maxiter, exact_size, nb_cycles, nb_relax_pre, nb_relax_post, tol_residual_base, cfl_factor, outer_smooth_fact, adaptive_step, loglevel, boundary_offset, &qms_context); if (ret < 0) CCTK_WARN(0, "Error initializing the solver context"); } alloc_coord_patch(qms_context, reflevel); } void qms_mg_inithist(CCTK_ARGUMENTS) { DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; const int reflevel = ctz(cctk_levfac[0]); const int reflevel_solve = MIN(reflevel, solve_level); size_t grid_size = cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2]; double dt; dt = cctk_delta_time / (1 << reflevel_solve); dt *= 2; // timestep for QMS is equal to the evolution step for the coarser level fprintf(stderr, "%d inithist\n", reflevel); W_val1_time[reflevel] = -dt; W_val0_time[reflevel] = W_val1_time[reflevel] - dt; W_pred1_time[reflevel] = W_val1_time[reflevel] + dt; W_pred0_time[reflevel] = W_pred1_time[reflevel] - dt; memset(W_pred0, 0, sizeof(double) * grid_size); memset(W_pred1, 0, sizeof(double) * grid_size); memset(W_val0, 0, sizeof(double) * grid_size); memset(W_val1, 0, sizeof(double) * grid_size); memset(W_val, 0, sizeof(double) * grid_size); } void qms_mg_terminate_print_stats(CCTK_ARGUMENTS) { const int reflevel = ctz(cctkGH->cctk_levfac[0]); if (reflevel == 0 && qms_context) { print_stats(qms_context); context_free(&qms_context); } } void qms_mg_register_symmetries(CCTK_ARGUMENTS) { DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; int sym[3] = { 1, 1, 1 }; int ret; ret = SetCartSymVN(cctkGH, sym, "QuasiMaximalSlicingMG::W_val"); if (ret != 0) CCTK_WARN(0, "Error registering symmetries"); } void qms_mg_select_bc(CCTK_ARGUMENTS) { DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; int ret = Boundary_SelectVarForBC(cctkGH, CCTK_ALL_FACES, 1, -1, "QuasiMaximalSlicingMG::W_val", "none"); if (ret != 0) CCTK_WARN(0, "Error registering boundaries"); }