From 6a0eeab600032901e0116e59f304e2643e72d776 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 16 Apr 2018 19:30:05 +0200 Subject: Faster W evaluation. --- interface.ccl | 10 +- schedule.ccl | 1 + src/qms.c | 287 +++++++++++++++++++++++++++++++++++--------------------- src/qms_solve.c | 13 ++- src/qms_solve.h | 2 +- src/register.c | 2 + 6 files changed, 203 insertions(+), 112 deletions(-) diff --git a/interface.ccl b/interface.ccl index 7216b74..5b79b4a 100644 --- a/interface.ccl +++ b/interface.ccl @@ -13,5 +13,11 @@ REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup public: CCTK_REAL W TYPE=GF -CCTK_REAL W_coeffs TYPE=array DIM=3 SIZE=2,basis_order_z,basis_order_r DISTRIB=constant -CCTK_REAL W_pred TYPE=array DIM=3 SIZE=2,basis_order_z,basis_order_r DISTRIB=constant +CCTK_REAL W_pred TYPE=GF +{ + W_pred_0 + W_pred_1 +} "W_pred" + +CCTK_REAL W_coeffs TYPE=array DIM=3 SIZE=2,basis_order_z,basis_order_r DISTRIB=constant +CCTK_REAL W_pred_coeffs TYPE=array DIM=3 SIZE=2,basis_order_z,basis_order_r DISTRIB=constant diff --git a/schedule.ccl b/schedule.ccl index d8eb719..794bfef 100644 --- a/schedule.ccl +++ b/schedule.ccl @@ -27,4 +27,5 @@ SCHEDULE qms_init IN ADMBase_InitialData { STORAGE: W STORAGE: W_pred +STORAGE: W_pred_coeffs STORAGE: W_coeffs diff --git a/src/qms.c b/src/qms.c index 4be70c6..94a7b47 100644 --- a/src/qms.c +++ b/src/qms.c @@ -52,8 +52,14 @@ struct QMSContext { double max_radius; - uint64_t grid_expand_count; - uint64_t grid_expand_time; + uint64_t ps_solve_count; + uint64_t ps_solve_time; + uint64_t pred_build_count; + uint64_t pred_build_time; + uint64_t pred_eval_count; + uint64_t pred_eval_time; + uint64_t w_eval_count; + uint64_t w_eval_time; CoordPatch *patches; int nb_patches; @@ -257,14 +263,12 @@ static int context_init(cGH *cctkGH) for (int i = 0; i < ARRAY_ELEMS(qms->solutions); i++) { qms->solutions[i].coeffs = W_coeffs + i * basis_order_r * basis_order_z; - //memset(qms->solutions[i].coeffs, 0, basis_order_r * basis_order_z * sizeof(*qms->solutions[i].coeffs)); qms->solutions[i].time = cctkGH->cctk_time - (2 - i) * cctkGH->cctk_delta_time / (1.0 * (1 << solve_level)); fprintf(stderr, "init time %d %f\n", i, qms->solutions[i].time); } for (int i = 0; i < ARRAY_ELEMS(qms->pred); i++) { - qms->pred[i].coeffs = W_pred + i * basis_order_r * basis_order_z; - //memset(qms->pred[i].coeffs, 0, basis_order_r * basis_order_z * sizeof(*qms->pred[i].coeffs)); + qms->pred[i].coeffs = W_pred_coeffs + i * basis_order_r * basis_order_z; qms->pred[i].time = cctkGH->cctk_time - (1 - i) * cctkGH->cctk_delta_time / (1.0 * (1 << solve_level)); fprintf(stderr, "init time %d %f\n", i, qms->pred[i].time); } @@ -274,6 +278,32 @@ static int context_init(cGH *cctkGH) return 0; } +static int qms_eval_pred(QMSContext *qms, double *W_pred_0, double *W_pred_1, + double *x, double *y, double *z, int scale_factor, int scale_power) +{ + CoordPatch *cp; + size_t N = qms->gh->cctk_lsh[0] * qms->gh->cctk_lsh[2]; + + cp = get_coord_patch(qms, x, y, z, scale_factor, scale_power); + + memcpy(W_pred_0, W_pred_1, N * sizeof(*W_pred_0)); + + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + N, qms->solver->nb_coeffs[1], qms->solver->nb_coeffs[0], + 1.0, cp->transform_matrix, N, + qms->pred[1].coeffs, qms->solver->nb_coeffs[0], 0.0, cp->transform_tmp, N); +#pragma omp parallel for + for (int j = 0; j < qms->gh->cctk_lsh[2]; j++) + for (int i = 0; i < qms->gh->cctk_lsh[0]; i++) { + const int idx_grid = j * qms->gh->cctk_lsh[0] + i; + const double val = cblas_ddot(qms->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * qms->solver->nb_coeffs[1], 1, + cp->transform_tmp + idx_grid, N); + W_pred_1[CCTK_GFINDEX3D(qms->gh, i, cp->y_idx, j)] = val; + } + + return 0; +} + void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS) { QMSContext *ms; @@ -281,6 +311,8 @@ void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS) DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; + uint64_t timing_start; + int have_solve_level, have_solve_time; double time; if (!qms_context) @@ -291,51 +323,60 @@ void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS) if (cctkGH->cctk_time >= switchoff_time + 1.0) return; - if (ctz(ms->gh->cctk_levfac[0]) != solve_level) - return; - - time = cctkGH->cctk_time * ms->gh->cctk_levfac[0] / ms->gh->cctk_delta_time; + have_solve_level = ctz(ms->gh->cctk_levfac[0]) == solve_level; + time = cctkGH->cctk_time * ms->gh->cctk_levfac[0] / ms->gh->cctk_delta_time; + have_solve_time = fabs(time - ceilf(time)) < 1e-8; - if (fabs(time - ceilf(time)) > 1e-8 || - (ms->solutions[ARRAY_ELEMS(ms->solutions) - 1].time >= cctkGH->cctk_time)) - return; + /* if we are on the right refinement level, solve for the W spectral + * coefficients and produce predictions */ + if (have_solve_level && have_solve_time && + (ms->solutions[ARRAY_ELEMS(ms->solutions) - 1].time < cctkGH->cctk_time)) { + int N = ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]; - CCTK_TimerStart("QuasiMaximalSlicing_Solve"); - qms_solver_solve(ms->solver); - CCTK_TimerStop("QuasiMaximalSlicing_Solve"); + timing_start = gettime(); + qms_solver_solve(ms->solver); + ms->ps_solve_time += gettime() - timing_start; + ms->ps_solve_count++; - fprintf(stderr, "%d qms solve: time %g %g %g\n", ms->gh->cctk_levfac[0], ms->gh->cctk_time, time, ms->solver->coeffs[0]); - - /* add the solution to the list of past solutions */ - { - int N = ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]; + fprintf(stderr, "%d qms solve: time %g %g %g\n", ms->gh->cctk_levfac[0], ms->gh->cctk_time, time, ms->solver->coeffs[0]); + timing_start = gettime(); + /* add the solution to the list of past solutions */ memcpy(ms->solutions[0].coeffs, ms->solutions[1].coeffs, sizeof(*ms->solutions[0].coeffs) * N); ms->solutions[0].time = ms->solutions[1].time; memcpy(ms->solutions[1].coeffs, ms->solver->coeffs, sizeof(*ms->solutions[1].coeffs) * N); ms->solutions[1].time = ms->gh->cctk_time; - } - /* linearly extrapolate the past two solution to predict the next one */ - { - int N = ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]; + /* linearly extrapolate the past two solution to predict the next one */ + { + double time0 = ms->solutions[0].time; + double time1 = ms->solutions[1].time; + double time = time1 + ms->gh->cctk_delta_time / (1.0 * (1 << solve_level)); - double time0 = ms->solutions[0].time; - double time1 = ms->solutions[1].time; - double time = time1 + ms->gh->cctk_delta_time / (1.0 * (1 << solve_level)); + double *coeffs0 = ms->solutions[0].coeffs; + double *coeffs1 = ms->solutions[1].coeffs; + double *pred = ms->pred[1].coeffs; - double *coeffs0 = ms->solutions[0].coeffs; - double *coeffs1 = ms->solutions[1].coeffs; - double *pred = ms->pred[1].coeffs; + memcpy(ms->pred[0].coeffs, ms->pred[1].coeffs, sizeof(*ms->pred[0].coeffs) * N); + ms->pred[0].time = ms->pred[1].time; - memcpy(ms->pred[0].coeffs, ms->pred[1].coeffs, sizeof(*ms->pred[0].coeffs) * N); - ms->pred[0].time = ms->pred[1].time; + for (int i = 0; i < N; i++) + ms->pred[1].coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)); - for (int i = 0; i < N; i++) - ms->pred[1].coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)); + ms->pred[1].time = time; + } + ms->pred_build_time += gettime() - timing_start; + ms->pred_build_count++; + } - ms->pred[1].time = time; + /* after we have the prediction coefficients, + * evaluate predicted values of W */ + if (have_solve_time) { + timing_start = gettime(); + qms_eval_pred(ms, W_pred_0, W_pred_1, x, y, z, scale_factor, scale_power); + ms->pred_eval_time += gettime() - timing_start; + ms->pred_eval_count++; } } @@ -350,7 +391,7 @@ void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS) double time; - int64_t expand_start; + int64_t w_eval_start; double *coeffs = NULL; int i, ret; @@ -367,97 +408,127 @@ void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS) cp = get_coord_patch(ms, x, y, z, scale_factor, scale_power); -#if 0 - //coeffs = ms->coeffs; - coeffs = ms->solutions[ARRAY_ELEMS(ms->solutions) - 1].coeffs; -#else - coeffs = ms->coeffs_eval; - + w_eval_start = gettime(); { - double *coeffs0 = ms->pred[0].coeffs; - double *coeffs1 = ms->pred[1].coeffs; + size_t N = cctk_lsh[0] * cctk_lsh[2]; double time0 = ms->pred[0].time; double time1 = ms->pred[1].time; - double fact; + double fact0, fact1, fact; if (time < switchoff_time) fact = 1.0; else fact = exp(-36.0 * pow((time - switchoff_time), 4.0)); - for (int i = 0; i < ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]; i++) - coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)) * fact; + fact0 = fact * (time - time1) / (time0 - time1); + fact1 = fact * (time - time0) / (time1 - time0); + memcpy(W, W_pred_0, N * sizeof(*W)); + cblas_dscal(N, fact0, W, 1); + cblas_daxpy(N, fact1, W_pred_1, 1, W, 1); } -#endif - CCTK_TimerStart("QuasiMaximalSlicing_Expand"); - expand_start = gettime(); -#if 0 -#pragma omp parallel for - for (int k = 0; k < cctk_lsh[2]; k++) { - for (int i = 0; i < cctk_lsh[0]; i++) { - int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k); - double xx = x[idx]; - double zz = z[idx]; - double r = sqrt(SQR(xx) + SQR(zz)); - double phi = atan2(zz, xx); - - double val = 0.0; - - for (int l = 0; l < ms->nb_coeffs_z; l++) { - double tmp = 0.0; - for (int m = 0; m < ms->nb_coeffs_x; m++) { - const int idx_coeff = l * ms->nb_coeffs_x + m; - tmp += coeffs[idx_coeff] * ms->basis->eval(r, m); - } - val += tmp * ms->basis1->eval(phi, l); - } +//#if 0 +// //coeffs = ms->coeffs; +// coeffs = ms->solutions[ARRAY_ELEMS(ms->solutions) - 1].coeffs; +//#else +// coeffs = ms->coeffs_eval; +// +// { +// double *coeffs0 = ms->pred[0].coeffs; +// double *coeffs1 = ms->pred[1].coeffs; +// double time0 = ms->pred[0].time; +// double time1 = ms->pred[1].time; +// +// double fact; +// +// if (time < switchoff_time) +// fact = 1.0; +// else +// fact = exp(-36.0 * pow((time - switchoff_time), 4.0)); +// +// for (int i = 0; i < ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]; i++) +// coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)) * fact; +// +// } +//#endif +// +// expand_start = gettime(); +//#if 0 +//#pragma omp parallel for +// for (int k = 0; k < cctk_lsh[2]; k++) { +// for (int i = 0; i < cctk_lsh[0]; i++) { +// int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k); +// double xx = x[idx]; +// double zz = z[idx]; +// double r = sqrt(SQR(xx) + SQR(zz)); +// double phi = atan2(zz, xx); +// +// double val = 0.0; +// +// for (int l = 0; l < ms->nb_coeffs_z; l++) { +// double tmp = 0.0; +// for (int m = 0; m < ms->nb_coeffs_x; m++) { +// const int idx_coeff = l * ms->nb_coeffs_x + m; +// tmp += coeffs[idx_coeff] * ms->basis->eval(r, m); +// } +// val += tmp * ms->basis1->eval(phi, l); +// } +// +// W[idx] = val; +// } +// } +//#elif QMS_POLAR || 1 +// cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, +// cctk_lsh[0] * cctk_lsh[2], ms->solver->nb_coeffs[1], ms->solver->nb_coeffs[0], +// 1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2], +// coeffs, ms->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +//#pragma omp parallel for +// for (int j = 0; j < cctk_lsh[2]; j++) +// for (int i = 0; i < cctk_lsh[0]; i++) { +// const int idx_grid = j * cctk_lsh[0] + i; +// const double val = cblas_ddot(ms->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * ms->solver->nb_coeffs[1], 1, +// cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); +// W[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; +// } +//#else +// memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W)); +// cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, +// ms->solver->nb_coeffs[0], cctk_lsh[2], ms->solver->nb_coeffs[1], 1.0, +// coeffs, ms->solver->nb_coeffs[0], cp->basis_val_z, ms->solver->nb_coeffs[1], +// 0.0, cp->transform_z, ms->solver->nb_coeffs[0]); +// cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, +// cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->solver->nb_coeffs[0], 1.0, +// cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->solver->nb_coeffs[0], +// 1.0, W, cctk_lsh[0] * cctk_lsh[1]); +//#endif + + ms->w_eval_time += gettime() - w_eval_start; + ms->w_eval_count++; - W[idx] = val; - } - } -#elif QMS_POLAR || 1 - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - cctk_lsh[0] * cctk_lsh[2], ms->solver->nb_coeffs[1], ms->solver->nb_coeffs[0], - 1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2], - coeffs, ms->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); -#pragma omp parallel for - for (int j = 0; j < cctk_lsh[2]; j++) - for (int i = 0; i < cctk_lsh[0]; i++) { - const int idx_grid = j * cctk_lsh[0] + i; - const double val = cblas_ddot(ms->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * ms->solver->nb_coeffs[1], 1, - cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); - W[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; - } -#else - memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W)); - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - ms->solver->nb_coeffs[0], cctk_lsh[2], ms->solver->nb_coeffs[1], 1.0, - coeffs, ms->solver->nb_coeffs[0], cp->basis_val_z, ms->solver->nb_coeffs[1], - 0.0, cp->transform_z, ms->solver->nb_coeffs[0]); - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->solver->nb_coeffs[0], 1.0, - cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->solver->nb_coeffs[0], - 1.0, W, cctk_lsh[0] * cctk_lsh[1]); -#endif + /* print stats */ + if (!(ms->w_eval_count & 255)) { + uint64_t total_time = ms->ps_solve_time + ms->pred_build_time + + ms->pred_eval_time + ms->w_eval_time; + fprintf(stderr, "Quasi-maximal slicing stats:\n"); - ms->grid_expand_time += gettime() - expand_start; - ms->grid_expand_count++; + fprintf(stderr, "Pseudospectral solve:\n"); + qms_solver_print_stats(ms->solver, 2); - CCTK_TimerStop("QuasiMaximalSlicing_Expand"); + fprintf(stderr, "Pseudospectral solves %lu total time %g s avg per call %g ms\n", + ms->ps_solve_count, (double)ms->ps_solve_time / 1e6, (double)ms->ps_solve_time / ms->ps_solve_count / 1e3); - /* print stats */ - if (!(ms->grid_expand_count & 255)) { - fprintf(stderr, "Quasi-maximal slicing stats:\n"); + fprintf(stderr, "Pred builds %lu total time %g s avg per call %g ms\n", + ms->pred_build_count, (double)ms->pred_build_time / 1e6, (double)ms->pred_build_time / ms->pred_build_count / 1e3); + + fprintf(stderr, "Pred evals %lu total time %g s avg per call %g ms\n", + ms->pred_eval_count, (double)ms->pred_eval_time / 1e6, (double)ms->pred_eval_time / ms->pred_eval_count / 1e3); - qms_solver_print_stats(ms->solver); + fprintf(stderr, "W evals %lu total time %g s avg per call %g ms\n", + ms->w_eval_count, (double)ms->w_eval_time / 1e6, (double)ms->w_eval_time / ms->w_eval_count / 1e3); - fprintf(stderr, - "%lu evals: total time %g s, avg time per call %g ms\n", - ms->grid_expand_count, (double)ms->grid_expand_time / 1e6, - (double)ms->grid_expand_time / ms->grid_expand_count / 1e3); + fprintf(stderr, "QMS total time: %g s\n", total_time / 1e6); } } diff --git a/src/qms_solve.c b/src/qms_solve.c index 3ce463b..08c298f 100644 --- a/src/qms_solve.c +++ b/src/qms_solve.c @@ -992,28 +992,38 @@ int qms_solver_solve(QMSSolver *ctx) return 0; } -void qms_solver_print_stats(QMSSolver *ctx) +static void indent(int level) +{ + for (int i = 0; i < level; i++) + fprintf(stderr, " "); +} + +void qms_solver_print_stats(QMSSolver *ctx, int indent_level) { QMSSolverPriv *s = ctx->priv; + indent(indent_level); fprintf(stderr, "%g%% interpolate geometry: %lu, " "total time %g s, avg time per call %g ms\n", (double)s->interp_geometry_time * 100 / s->solve_time, s->interp_geometry_count, (double)s->interp_geometry_time / 1e6, (double)s->interp_geometry_time / s->interp_geometry_count / 1e3); + indent(indent_level); fprintf(stderr, "%g%% calc equation coefficients: %lu, " "total time %g s, avg time per call %g ms\n", (double)s->calc_eq_coeffs_time * 100 / s->solve_time, s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6, (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3); + indent(indent_level); fprintf(stderr, "%g%% pseudospectral matrix construction: %lu, " "total time %g s, avg time per call %g ms\n", (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time, s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6, (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3); + indent(indent_level); fprintf(stderr, "%g%% BiCGSTAB %lu solves, " "%lu iterations, total time %g s, " @@ -1024,6 +1034,7 @@ void qms_solver_print_stats(QMSSolver *ctx) (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count, (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3, (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3); + indent(indent_level); fprintf(stderr, "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n", (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time, diff --git a/src/qms_solve.h b/src/qms_solve.h index 91520d9..d59e538 100644 --- a/src/qms_solve.h +++ b/src/qms_solve.h @@ -48,6 +48,6 @@ void qms_solver_free(QMSSolver **ctx); int qms_solver_solve(QMSSolver *ctx); -void qms_solver_print_stats(QMSSolver *ctx); +void qms_solver_print_stats(QMSSolver *ctx, int indent_level); #endif /* QMS_SOLVE_H */ diff --git a/src/register.c b/src/register.c index 65c2d21..d491433 100644 --- a/src/register.c +++ b/src/register.c @@ -6,4 +6,6 @@ void qms_register_mol(CCTK_ARGUMENTS) { MoLRegisterConstrained(CCTK_VarIndex("QuasiMaximalSlicing::W")); + MoLRegisterConstrained(CCTK_VarIndex("QuasiMaximalSlicing::W_pred_0")); + MoLRegisterConstrained(CCTK_VarIndex("QuasiMaximalSlicing::W_pred_1")); } -- cgit v1.2.3