From d53f73b7f7c728e96ffc07b55fa30b9e6fc5121c Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 7 Apr 2018 18:13:49 +0200 Subject: Initial commit. --- src/md.c | 573 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 573 insertions(+) create mode 100644 src/md.c (limited to 'src/md.c') diff --git a/src/md.c b/src/md.c new file mode 100644 index 0000000..21e38fc --- /dev/null +++ b/src/md.c @@ -0,0 +1,573 @@ +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cctk.h" +#include "cctk_Arguments.h" +#include "cctk_Parameters.h" +#include "cctk_Timers.h" +#include "util_Table.h" + +#include "md.h" +#include "md_solve.h" +#include "threadpool.h" + +typedef struct EvalContext { + struct MDContext *md; + struct CoordPatch *cp; + const double *x; + const double *z; + double *W; + + const double *coeffs; + double nb_coeffs[2]; + + double *eval_tmp[2]; + + unsigned int x_idx_start; + unsigned int x_idx_end; + unsigned int z_idx_start; + unsigned int z_idx_end; +} EvalContext; + +/* precomputed values for a given refined grid */ +typedef struct CoordPatch { + CCTK_REAL origin[3]; + CCTK_INT delta[3]; + CCTK_INT size[3]; + + // basis values on the grid + double *basis_val_r; + double *basis_val_z; + + double *transform_z; + double *transform_matrix; + double *transform_matrix1; + double *transform_matrix2; + double *transform_matrix3; + double *transform_tmp; + + int y_idx; + + int nb_threads; + ThreadPoolContext *tp; + EvalContext *ec; +} CoordPatch; + +struct MDContext { + MDSolver *solver; + cGH *gh; + ThreadPoolContext *tp; + + struct { + double time; + double *coeffs; + } solution_cache[8]; + int nb_solutions; + + double *coeffs_eval; + + uint64_t grid_expand_count; + uint64_t grid_expand_time; + + CoordPatch *patches; + int nb_patches; +}; + +/* get an approximate "main" frequency component in a basis function */ +static double calc_basis_freq(const MDBasisSetContext *b, int order) +{ + return md_basis_colloc_point(b, order, 1); +} + +static CoordPatch *get_coord_patch(MDContext *md, + CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, + double scale_factor, double scale_power) +{ + cGH *cctkGH = md->gh; + + CoordPatch *cp; + int64_t grid_size; + int i, block_size; + const char *nb_threads; + + for (int i = 0; i < md->nb_patches; i++) { + cp = &md->patches[i]; + + if (cp->origin[0] == md->gh->cctk_origin_space[0] && + cp->origin[1] == md->gh->cctk_origin_space[1] && + cp->origin[2] == md->gh->cctk_origin_space[2] && + cp->size[0] == md->gh->cctk_lsh[0] && + cp->size[1] == md->gh->cctk_lsh[1] && + cp->size[2] == md->gh->cctk_lsh[2] && + cp->delta[0] == md->gh->cctk_levfac[0] && + cp->delta[1] == md->gh->cctk_levfac[1] && + cp->delta[2] == md->gh->cctk_levfac[2]) + return cp; + } + + grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2]; + + /* create a new patch */ + md->patches = realloc(md->patches, sizeof(*md->patches) * (md->nb_patches + 1)); + cp = &md->patches[md->nb_patches]; + + memset(cp, 0, sizeof(*cp)); + + memcpy(cp->origin, md->gh->cctk_origin_space, sizeof(cp->origin)); + memcpy(cp->size, md->gh->cctk_lsh, sizeof(cp->size)); + memcpy(cp->delta, md->gh->cctk_levfac, sizeof(cp->delta)); + + for (i = 0; i < cp->size[1]; i++) + if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) { + cp->y_idx = i; + break; + } + if (i == cp->size[1]) + CCTK_WARN(0, "The grid does not include y==0"); + +#if MD_POLAR || 1 + posix_memalign((void**)&cp->transform_matrix, 32, sizeof(*cp->transform_matrix) * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix2, 32, sizeof(*cp->transform_matrix2) * md->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix3, 32, sizeof(*cp->transform_matrix3) * md->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]); +#pragma omp parallel for + for (int j = 0; j < cp->size[2]; j++) { + double zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, j)]; + + for (int i = 0; i < cp->size[0]; i++) { + const int idx_grid = j * cp->size[0] + i; + + double xx = x[CCTK_GFINDEX3D(md->gh, i, 0, 0)]; + double rr = sqrt(SQR(xx) + SQR(zz)); + + double coord0 = xx; + double coord1 = zz; + + //for (int k = 0; k < md->nb_coeffs_z; k++) + // for (int l = 0; l < md->nb_coeffs_x; l++) { + // const int idx_coeff = k * md->nb_coeffs_x + l; + // cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = md->basis->eval(r, l) * md->basis1->eval(phi, k); + // } + for (int k = 0; k < md->solver->nb_coeffs[0]; k++) { + double dx = calc_basis_freq(md->solver->basis[0][0], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[0][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact; + } + for (int k = 0; k < md->solver->nb_coeffs[1]; k++) { + double dx = calc_basis_freq(md->solver->basis[0][1], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix1[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[0][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact; + } + for (int k = 0; k < md->solver->nb_coeffs[0]; k++) { + double dx = calc_basis_freq(md->solver->basis[1][0], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix2[idx_grid + cp->size[0] * cp->size[2] * k] = md_basis_eval(md->solver->basis[1][0], MD_BASIS_EVAL_TYPE_VALUE, coord0, k) * fact; + } + for (int k = 0; k < md->solver->nb_coeffs[1]; k++) { + double dx = calc_basis_freq(md->solver->basis[1][1], k); + double r0 = MIN(60.0, dx * scale_factor); + double fact = exp(-36.0 * pow(rr / r0, scale_power)); + + cp->transform_matrix3[idx_grid * md->solver->nb_coeffs[1] + k] = md_basis_eval(md->solver->basis[1][1], MD_BASIS_EVAL_TYPE_VALUE, coord1, k) * fact; + } + } + } + posix_memalign((void**)&cp->transform_tmp, 32, sizeof(*cp->transform_tmp) * cp->size[0] * cp->size[2] * md->solver->nb_coeffs[1]); +#else + posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * md->solver->nb_coeffs[0] * md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0]); + for (int j = 0; j < md->gh->cctk_lsh[1]; j++) + for (int i = 0; i < md->gh->cctk_lsh[0]; i++) { + CCTK_REAL xx = x[CCTK_GFINDEX3D(md->gh, i, j, 0)]; + CCTK_REAL yy = y[CCTK_GFINDEX3D(md->gh, i, j, 0)]; + CCTK_REAL r = sqrt(SQR(xx) + SQR(yy)); + + for (int k = 0; k < md->solver->nb_coeffs[0]; k++) + //cp->basis_val_r [(j * md->gh->cctk_lsh[0] + i) * md->nb_coeffs_x + k] = md->basis->eval(r, k); + cp->basis_val_r [(j * md->gh->cctk_lsh[0] + i) + md->gh->cctk_lsh[1] * md->gh->cctk_lsh[0] * k] = md->solver->basis[0]->eval(r, k); + } + + posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * md->solver->nb_coeffs[1] * md->gh->cctk_lsh[2]); + for (int i = 0; i < md->gh->cctk_lsh[2]; i++) { + CCTK_REAL zz = z[CCTK_GFINDEX3D(md->gh, 0, 0, i)]; + for (int j = 0; j < md->solver->nb_coeffs[1]; j++) + cp->basis_val_z [i * md->solver->nb_coeffs[1] + j] = md->solver->basis[0]->eval(fabs(zz), j); + //cp->basis_val_z [i + md->gh->cctk_lsh[2] * j] = md->basis->eval(zz, j); + } + posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * md->solver->nb_coeffs[0]); +#endif + +#if 0 + nb_threads = getenv("OMP_NUM_THREADS"); + if (nb_threads) + cp->nb_threads = atoi(nb_threads); + if (cp->nb_threads <= 0) + cp->nb_threads = 1; + md_threadpool_init(&cp->tp, cp->nb_threads); + cp->ec = calloc(cp->nb_threads, sizeof(*cp->ec)); + + block_size = (md->gh->cctk_lsh[2] + cp->nb_threads - 1) / cp->nb_threads; + + for (int i = 0; i < cp->nb_threads; i++) { + EvalContext *ec = &cp->ec[i]; + + ec->md = md; + + ec->nb_coeffs[0] = md->solver->nb_coeffs[0]; + ec->nb_coeffs[1] = md->solver->nb_coeffs[1]; + + posix_memalign((void**)&ec->eval_tmp[0], 32, sizeof(*ec->eval_tmp[0]) * ec->nb_coeffs[0]); + posix_memalign((void**)&ec->eval_tmp[1], 32, sizeof(*ec->eval_tmp[1]) * ec->nb_coeffs[1]); + + ec->x_idx_start = 0; + ec->x_idx_end = md->gh->cctk_lsh[0]; + + ec->z_idx_start = block_size * i; + ec->z_idx_end = MIN(block_size * (i + 1), md->gh->cctk_lsh[2]); + } +#endif + + md->nb_patches++; + return cp; +} + +static MDContext *md_context; + +static int context_init(cGH *cctkGH) +{ + int threads_type; + const int *threads = CCTK_ParameterGet("num_threads", "Carpet", &threads_type); + + MDContext *md; + int ret; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + md = calloc(1, sizeof(*md)); + if (!md) + return -ENOMEM; + + md->gh = cctkGH; + + ret = md_threadpool_init(&md->tp, *threads); + if (ret < 0) + return ret; + + ret = md_solver_init(&md->solver, cctkGH, md->tp, 2, + (unsigned int [2][2]){ { basis_order_r, basis_order_z }, + { basis_order_r, basis_order_z }}, + scale_factor, filter_power, 0.0); + if (ret < 0) + return ret; + + ret = posix_memalign((void**)&md->coeffs_eval, 32, + basis_order_r * basis_order_z * sizeof(*md->coeffs_eval)); + if (ret) + return -ENOMEM; + + for (int i = 0; i < ARRAY_ELEMS(md->solution_cache); i++) { + ret = posix_memalign((void**)&md->solution_cache[i].coeffs, 32, + 2 * basis_order_r * basis_order_z * sizeof(*md->solution_cache[i].coeffs)); + if (ret) + return -ENOMEM; + } + + md_context = md; + + return 0; +} + +void minimal_distortion_solve(CCTK_ARGUMENTS) +{ + MDContext *md; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + double time; + + if (!md_context) + context_init(cctkGH); + + md = md_context; + + time = cctkGH->cctk_time / md->gh->cctk_delta_time; + + //if (md->gh->cctk_levfac[0] != 1 || fabs(time - ceilf(time)) > 1e-8 || + // (md->nb_solutions && md->solution_cache[md->nb_solutions - 1].time == cctkGH->cctk_time)) + // return; + //if (md->gh->cctk_time < 10.0) + // return; + + CCTK_TimerStart("MinimalDistortion_Solve"); + md_solver_solve(md->solver); + CCTK_TimerStop("MinimalDistortion_Solve"); + + fprintf(stderr, "%d md solve: time %g %g %g\n", md->gh->cctk_levfac[0], md->gh->cctk_time, time, md->solver->coeffs[0]); + if (1) { + double *tmp; + if (md->nb_solutions == ARRAY_ELEMS(md->solution_cache)) { + tmp = md->solution_cache[0].coeffs; + memmove(md->solution_cache, md->solution_cache + 1, sizeof(md->solution_cache[0]) * (ARRAY_ELEMS(md->solution_cache) - 1)); + } else { + md->nb_solutions++; + tmp = md->solution_cache[md->nb_solutions - 1].coeffs; + } + md->solution_cache[md->nb_solutions - 1].coeffs = md->solver->coeffs; + md->solution_cache[md->nb_solutions - 1].time = md->gh->cctk_time; + + md->solver->coeffs = tmp; + } +} + +double md_scalarproduct_metric_avx(size_t len1, size_t len2, const double *mat, + const double *vec1, const double *vec2); + +static double md_scalarproduct_metric_c(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2) +{ + double val = 0.0; + for (int l = 0; l < len2; l++) { + double tmp = 0.0; + for (int m = 0; m < len1; m++) + tmp += mat[l * len1 + m] * vec1[m]; + + val += tmp * vec2[l]; + } + return val; +} + +#if 0 +static void md_eval(void *arg, + unsigned int job_id, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads) +{ + EvalContext *e = (EvalContext*)arg + job_id; + CoordPatch *cp = e->cp; + MDContext *md = e->md; + const cGH *gh = e->md->gh; + double *W = e->W; + + for (int k = e->z_idx_start; k < e->z_idx_end; k++) { + for (int i = e->x_idx_start; i < e->x_idx_end; i++) { + int idx = CCTK_GFINDEX3D(gh, i, cp->y_idx, k); + double xx = e->x[idx]; + double zz = e->z[idx]; + double r = sqrt(SQR(xx) + SQR(zz)); + double phi = atan2(zz, xx); + + double *basis_vec1 = e->eval_tmp[0]; + double *basis_vec2 = e->eval_tmp[1]; + + for (int l = 0; l < e->nb_coeffs[0]; l++) + basis_vec1[l] = md->solver->basis[0]->eval(r, l); + for (int l = 0; l < e->nb_coeffs[0]; l++) + basis_vec2[l] = md->solver->basis[1]->eval(phi, l); + + W[idx] = md_scalarproduct_metric_avx(e->nb_coeffs[0], e->nb_coeffs[1], e->coeffs, + basis_vec1, basis_vec2); + } + } +} +#endif + +void minimal_distortion_eval(CCTK_ARGUMENTS) +{ + MDContext *md; + + CoordPatch *cp; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + double *beta1 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta1"); + double *beta3 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::beta3"); + + double time; + + int64_t expand_start; + + double *coeffs = NULL; + int i, ret; + + if (!md_context) + context_init(cctkGH); + + time = cctkGH->cctk_time; + + md = md_context; + + cp = get_coord_patch(md, x, y, z, scale_factor, scale_power); + +#if 1 + //coeffs = md->coeffs; + coeffs = md->solution_cache[md->nb_solutions - 1].coeffs; +#elif 0 + if (time < 10.0) { + return; + } else if (time < 11.0) { + double fact = exp(-36.0 * pow((10.0 - time), 4.0)); + double *coeffs_src = md->solution_cache[md->nb_solutions - 1].coeffs; + + coeffs = md->coeffs_eval; + for (int i = 0; i < md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1] * 2; i++) + coeffs[i] = coeffs_src[i] * fact; + } else + coeffs = md->solution_cache[md->nb_solutions - 1].coeffs; + +#else + coeffs = md->coeffs_eval; + + if (cctkGH->cctk_levfac[0] < 1 || md->nb_solutions < 2) { + memset(coeffs, 0, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]); + //fprintf(stderr, "md eval: time %g zero\n", md->gh->cctk_time); + } else { + double *coeffs0 = md->solution_cache[md->nb_solutions - 2].coeffs; + double *coeffs1 = md->solution_cache[md->nb_solutions - 1].coeffs; + double time0 = md->solution_cache[md->nb_solutions - 2].time; + double time1 = md->solution_cache[md->nb_solutions - 1].time; + + double fact = 1.0; + + //if (time < 9.0) + // fact = 1.0; + //else + // fact = exp(-36.0 * pow((time - 9.0), 4.0)); + //else if (time < 0.1) + // fact = 0.0; + //else + // fact = (1.0 - exp(-pow((time - 0.0) / 0.25, 4.0))); + //fact = 1.0; + + //fprintf(stderr, "md eval: time %g interp from %g %g %g\n", md->gh->cctk_time, time0, time1, fact); + + for (int i = 0; i < 2 * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]; i++) + coeffs[i] = (coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1)) * fact; + + } +#endif + + if (export_coeffs) { + memcpy(betax_coeffs, coeffs, sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]); + memcpy(betaz_coeffs, coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1], + sizeof(*coeffs) * md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1]); + } + + CCTK_TimerStart("MinimalDistortion_Expand"); + expand_start = gettime(); +#if 0 +#pragma omp parallel for + for (int k = 0; k < cctk_lsh[2]; k++) { + for (int i = 0; i < cctk_lsh[0]; i++) { + int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k); + double xx = x[idx]; + double zz = z[idx]; + double r = sqrt(SQR(xx) + SQR(zz)); + double phi = atan2(zz, xx); + + double val = 0.0; + + for (int l = 0; l < md->nb_coeffs_z; l++) { + double tmp = 0.0; + for (int m = 0; m < md->nb_coeffs_x; m++) { + const int idx_coeff = l * md->nb_coeffs_x + m; + tmp += coeffs[idx_coeff] * md->basis->eval(r, m); + } + val += tmp * md->basis1->eval(phi, l); + } + + W[idx] = val; + } + } +#elif 0 + { + for (int i = 0; i < cp->nb_threads; i++) { + cp->ec[i].cp = cp; + cp->ec[i].x = x; + cp->ec[i].z = z; + cp->ec[i].W = W; + cp->ec[i].coeffs = coeffs; + } + md_threadpool_execute(cp->tp, cp->nb_threads, md_eval, cp->ec); + } +#elif MD_POLAR || 1 + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0], + 1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2], + coeffs, md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +#pragma omp parallel for + for (int j = 0; j < cctk_lsh[2]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + const int idx_grid = j * cctk_lsh[0] + i; + const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * md->solver->nb_coeffs[1], 1, + cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); + beta1[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; + } + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[0] * cctk_lsh[2], md->solver->nb_coeffs[1], md->solver->nb_coeffs[0], + 1.0, cp->transform_matrix2, cctk_lsh[0] * cctk_lsh[2], + coeffs + md->solver->nb_coeffs[0] * md->solver->nb_coeffs[1], + md->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +#pragma omp parallel for + for (int j = 0; j < cctk_lsh[2]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + const int idx_grid = j * cctk_lsh[0] + i; + const double val = cblas_ddot(md->solver->nb_coeffs[1], cp->transform_matrix3 + idx_grid * md->solver->nb_coeffs[1], 1, + cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); + beta3[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; + } +#else + memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W)); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + md->solver->nb_coeffs[0], cctk_lsh[2], md->solver->nb_coeffs[1], 1.0, + coeffs, md->solver->nb_coeffs[0], cp->basis_val_z, md->solver->nb_coeffs[1], + 0.0, cp->transform_z, md->solver->nb_coeffs[0]); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], md->solver->nb_coeffs[0], 1.0, + cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, md->solver->nb_coeffs[0], + 1.0, W, cctk_lsh[0] * cctk_lsh[1]); +#endif + + md->grid_expand_time += gettime() - expand_start; + md->grid_expand_count++; + + CCTK_TimerStop("MinimalDistortion_Expand"); + + /* print stats */ + if (!(md->grid_expand_count & 255)) { + fprintf(stderr, "Minimal distortion stats:\n"); + + md_solver_print_stats(md->solver); + + fprintf(stderr, + "%lu evals: total time %g s, avg time per call %g md\n", + md->grid_expand_count, (double)md->grid_expand_time / 1e6, + (double)md->grid_expand_time / md->grid_expand_count / 1e3); + } +} + +void minimal_distortion_init(CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + if (!md_context) + context_init(cctkGH); +} -- cgit v1.2.3