#include #include #include #include #include #include #include #include #include #include #include #include #include #include "cctk.h" #include "cctk_Arguments.h" #include "cctk_Parameters.h" #include "cctk_Timers.h" #include "util_Table.h" #include "maximal_slicing_axi.h" #define ACC_TEST 0 /* * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) * SB(x, n) = sin((n + 1) arccot(|x| / L)) * They are symmetric wrt origin and decay as 1/x in infinity. */ double scale_factor; /* mapping between our indices and thorn names */ static const char *metric_vars[] = { [GTXX] = "ML_BSSN::gt11", [GTYY] = "ML_BSSN::gt22", [GTZZ] = "ML_BSSN::gt33", [GTXY] = "ML_BSSN::gt12", [GTXZ] = "ML_BSSN::gt13", [GTYZ] = "ML_BSSN::gt23", [ATXX] = "ML_BSSN::At11", [ATYY] = "ML_BSSN::At22", [ATZZ] = "ML_BSSN::At33", [ATXY] = "ML_BSSN::At12", [ATXZ] = "ML_BSSN::At13", [ATYZ] = "ML_BSSN::At23", [PHI] = "ML_BSSN::phi", [K] = "ML_BSSN::trK", [XTX] = "ML_BSSN::Xt1", [XTY] = "ML_BSSN::Xt2", [XTZ] = "ML_BSSN::Xt3", [BETAX] = "ML_BSSN::beta1", [BETAY] = "ML_BSSN::beta2", [BETAZ] = "ML_BSSN::beta3", }; /* mapping between the cactus grid values and interpolated values */ static const CCTK_INT interp_operation_indices[] = { [I_GTXX] = GTXX, [I_GTXX_DX] = GTXX, [I_GTXX_DY] = GTXX, [I_GTXX_DZ] = GTXX, [I_GTYY] = GTYY, [I_GTYY_DX] = GTYY, [I_GTYY_DY] = GTYY, [I_GTYY_DZ] = GTYY, [I_GTZZ] = GTZZ, [I_GTZZ_DX] = GTZZ, [I_GTZZ_DY] = GTZZ, [I_GTZZ_DZ] = GTZZ, [I_GTXY] = GTXY, [I_GTXY_DX] = GTXY, [I_GTXY_DY] = GTXY, [I_GTXY_DZ] = GTXY, [I_GTXZ] = GTXZ, [I_GTXZ_DX] = GTXZ, [I_GTXZ_DY] = GTXZ, [I_GTXZ_DZ] = GTXZ, [I_GTYZ] = GTYZ, [I_GTYZ_DX] = GTYZ, [I_GTYZ_DY] = GTYZ, [I_GTYZ_DZ] = GTYZ, [I_PHI] = PHI, [I_PHI_DX] = PHI, [I_PHI_DY] = PHI, [I_PHI_DZ] = PHI, [I_ATXX] = ATXX, [I_ATYY] = ATYY, [I_ATZZ] = ATZZ, [I_ATXY] = ATXY, [I_ATXZ] = ATXZ, [I_ATYZ] = ATYZ, [I_K] = K, [I_K_DX] = K, [I_K_DY] = K, [I_K_DZ] = K, [I_XTX] = XTX, [I_XTY] = XTY, [I_XTZ] = XTZ, [I_BETAX] = BETAX, [I_BETAY] = BETAY, [I_BETAZ] = BETAZ, }; /* the operation (plain value or x/y/z-derivative) to apply during interpolation */ static const CCTK_INT interp_operation_codes[] = { [I_GTXX] = 0, [I_GTXX_DX] = 1, [I_GTXX_DY] = 2, [I_GTXX_DZ] = 3, [I_GTYY] = 0, [I_GTYY_DX] = 1, [I_GTYY_DY] = 2, [I_GTYY_DZ] = 3, [I_GTZZ] = 0, [I_GTZZ_DX] = 1, [I_GTZZ_DY] = 2, [I_GTZZ_DZ] = 3, [I_GTXY] = 0, [I_GTXY_DX] = 1, [I_GTXY_DY] = 2, [I_GTXY_DZ] = 3, [I_GTXZ] = 0, [I_GTXZ_DX] = 1, [I_GTXZ_DY] = 2, [I_GTXZ_DZ] = 3, [I_GTYZ] = 0, [I_GTYZ_DX] = 1, [I_GTYZ_DY] = 2, [I_GTYZ_DZ] = 3, [I_PHI] = 0, [I_PHI_DX] = 1, [I_PHI_DY] = 2, [I_PHI_DZ] = 3, [I_ATXX] = 0, [I_ATYY] = 0, [I_ATZZ] = 0, [I_ATXY] = 0, [I_ATXZ] = 0, [I_ATYZ] = 0, [I_K] = 0, [I_K_DX] = 1, [I_K_DY] = 2, [I_K_DZ] = 3, [I_XTX] = 0, [I_XTY] = 0, [I_XTZ] = 0, [I_BETAX] = 0, [I_BETAY] = 0, [I_BETAZ] = 0, }; static void init_opencl(MaximalSlicingContext *ms) { int err, count; cl_platform_id platform; cl_device_id device; cl_context_properties props[3]; err = clGetPlatformIDs(1, &platform, &count); if (err != CL_SUCCESS || count < 1) { fprintf(stderr, "Could not get an OpenCL platform ID\n"); return; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &count); if (err != CL_SUCCESS || count < 1) { fprintf(stderr, "Could not get an OpenCL device ID\n"); return; } props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties)platform; props[2] = 0; ms->cl_ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS || !ms->cl_ctx) { fprintf(stderr, "Could not create an OpenCL context\n"); return; } ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, device, 0, &err); if (err != CL_SUCCESS || !ms->cl_queue) { fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err); goto fail; } err = clblasSetup(); if (err != CL_SUCCESS) { fprintf(stderr, "Error setting up clBLAS\n"); goto fail; } ms->ocl_coeffs = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_p = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_v = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_y = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_z = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_t = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_res = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_tmp = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_k = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_mat = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err); ms->bicgstab.cl_rho = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); ms->bicgstab.cl_alpha = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); ms->bicgstab.cl_beta = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); ms->bicgstab.cl_omega = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err); ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); return; fail: if (ms->cl_queue) clReleaseCommandQueue(ms->cl_queue); ms->cl_queue = 0; if (ms->cl_ctx) clReleaseContext(ms->cl_ctx); ms->cl_ctx = 0; } static MaximalSlicingContext *init_ms(cGH *cctkGH, int basis_order_r, int basis_order_z, double sf, CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, const int grid_size[3]) { MaximalSlicingContext *ms; int ret; //const double h = (x[1] - x[0]) / 8; //fprintf(stderr, "h %g\n", h); //if (basis_order_r != basis_order_z) // CCTK_WARN(0, "Different r and z basis orders are not supported."); ms = calloc(1, sizeof(*ms)); ms->gh = cctkGH; ms->basis = &msa_sb_even_basis; //ms->basis = &full_basis; //ms->basis = &cheb_basis; ms->nb_coeffs_x = basis_order_r; ms->nb_coeffs_z = basis_order_z; ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z; ms->nb_colloc_points_x = basis_order_r; ms->nb_colloc_points_z = basis_order_z; ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z; if (ms->nb_colloc_points != ms->nb_coeffs) CCTK_WARN(0, "Non-square collocation matrix"); ms->colloc_grid_order_x = ms->nb_colloc_points_x; ms->colloc_grid_order_z = ms->nb_colloc_points_z; ms->mat = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs); ms->rhs = malloc(sizeof(double) * ms->nb_colloc_points); ms->mat_f = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); ms->ipiv = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs); #if 1 scale_factor = 1.0; scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 3) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1); //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5; fprintf(stderr, "scale factor %g\n", scale_factor); #else scale_factor = sf; #endif ms->grid_x = malloc(ms->nb_colloc_points_x * sizeof(*ms->grid_x)); for (int i = 0; i < ms->nb_colloc_points_x; i++) { #if 0 double target_val = ms->basis->colloc_point(ms->colloc_grid_order_x, i); double best_diff = DBL_MAX, best_val = DBL_MAX; for (int j = 0; j < grid_size[0]; j++) { int idx = CCTK_GFINDEX3D(cctkGH, j, 0, 0); double val = x[idx]; double diff = fabs(target_val - val); if (val > 0.0 && diff < best_diff) { int k; for (k = 0; k < i; k++) if (ms->grid_x[k] == val) break; if (k == i) { best_diff = diff; best_val = val; } } } if (best_val == DBL_MAX) abort(); fprintf(stderr, "%d %g -> %g (%g)\n", i, target_val, best_val, target_val - best_val); ms->grid_x[i] = best_val; #elif 0 double max = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)]; ms->grid_x[i] = max * SQR(SQR((double)i / ms->nb_colloc_points_x)) + 0.001; #else ms->grid_x[i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i); #endif fprintf(stderr, "%d %g\n", i, ms->grid_x[i]); } ms->grid_z = malloc(ms->nb_colloc_points_z * sizeof(*ms->grid_z)); for (int i = 0; i < ms->nb_colloc_points_z; i++) { ms->grid_z[i] = ms->basis->colloc_point(ms->colloc_grid_order_z, i); } /* precompute the basis values we will need */ ms->basis_x_val = malloc(sizeof(*ms->basis_x_val) * ms->nb_colloc_points_x * ms->nb_coeffs_x); ms->basis_x_dval = malloc(sizeof(*ms->basis_x_dval) * ms->nb_colloc_points_x * ms->nb_coeffs_x); ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x); for (int i = 0; i < ms->nb_colloc_points_x; i++) { CCTK_REAL coord = ms->grid_x[i]; for (int j = 0; j < ms->nb_coeffs_x; j++) { ms->basis_x_val [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j); ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j); ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j); } } ms->basis_z_val = malloc(sizeof(*ms->basis_z_val) * ms->nb_colloc_points_z * ms->nb_coeffs_z); ms->basis_z_dval = malloc(sizeof(*ms->basis_z_dval) * ms->nb_colloc_points_z * ms->nb_coeffs_z); ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z); for (int i = 0; i < ms->nb_colloc_points_z; i++) { CCTK_REAL coord = ms->grid_z[i]; for (int j = 0; j < ms->nb_coeffs_z; j++) { ms->basis_z_val [i * ms->nb_coeffs_z + j] = ms->basis->eval(coord, j); ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis->eval_diff1(coord, j); ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis->eval_diff2(coord, j); } } ms->basis_val_00 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); ms->basis_val_11 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); ms->basis_val_10 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); ms->basis_val_01 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); ms->basis_val_02 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); ms->basis_val_20 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); for (int i = 0; i < ms->nb_colloc_points_z; i++) { const double *basis_val_z = ms->basis_z_val + i * ms->nb_coeffs_z; const double *dbasis_val_z = ms->basis_z_dval + i * ms->nb_coeffs_z; const double *d2basis_val_z = ms->basis_z_d2val + i * ms->nb_coeffs_z; for (int j = 0; j < ms->nb_colloc_points_x; j++) { const double *basis_val_x = ms->basis_x_val + j * ms->nb_coeffs_x; const double *dbasis_val_x = ms->basis_x_dval + j * ms->nb_coeffs_x; const double *d2basis_val_x = ms->basis_x_d2val + j * ms->nb_coeffs_x; const int idx_grid = i * ms->nb_colloc_points_x + j; for (int k = 0; k < ms->nb_coeffs_z; k++) for (int l = 0; l < ms->nb_coeffs_x; l++) { const int idx_coeff = k * ms->nb_coeffs_x + l; const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; ms->basis_val_00[idx] = basis_val_x[l] * basis_val_z[k]; ms->basis_val_11[idx] = dbasis_val_x[l] * dbasis_val_z[k]; ms->basis_val_10[idx] = dbasis_val_x[l] * basis_val_z[k]; ms->basis_val_01[idx] = basis_val_x[l] * dbasis_val_z[k]; ms->basis_val_02[idx] = basis_val_x[l] * d2basis_val_z[k]; ms->basis_val_20[idx] = d2basis_val_x[l] * basis_val_z[k]; } } } ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0])); ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1])); ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2])); for (int i = 0; i < ms->nb_colloc_points_z; i++) { CCTK_REAL z = ms->grid_z[i]; for (int j = 0; j < ms->nb_colloc_points_x; j++) { CCTK_REAL x = ms->grid_x[j]; ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x; ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0; ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z; } } for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++) ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i])); ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij)); for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) { ms->interp_values[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i])); ms->interp_value_codes[i] = CCTK_VARIABLE_REAL; } for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]); ms->coord_system = CCTK_CoordSystemHandle("cart3d"); if (ms->coord_system < 0) CCTK_WARN(0, "Error getting the coordinate system"); ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation"); if (ms->interp_operator < 0) CCTK_WARN(0, "Error getting the interpolation operator"); ms->interp_params = Util_TableCreateFromString("order=2 want_global_mode=1"); if (ms->interp_params < 0) CCTK_WARN(0, "Error creating interpolation parameters table"); ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS, interp_operation_codes, "operation_codes"); if (ret < 0) CCTK_WARN(0, "Error setting operation codes"); ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS, interp_operation_indices, "operand_indices"); if (ret < 0) CCTK_WARN(0, "Error setting operand indices"); ms->bicgstab.p = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.v = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.y = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.z = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.t = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.res = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs); ms->bicgstab.k = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); ms->steps_since_inverse = INT_MAX; init_opencl(ms); CCTK_TimerCreate("MaximalSlicingAxi_Solve"); CCTK_TimerCreate("MaximalSlicingAxi_Expand"); CCTK_TimerCreate("MaximalSlicingAxi_calc_geometry"); CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix"); CCTK_TimerCreate("MaximalSlicingAxi_solve_LU"); CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB"); return ms; } static CoordPatch *get_coord_patch(MaximalSlicingContext *ms, CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, CCTK_REAL *alp) { cGH *cctkGH = ms->gh; CoordPatch *cp; int64_t grid_size; for (int i = 0; i < ms->nb_patches; i++) { cp = &ms->patches[i]; if (cp->origin[0] == ms->gh->cctk_origin_space[0] && cp->origin[1] == ms->gh->cctk_origin_space[1] && cp->origin[2] == ms->gh->cctk_origin_space[2] && cp->size[0] == ms->gh->cctk_lsh[0] && cp->size[1] == ms->gh->cctk_lsh[1] && cp->size[2] == ms->gh->cctk_lsh[2] && cp->delta[0] == ms->gh->cctk_delta_space[0] && cp->delta[1] == ms->gh->cctk_delta_space[1] && cp->delta[2] == ms->gh->cctk_delta_space[2]) return cp; } grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2]; /* create a new patch */ ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1)); cp = &ms->patches[ms->nb_patches]; memcpy(cp->origin, ms->gh->cctk_origin_space, sizeof(cp->origin)); memcpy(cp->size, ms->gh->cctk_lsh, sizeof(cp->size)); memcpy(cp->delta, ms->gh->cctk_delta_space, sizeof(cp->delta)); posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]); for (int j = 0; j < ms->gh->cctk_lsh[1]; j++) for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) { CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, j, 0)]; CCTK_REAL yy = y[CCTK_GFINDEX3D(ms->gh, i, j, 0)]; CCTK_REAL r = sqrt(SQR(xx) + SQR(yy)); for (int k = 0; k < ms->nb_coeffs_x; k++) //cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) * ms->nb_coeffs_x + k] = ms->basis->eval(r, k); cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) + ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0] * k] = ms->basis->eval(r, k); } posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]); for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) { CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)]; for (int j = 0; j < ms->nb_coeffs_z; j++) cp->basis_val_z [i * ms->nb_coeffs_z + j] = ms->basis->eval(zz, j); //cp->basis_val_z [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j); } posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x); posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size); for (int i = 0; i < grid_size; i++) cp->one[i] = 1.0; ms->nb_patches++; return cp; } void maximal_slicing_axi(CCTK_ARGUMENTS) { static MaximalSlicingContext *ms; CoordPatch *cp; DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; static double total; static int64_t count; int64_t timer_start; const int64_t grid_size = cctk_lsh[2] * cctk_lsh[1] * cctk_lsh[0]; /* on the first run, init the solver */ if (!ms) { ms = init_ms(cctkGH, basis_order_r, basis_order_z, scale_factor, x, y, z, cctk_lsh); } cp = get_coord_patch(ms, x, y, z, alp); CCTK_TimerStart("MaximalSlicingAxi_Solve"); msa_maximal_solve(ms); CCTK_TimerStop("MaximalSlicingAxi_Solve"); if (export_coeffs) memcpy(alpha_coeffs, ms->coeffs, sizeof(*alpha_coeffs) * ms->nb_coeffs); CCTK_TimerStart("MaximalSlicingAxi_Expand"); timer_start = gettime(); memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp)); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0, ms->coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z, 0.0, cp->transform_z, ms->nb_coeffs_x); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0, cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x, 1.0, alp, cctk_lsh[0] * cctk_lsh[1]); total += gettime() - timer_start; CCTK_TimerStop("MaximalSlicingAxi_Expand"); count++; if (!(count & 15)) fprintf(stderr, "avg %g total %g\n", total / count, total); }