#include #include #include #include #include #include #include #include #include "maximal_slicing_axi.h" static int construct_matrix_cylindric(MaximalSlicingContext *ms, double *mat, double *rhs, double *prhs_max) { int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; double rhs_max = 0.0; #define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) #define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) #define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) #define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) #define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) #define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) //memset(mat, 0, sizeof(*mat) * ms->nb_coeffs * ms->nb_colloc_points); #pragma omp parallel for reduction(max : rhs_max) for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) { for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) { CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; const double gtuxx = ms->metric_u[0][idx_grid]; const double gtuyy = ms->metric_u[1][idx_grid]; const double gtuzz = ms->metric_u[2][idx_grid]; const double gtuxz = ms->metric_u[4][idx_grid]; const double phi = ms->interp_values[I_PHI][idx_grid]; const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid]; const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid]; const double Xtx = ms->interp_values[I_XTX][idx_grid]; const double Xtz = ms->interp_values[I_XTZ][idx_grid]; const double k2 = ms->kij_kij[idx_grid]; const double trk = ms->interp_values[I_K][idx_grid]; const double trk_dx = ms->interp_values[I_K_DX][idx_grid]; const double trk_dz = ms->interp_values[I_K_DZ][idx_grid]; const double betax = ms->interp_values[I_BETAX][idx_grid]; const double betaz = ms->interp_values[I_BETAZ][idx_grid]; const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); //const double tr_ric = ms->interp_values[I_TR_R][idx_grid]; const double coeff_20 = SQR(phi) * (gtuxx + ((x_physical <= EPS) ? gtuyy : 0.0)); const double coeff_02 = SQR(phi) * gtuzz; const double coeff_11 = SQR(phi) * gtuxz * 2; const double coeff_10 = -Xx + ((x_physical > EPS) ? SQR(phi) * gtuyy / x_physical : 0.0); const double coeff_01 = -Xz; const double coeff_00 = -k2; //const double coeff_00 = -(tr_ric + SQR(trk)); #if 1 for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; #if 0 double d2alpha = gtuxx * D2BASIS_X * BASIS_Z + gtuzz * BASIS_X * D2BASIS_Z + 2 * gtuxz * DBASIS_X * DBASIS_Z; if (x_physical > EPS) d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; else d2alpha += gtuyy * D2BASIS_X * BASIS_Z; double curv_term = (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi) * DBASIS_X * BASIS_Z + (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi) * BASIS_X * DBASIS_Z; double D2alpha = SQR(phi) * (d2alpha - curv_term); mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; #else mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + coeff_10 * DBASIS_X * BASIS_Z + coeff_01 * BASIS_X * DBASIS_Z + coeff_11 * DBASIS_X * DBASIS_Z + coeff_20 * D2BASIS_X * BASIS_Z + coeff_02 * BASIS_X * D2BASIS_Z; #endif } #else cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_10, ms->basis_val_10 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_01, ms->basis_val_01 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); #endif rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz; //rhs[idx_grid] = 0.0; //rhs[idx_grid] = tr_ric + SQR(trk) + betax * trk_dx + betaz * trk_dz; rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); //rhs_max = fabs(rhs[idx_grid]); } } //memcpy(rhs, ms->kij_kij, sizeof(*rhs) * ms->nb_colloc_points); //cblas_daxpy(ms->nb_colloc_points, 1.0, ms->interp_values[I_K], 1, rhs, 1); //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAX], 1, ms->interp_values[I_K_DX], 1, 1.0, rhs, 1); //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAZ], 1, ms->interp_values[I_K_DZ], 1, 1.0, rhs, 1); //*prhs_max = rhs[cblas_idamax(ms->nb_colloc_points, rhs, 1)]; *prhs_max = rhs_max; return 0; } static int construct_matrix_polar(MaximalSlicingContext *ms, double *mat, double *rhs, double *prhs_max) { int idx_coeff, idx_grid; #pragma omp parallel for for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++) for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) { const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] + ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] + ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] + ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] + ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] + ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx]; } //*prhs_max = ms->rhs[cblas_idamax(ms->nb_colloc_points, ms->rhs, 1)]; return 0; } static int construct_matrix_boundary(MaximalSlicingContext *ms, double *mat, double *rhs, double *prhs_max) { int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; double rhs_max = 0.0; #define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) #define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) #define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) #define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) #define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) #define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) #pragma omp parallel for for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; const double gtuxx = ms->metric_u[0][idx_grid]; const double gtuyy = ms->metric_u[1][idx_grid]; const double gtuzz = ms->metric_u[2][idx_grid]; const double gtuxz = ms->metric_u[4][idx_grid]; const double phi = ms->interp_values[I_PHI][idx_grid]; const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid]; const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid]; const double Xtx = ms->interp_values[I_XTX][idx_grid]; const double Xtz = ms->interp_values[I_XTZ][idx_grid]; const double k2 = ms->kij_kij[idx_grid]; const double trk = ms->interp_values[I_K][idx_grid]; const double trk_dx = ms->interp_values[I_K_DX][idx_grid]; const double trk_dz = ms->interp_values[I_K_DZ][idx_grid]; const double betax = ms->interp_values[I_BETAX][idx_grid]; const double betaz = ms->interp_values[I_BETAZ][idx_grid]; const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); const double coeff_20 = SQR(phi) * (gtuxx + ((fabs(x_physical) <= EPS) ? gtuyy : 0)); const double coeff_02 = SQR(phi) * gtuzz; const double coeff_11 = SQR(phi) * gtuxz * 2; const double coeff_10 = -Xx + ((fabs(x_physical) > EPS) ? (SQR(phi) * gtuyy / x_physical) : 0); const double coeff_01 = -Xz; const double coeff_00 = -k2; for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + coeff_10 * DBASIS_X * BASIS_Z + coeff_01 * BASIS_X * DBASIS_Z + coeff_11 * DBASIS_X * DBASIS_Z + coeff_20 * D2BASIS_X * BASIS_Z + coeff_02 * BASIS_X * D2BASIS_Z; } rhs[idx_grid] = 0.0; //rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz; //rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); //rhs_max = fabs(rhs[idx_grid]); } } #if 1 // z = 0 axis idx_grid_z = 0; for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; int idx_grid1 = (ms->nb_colloc_points_z - 2) * ms->nb_colloc_points_x + idx_grid_x; for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = BASIS_X * DBASIS_Z; } rhs[idx_grid1] = 0.0; } // x = 0 axis idx_grid_x = 0; for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; int idx_grid1 = idx_grid_z * ms->nb_colloc_points_x + ms->nb_colloc_points_x - 2; for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z; } rhs[idx_grid1] = 0.0; } // z = z_max axis idx_grid_z = ms->nb_colloc_points_z - 1; for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z]; int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z; //+ z_physical * BASIS_X * DBASIS_Z; } rhs[idx_grid] = 1.0; } // x = x_max axis idx_grid_x = ms->nb_colloc_points_x - 1; for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;// + x_physical * DBASIS_X * BASIS_Z; } rhs[idx_grid] = 1.0; } //idx_grid_x = 0; //idx_grid_z = 0; //{ // int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; // for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) // for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { // const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; // mat[idx_grid + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z + BASIS_X * DBASIS_Z; // } // rhs[idx_grid] = 0.0; //} //idx_grid_x = ms->nb_colloc_points_x - 1; //idx_grid_z = ms->nb_colloc_points_z - 1; //{ // CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z]; // CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; // CCTK_REAL r2 = SQR(z_physical) + SQR(x_physical); // int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; // for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) // for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { // const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; // mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z + 0.5 * r2 / z_physical * BASIS_X * DBASIS_Z + 0.5 * r2 / x_physical * DBASIS_X * BASIS_Z; // } // rhs[idx_grid] = 1.0; //} #endif *prhs_max = rhs_max; #if 0 fprintf(stderr, "matrix\n"); for (int i = 0; i < ms->nb_colloc_points; i++) { for (int j = 0; j < ms->nb_coeffs; j++) fprintf(stderr, "%+08.04g ", mat[i + ms->nb_colloc_points * j]); fprintf(stderr, "\n"); } fprintf(stderr, "rhs\n"); for (int i = 0; i < ms->nb_colloc_points; i++) fprintf(stderr, "%+08.04g ", rhs[i]); fprintf(stderr, "\n"); #endif return 0; } static int construct_matrix(MaximalSlicingContext *ms) { return construct_matrix_polar(ms, ms->mat, NULL, NULL); } /* interpolate the cactus gridfunctions onto the pseudospectral grid */ static int interp_geometry(MaximalSlicingContext *ms) { int ret; ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params, ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL, (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices, ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values); if (ret < 0) CCTK_WARN(0, "Error interpolating"); //CCTK_TimerStart("MaximalSlicingAxi_filter_input"); //{ // cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, // ms->nb_colloc_points, ARRAY_ELEMS(ms->interp_values), ms->nb_colloc_points, // 1.0, ms->input_filter, ms->nb_colloc_points, ms->interp_buffer_prefilter, ms->nb_colloc_points, // 0.0, ms->interp_buffer, ms->nb_colloc_points); //} //CCTK_TimerStop("MaximalSlicingAxi_filter_input"); return 0; } static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max) { double rhs_max = 0.0; #pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max) for (int i = 0; i < ms->nb_colloc_points; i++) { CCTK_REAL Am[3][3], gtu[3][3]; CCTK_REAL a2; CCTK_REAL gtxx = ms->interp_values[I_GTXX][i]; CCTK_REAL gtyy = ms->interp_values[I_GTYY][i]; CCTK_REAL gtzz = ms->interp_values[I_GTZZ][i]; CCTK_REAL gtxy = ms->interp_values[I_GTXY][i]; CCTK_REAL gtxz = ms->interp_values[I_GTXZ][i]; CCTK_REAL gtyz = ms->interp_values[I_GTYZ][i]; CCTK_REAL Atxx = ms->interp_values[I_ATXX][i]; CCTK_REAL Atyy = ms->interp_values[I_ATYY][i]; CCTK_REAL Atzz = ms->interp_values[I_ATZZ][i]; CCTK_REAL Atxy = ms->interp_values[I_ATXY][i]; CCTK_REAL Atxz = ms->interp_values[I_ATXZ][i]; CCTK_REAL Atyz = ms->interp_values[I_ATYZ][i]; CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz }, { Atxy, Atyy, Atyz }, { Atxz, Atyz, Atzz }}; CCTK_REAL trK = ms->interp_values[I_K][i]; CCTK_REAL Xtx = ms->interp_values[I_XTX][i]; CCTK_REAL Xtz = ms->interp_values[I_XTZ][i]; CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); // \tilde{γ}^{ij} gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; gtu[1][0] = gtu[0][1]; gtu[2][0] = gtu[0][2]; gtu[2][1] = gtu[1][2]; // \tilde{A}_{i}^j for (int j = 0; j < 3; j++) for (int k = 0; k < 3; k++) { double val = 0.0; for (int l = 0; l < 3; l++) val += gtu[j][l] * At[l][k]; Am[j][k] = val; } // K_{ij} K^{ij} a2 = 0.0; for (int j = 0; j < 3; j++) for (int k = 0; k < 3; k++) a2 += Am[j][k] * Am[k][j]; { double x = ms->interp_coords[0][i]; double z = ms->interp_coords[2][i]; const double gtuxx = gtu[0][0]; const double gtuyy = gtu[1][1]; const double gtuzz = gtu[2][2]; const double gtuxz = gtu[0][2]; const double phi = ms->interp_values[I_PHI][i]; const double phi_dx = ms->interp_values[I_PHI_DX][i]; const double phi_dz = ms->interp_values[I_PHI_DZ][i]; const double Xtx = ms->interp_values[I_XTX][i]; const double Xtz = ms->interp_values[I_XTZ][i]; const double k2 = a2 + SQR(trK) / 3.; const double trk_dx = ms->interp_values[I_K_DX][i]; const double trk_dz = ms->interp_values[I_K_DZ][i]; const double betax = ms->interp_values[I_BETAX][i]; const double betaz = ms->interp_values[I_BETAZ][i]; const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0)); ms->eq_coeff_02[i] = SQR(phi) * gtuzz; ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2; ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0); ms->eq_coeff_01[i] = -Xz; ms->eq_coeff_00[i] = -k2; ms->rhs[i] = k2 + trK + betax * trk_dx + betaz * trk_dz; rhs_max = MAX(rhs_max, fabs(ms->rhs[i])); } //ms->metric_u[0][i] = gtu[0][0]; //ms->metric_u[1][i] = gtu[1][1]; //ms->metric_u[2][i] = gtu[2][2]; //ms->metric_u[3][i] = gtu[0][1]; //ms->metric_u[4][i] = gtu[0][2]; //ms->metric_u[5][i] = gtu[1][2]; //ms->kij_kij[i] = a2 + SQR(trK) / 3.; } *prhs_max = rhs_max; return 0; } // based on the wikipedia article // and http://www.netlib.org/templates/matlab/bicgstab.m static int solve_bicgstab(BiCGSTABContext *ctx, const int N, double *mat, double *rhs, double *x) { const double rhs_norm = cblas_dnrm2(N, rhs, 1); double rho, rho_prev = 1.0; double omega = 1.0; double alpha = 1.0; double err; int i; double *k = ctx->k; double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; double *res = ctx->res, *res0 = ctx->res0; // initialize the residual memcpy(res, rhs, N * sizeof(*res)); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, mat, N, x, 1, 1.0, res, 1); memcpy(res0, res, N * sizeof(*res0)); memcpy(p, res, N * sizeof(*p)); #define MAXITER 16 #define TOL (1e-15) for (i = 0; i < MAXITER; i++) { rho = cblas_ddot(N, res, 1, res0, 1); if (i) { double beta = (rho / rho_prev) * (alpha / omega); cblas_daxpy(N, -omega, v, 1, p, 1); cblas_dscal(N, beta, p, 1); cblas_daxpy(N, 1, res, 1, p, 1); } cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, k, N, p, 1, 0.0, y, 1); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, mat, N, y, 1, 0.0, v, 1); alpha = rho / cblas_ddot(N, res0, 1, v, 1); cblas_daxpy(N, -alpha, v, 1, res, 1); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, k, N, res, 1, 0.0, z, 1); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, mat, N, z, 1, 0.0, t, 1); omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); cblas_daxpy(N, alpha, y, 1, x, 1); cblas_daxpy(N, omega, z, 1, x, 1); cblas_daxpy(N, -omega, t, 1, res, 1); err = cblas_dnrm2(N, res, 1) / rhs_norm; if (err < TOL) break; rho_prev = rho; } if (i == MAXITER) return -1; return i; } static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, const int N, double *mat, double *rhs, cl_mem ocl_x) { const double rhs_norm = cblas_dnrm2(N, rhs, 1); double rho, rho_prev = 1.0; double omega[2] = { 1.0 }; double alpha = 1.0; double err; int i; cl_event events[8]; // the matrix, rhs, k and x are assumed to be already uploaded clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); // initialize the residual clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, 1, &cl_q, 2, events, &events[2]); clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), 1, &events[2], &events[3]); clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), 1, &events[2], &events[4]); clWaitForEvents(5, events); // BARRIER #define MAXITER 16 #define TOL (1e-15) for (i = 0; i < MAXITER; i++) { clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]); clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, 1, &events[0], NULL); // BARRIER if (i) { double beta = (rho / rho_prev) * (alpha / omega[0]); clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, 1, &cl_q, 0, NULL, &events[0]); clblasDscal(N, beta, ctx->cl_p, 0, 1, 1, &cl_q, 1, &events[0], &events[1]); clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, 1, &cl_q, 1, &events[1], &events[0]); clWaitForEvents(1, &events[0]); // BARRIER } clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, 1, &cl_q, 0, NULL, &events[0]); clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, 1, &cl_q, 1, &events[0], &events[1]); clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]); clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, 1, &events[0], NULL); // BARRIER alpha = rho / alpha; clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, 1, &cl_q, 0, NULL, &events[0]); clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, 1, &cl_q, 1, &events[0], &events[1]); clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, 1, &cl_q, 1, &events[1], &events[0]); clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]); clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]); clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, 2, &events[1], NULL); // BARRIER omega[0] /= omega[1]; clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1, 1, &cl_q, 0, NULL, &events[0]); clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1, 1, &cl_q, 1, &events[0], &events[1]); clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, 1, &cl_q, 0, NULL, &events[0]); clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]); clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, 1, &events[2], NULL); clWaitForEvents(1, &events[1]); // BARRIER if (err < TOL) break; rho_prev = rho; } if (i == MAXITER) return -1; return i; } static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) { char equed = 'N'; double cond, ferr, berr, rpivot; double *mat_f, *x; int ret = 0; #if 1 LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, mat, N, ipiv, rhs, N); LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); #else mat_f = malloc(SQR(N) * sizeof(*mat_f)); x = malloc(N * sizeof(*x)); //{ // int i, j; // for (i = 0; i < N; i++) { // for (j = 0; j < N; j++) // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); // fprintf(stderr, "\n"); // } //} { double *mat_copy = malloc(SQR(N) * sizeof(double)); double *svd = malloc(N * sizeof(double)); double *rhs_copy = malloc(N * sizeof(double)); int rank; memcpy(mat_copy, mat, SQR(N) * sizeof(double)); memcpy(rhs_copy, rhs, N * sizeof(double)); LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, svd, 1e-13, &rank); free(mat_copy); for (int i = 0; i < N; i++) { if (i > 5 && i < N - 5) continue; fprintf(stderr, "%g\t", svd[i]); } fprintf(stderr, "\n rank %d\n", rank); free(svd); free(rhs_copy); if (rank < N) ret = 1; } //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, // mat, N, ipiv, rhs, N); LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, mat, N, mat_f, N, ipiv, &equed, NULL, NULL, rhs, N, x, N, &cond, &ferr, &berr, &rpivot); LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); memcpy(rhs, x, N * sizeof(double)); memcpy(mat, mat_f, SQR(N) * sizeof(double)); fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " "condition number %16.16g; forward error %16.16g backward error %16.16g\n", N, N, cond, ferr, berr); free(mat_f); free(x); #endif return ret; } /* * Solve the equation * D²α - KᵢⱼKⁱʲα = -K * for the coefficients of spectral approximation of α: * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z) * where i = { 0, ... , ms->nb_coeffs_x }; * j = { 0, ... , ms->nb_coeffs_z }; * Tᵢ(x) are defined by ms->basis. */ int msa_maximal_solve(MaximalSlicingContext *ms) { const int N = ms->nb_coeffs; double rhs_max; int64_t start; int ret = 0; /* interpolate the metric values and construct the quantities we'll need */ CCTK_TimerStart("MaximalSlicingAxi_interp_geometry"); start = gettime(); ret = interp_geometry(ms); ms->interp_geometry_time += gettime() - start; ms->interp_geometry_count++; CCTK_TimerStop("MaximalSlicingAxi_interp_geometry"); if (ret < 0) return ret; CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs"); start = gettime(); ret = calc_eq_coeffs(ms, &rhs_max); ms->calc_eq_coeffs_time += gettime() - start; ms->calc_eq_coeffs_count++; CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs"); if (ret < 0) return ret; /* fill the matrix */ CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); start = gettime(); ret = construct_matrix(ms); ms->construct_matrix_time += gettime() - start; ms->construct_matrix_count++; CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); if (ret < 0) return ret; #if 1 if (rhs_max < EPS) { memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); if (ms->cl_queue) { clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), ms->coeffs, 0, NULL, NULL); } return 0; } #endif /* solve for the coeffs */ if (ms->steps_since_inverse < 1024) { BiCGSTABContext *b = &ms->bicgstab; int64_t start = gettime(); CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB"); if (ms->cl_queue) { ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs); clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N, ms->coeffs, 0, NULL, NULL); } else ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs); CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); if (ret >= 0) { ms->cg_time_total += gettime() - start; ms->cg_solve_count++; ms->cg_iter_count += ret + 1; ms->steps_since_inverse++; #if 0 { double min, max; gsl_vector_memcpy(b->y, ms->rhs); cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0, ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1); gsl_vector_minmax(b->y, &min, &max); if (fabs(min) > 1e-11 || fabs(max) > 1e-11) abort(); } #endif } } else ret = -1; if (ret < 0) { double *tmpv; double *tmpm; int64_t start; CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); start = gettime(); ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); ms->lu_solves_time += gettime() - start; ms->lu_solves_count++; CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); tmpv = ms->coeffs; ms->coeffs = ms->rhs; ms->rhs = tmpv; tmpm = ms->mat; ms->mat = ms->bicgstab.k; ms->bicgstab.k = tmpm; if (ret == 1) { memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); ms->coeffs[0] = 1.0; } if (ms->cl_queue) { cl_event events[2]; clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), ms->bicgstab.k, 0, NULL, &events[0]); clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double), ms->coeffs, 0, NULL, &events[1]); clWaitForEvents(2, events); } ms->steps_since_inverse = 0; } for (int i = 0; i < N; i++) ms->coeffs[i] *= ms->coeff_scale[i]; //fprintf(stderr, "solve %g %g\n", ms->gh->cctk_time, ms->coeffs[0]); return ret; }