diff options
Diffstat (limited to 'src/solve.c')
-rw-r--r-- | src/solve.c | 463 |
1 files changed, 406 insertions, 57 deletions
diff --git a/src/solve.c b/src/solve.c index 610dc0d..a1bee5e 100644 --- a/src/solve.c +++ b/src/solve.c @@ -1,4 +1,5 @@ +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -11,7 +12,7 @@ #include "maximal_slicing_axi.h" -static int construct_matrix(MaximalSlicingContext *ms, double *mat, +static int construct_matrix_cylindric(MaximalSlicingContext *ms, double *mat, double *rhs, double *prhs_max) { int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; @@ -29,7 +30,7 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, #pragma omp parallel for reduction(max : rhs_max) for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) { for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) { - CCTK_REAL x_physical = ms->grid_x[idx_grid_x]; + CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; const double gtuxx = ms->metric_u[0][idx_grid]; @@ -56,47 +57,48 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); + //const double tr_ric = ms->interp_values[I_TR_R][idx_grid]; + + const double coeff_20 = SQR(phi) * (gtuxx + ((x_physical <= EPS) ? gtuyy : 0.0)); const double coeff_02 = SQR(phi) * gtuzz; const double coeff_11 = SQR(phi) * gtuxz * 2; - const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical; + const double coeff_10 = -Xx + ((x_physical > EPS) ? SQR(phi) * gtuyy / x_physical : 0.0); const double coeff_01 = -Xz; + const double coeff_00 = -k2; + //const double coeff_00 = -(tr_ric + SQR(trk)); #if 1 for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; - //double d2alpha = gtuxx * D2BASIS_X * BASIS_Z - // + gtuzz * BASIS_X * D2BASIS_Z - // + 2 * gtuxz * DBASIS_X * DBASIS_Z; - //if (x_physical > EPS) - // d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; - //else - // d2alpha += gtuyy * D2BASIS_X * BASIS_Z; +#if 0 + double d2alpha = gtuxx * D2BASIS_X * BASIS_Z + + gtuzz * BASIS_X * D2BASIS_Z + + 2 * gtuxz * DBASIS_X * DBASIS_Z; + if (x_physical > EPS) + d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; + else + d2alpha += gtuyy * D2BASIS_X * BASIS_Z; - //double curv_term = Xx * DBASIS_X * BASIS_Z + Xz * BASIS_X * DBASIS_Z; + double curv_term = (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi) * DBASIS_X * BASIS_Z + + (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi) * BASIS_X * DBASIS_Z; - //double D2alpha = SQR(phi) * d2alpha - curv_term; + double D2alpha = SQR(phi) * (d2alpha - curv_term); - //mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; +#else mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + coeff_10 * DBASIS_X * BASIS_Z + coeff_01 * BASIS_X * DBASIS_Z + coeff_11 * DBASIS_X * DBASIS_Z + coeff_20 * D2BASIS_X * BASIS_Z + coeff_02 * BASIS_X * D2BASIS_Z; +#endif } #else - - const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); - const double coeff_02 = SQR(phi) * gtuzz; - const double coeff_11 = SQR(phi) * gtuxz * 2; - const double coeff_10 = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi + (x_physical > EPS) * gtuyy); - const double coeff_01 = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - const double coeff_00 = -k2; cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); @@ -105,8 +107,11 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); #endif - rhs[idx_grid] = k2 + trk ;// betax * trk_dx + betaz * trk_dz; - //rhs[idx_grid] = k2; + rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz; + //rhs[idx_grid] = 0.0; + + //rhs[idx_grid] = tr_ric + SQR(trk) + betax * trk_dx + betaz * trk_dz; + rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); //rhs_max = fabs(rhs[idx_grid]); } @@ -123,8 +128,220 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, return 0; } +static int construct_matrix_polar(MaximalSlicingContext *ms, double *mat, + double *rhs, double *prhs_max) +{ + int idx_coeff, idx_grid; + +#pragma omp parallel for + for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++) + for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) { + const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; + + mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] + + ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] + + ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] + + ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] + + ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] + + ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx]; + } + + //*prhs_max = ms->rhs[cblas_idamax(ms->nb_colloc_points, ms->rhs, 1)]; + + return 0; +} + +static int construct_matrix_boundary(MaximalSlicingContext *ms, double *mat, + double *rhs, double *prhs_max) +{ + int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; + double rhs_max = 0.0; + +#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) +#define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) +#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) + +#pragma omp parallel for + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + const double gtuxx = ms->metric_u[0][idx_grid]; + const double gtuyy = ms->metric_u[1][idx_grid]; + const double gtuzz = ms->metric_u[2][idx_grid]; + const double gtuxz = ms->metric_u[4][idx_grid]; + + const double phi = ms->interp_values[I_PHI][idx_grid]; + const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid]; + const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid]; + + const double Xtx = ms->interp_values[I_XTX][idx_grid]; + const double Xtz = ms->interp_values[I_XTZ][idx_grid]; + + const double k2 = ms->kij_kij[idx_grid]; + const double trk = ms->interp_values[I_K][idx_grid]; + + const double trk_dx = ms->interp_values[I_K_DX][idx_grid]; + const double trk_dz = ms->interp_values[I_K_DZ][idx_grid]; + + const double betax = ms->interp_values[I_BETAX][idx_grid]; + const double betaz = ms->interp_values[I_BETAZ][idx_grid]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + const double coeff_20 = SQR(phi) * (gtuxx + ((fabs(x_physical) <= EPS) ? gtuyy : 0)); + const double coeff_02 = SQR(phi) * gtuzz; + const double coeff_11 = SQR(phi) * gtuxz * 2; + const double coeff_10 = -Xx + ((fabs(x_physical) > EPS) ? (SQR(phi) * gtuyy / x_physical) : 0); + const double coeff_01 = -Xz; + + const double coeff_00 = -k2; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + + coeff_10 * DBASIS_X * BASIS_Z + + coeff_01 * BASIS_X * DBASIS_Z + + coeff_11 * DBASIS_X * DBASIS_Z + + coeff_20 * D2BASIS_X * BASIS_Z + + coeff_02 * BASIS_X * D2BASIS_Z; + } + + rhs[idx_grid] = 0.0; + //rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz; + + //rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); + //rhs_max = fabs(rhs[idx_grid]); + } + } + +#if 1 + // z = 0 axis + idx_grid_z = 0; + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + int idx_grid1 = (ms->nb_colloc_points_z - 2) * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = BASIS_X * DBASIS_Z; + } + rhs[idx_grid1] = 0.0; + } + + // x = 0 axis + idx_grid_x = 0; + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + int idx_grid1 = idx_grid_z * ms->nb_colloc_points_x + ms->nb_colloc_points_x - 2; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z; + } + rhs[idx_grid1] = 0.0; + } + + // z = z_max axis + idx_grid_z = ms->nb_colloc_points_z - 1; + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z]; + + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z; //+ z_physical * BASIS_X * DBASIS_Z; + } + rhs[idx_grid] = 1.0; + } + + // x = x_max axis + idx_grid_x = ms->nb_colloc_points_x - 1; + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; + + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;// + x_physical * DBASIS_X * BASIS_Z; + } + rhs[idx_grid] = 1.0; + } + + //idx_grid_x = 0; + //idx_grid_z = 0; + //{ + // int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + // for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + // for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + // const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + // mat[idx_grid + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z + BASIS_X * DBASIS_Z; + // } + // rhs[idx_grid] = 0.0; + //} + + //idx_grid_x = ms->nb_colloc_points_x - 1; + //idx_grid_z = ms->nb_colloc_points_z - 1; + //{ + // CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z]; + // CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; + // CCTK_REAL r2 = SQR(z_physical) + SQR(x_physical); + // int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + // for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + // for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + // const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + // mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z + 0.5 * r2 / z_physical * BASIS_X * DBASIS_Z + 0.5 * r2 / x_physical * DBASIS_X * BASIS_Z; + // } + // rhs[idx_grid] = 1.0; + //} +#endif + + *prhs_max = rhs_max; + +#if 0 + fprintf(stderr, "matrix\n"); + for (int i = 0; i < ms->nb_colloc_points; i++) { + for (int j = 0; j < ms->nb_coeffs; j++) + fprintf(stderr, "%+08.04g ", mat[i + ms->nb_colloc_points * j]); + fprintf(stderr, "\n"); + } + + fprintf(stderr, "rhs\n"); + for (int i = 0; i < ms->nb_colloc_points; i++) + fprintf(stderr, "%+08.04g ", rhs[i]); + fprintf(stderr, "\n"); +#endif -static int calc_geometry(MaximalSlicingContext *ms) + return 0; +} + +static int construct_matrix(MaximalSlicingContext *ms) +{ + return construct_matrix_polar(ms, ms->mat, NULL, NULL); +} +/* interpolate the cactus gridfunctions onto the pseudospectral grid */ +static int interp_geometry(MaximalSlicingContext *ms) { int ret; @@ -135,7 +352,22 @@ static int calc_geometry(MaximalSlicingContext *ms) if (ret < 0) CCTK_WARN(0, "Error interpolating"); -#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) + //CCTK_TimerStart("MaximalSlicingAxi_filter_input"); + //{ + // cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + // ms->nb_colloc_points, ARRAY_ELEMS(ms->interp_values), ms->nb_colloc_points, + // 1.0, ms->input_filter, ms->nb_colloc_points, ms->interp_buffer_prefilter, ms->nb_colloc_points, + // 0.0, ms->interp_buffer, ms->nb_colloc_points); + //} + //CCTK_TimerStop("MaximalSlicingAxi_filter_input"); + + return 0; +} + +static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max) +{ + double rhs_max = 0.0; +#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max) for (int i = 0; i < ms->nb_colloc_points; i++) { CCTK_REAL Am[3][3], gtu[3][3]; CCTK_REAL a2; @@ -191,16 +423,56 @@ static int calc_geometry(MaximalSlicingContext *ms) for (int k = 0; k < 3; k++) a2 += Am[j][k] * Am[k][j]; - ms->metric_u[0][i] = gtu[0][0]; - ms->metric_u[1][i] = gtu[1][1]; - ms->metric_u[2][i] = gtu[2][2]; - ms->metric_u[3][i] = gtu[0][1]; - ms->metric_u[4][i] = gtu[0][2]; - ms->metric_u[5][i] = gtu[1][2]; + { + double x = ms->interp_coords[0][i]; + double z = ms->interp_coords[2][i]; + + const double gtuxx = gtu[0][0]; + const double gtuyy = gtu[1][1]; + const double gtuzz = gtu[2][2]; + const double gtuxz = gtu[0][2]; + + const double phi = ms->interp_values[I_PHI][i]; + const double phi_dx = ms->interp_values[I_PHI_DX][i]; + const double phi_dz = ms->interp_values[I_PHI_DZ][i]; + + const double Xtx = ms->interp_values[I_XTX][i]; + const double Xtz = ms->interp_values[I_XTZ][i]; + + const double k2 = a2 + SQR(trK) / 3.; + + const double trk_dx = ms->interp_values[I_K_DX][i]; + const double trk_dz = ms->interp_values[I_K_DZ][i]; + + const double betax = ms->interp_values[I_BETAX][i]; + const double betaz = ms->interp_values[I_BETAZ][i]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0)); + ms->eq_coeff_02[i] = SQR(phi) * gtuzz; + ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2; + ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0); + ms->eq_coeff_01[i] = -Xz; + ms->eq_coeff_00[i] = -k2; + ms->rhs[i] = k2 + trK + betax * trk_dx + betaz * trk_dz; + + rhs_max = MAX(rhs_max, fabs(ms->rhs[i])); + } + + //ms->metric_u[0][i] = gtu[0][0]; + //ms->metric_u[1][i] = gtu[1][1]; + //ms->metric_u[2][i] = gtu[2][2]; + //ms->metric_u[3][i] = gtu[0][1]; + //ms->metric_u[4][i] = gtu[0][2]; + //ms->metric_u[5][i] = gtu[1][2]; - ms->kij_kij[i] = a2 + SQR(trK) / 3.; + //ms->kij_kij[i] = a2 + SQR(trK) / 3.; } + *prhs_max = rhs_max; + return 0; } @@ -274,8 +546,6 @@ static int solve_bicgstab(BiCGSTABContext *ctx, const int N, if (i == MAXITER) return -1; - ctx->iter_total += i + 1; - return i; } @@ -293,12 +563,9 @@ static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, cl_event events[8]; - // upload the matrix and the RHS to the GPU - // k and x are assumed to be already uploaded - clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), - rhs, 0, NULL, &events[0]); - clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), - mat, 0, NULL, &events[1]); + // the matrix, rhs, k and x are assumed to be already uploaded + clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); // initialize the residual clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, @@ -393,18 +660,77 @@ static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, if (i == MAXITER) return -1; - ctx->iter_total += i + 1; - return i; } static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) { + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; +#if 1 LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, mat, N, ipiv, rhs, N); LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + { + double *mat_copy = malloc(SQR(N) * sizeof(double)); + double *svd = malloc(N * sizeof(double)); + double *rhs_copy = malloc(N * sizeof(double)); + int rank; + + memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + memcpy(rhs_copy, rhs, N * sizeof(double)); + + LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + svd, 1e-13, &rank); + + free(mat_copy); + for (int i = 0; i < N; i++) { + if (i > 5 && i < N - 5) + continue; + + fprintf(stderr, "%g\t", svd[i]); + } + fprintf(stderr, "\n rank %d\n", rank); + free(svd); + free(rhs_copy); - return 0; + if (rank < N) + ret = 1; + } + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; } /* @@ -420,23 +746,40 @@ int msa_maximal_solve(MaximalSlicingContext *ms) { const int N = ms->nb_coeffs; double rhs_max; + int64_t start; int ret = 0; /* interpolate the metric values and construct the quantities we'll need */ - CCTK_TimerStart("MaximalSlicingAxi_calc_geometry"); - ret = calc_geometry(ms); - CCTK_TimerStop("MaximalSlicingAxi_calc_geometry"); + CCTK_TimerStart("MaximalSlicingAxi_interp_geometry"); + start = gettime(); + ret = interp_geometry(ms); + ms->interp_geometry_time += gettime() - start; + ms->interp_geometry_count++; + CCTK_TimerStop("MaximalSlicingAxi_interp_geometry"); + if (ret < 0) + return ret; + + CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs"); + start = gettime(); + ret = calc_eq_coeffs(ms, &rhs_max); + ms->calc_eq_coeffs_time += gettime() - start; + ms->calc_eq_coeffs_count++; + CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs"); if (ret < 0) return ret; /* fill the matrix */ CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); - ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max); + start = gettime(); + ret = construct_matrix(ms); + ms->construct_matrix_time += gettime() - start; + ms->construct_matrix_count++; CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); if (ret < 0) return ret; +#if 1 if (rhs_max < EPS) { memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); if (ms->cl_queue) { @@ -445,9 +788,10 @@ int msa_maximal_solve(MaximalSlicingContext *ms) } return 0; } +#endif /* solve for the coeffs */ - if (ms->steps_since_inverse < 128) { + if (ms->steps_since_inverse < 1024) { BiCGSTABContext *b = &ms->bicgstab; int64_t start = gettime(); @@ -461,15 +805,11 @@ int msa_maximal_solve(MaximalSlicingContext *ms) CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); if (ret >= 0) { - b->time_total += gettime() - start; - b->solve_total++; + ms->cg_time_total += gettime() - start; + ms->cg_solve_count++; + ms->cg_iter_count += ret + 1; ms->steps_since_inverse++; - if (!(b->solve_total & 127)) { - fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n", - b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total); - fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total); - } #if 0 { double min, max; @@ -493,9 +833,9 @@ int msa_maximal_solve(MaximalSlicingContext *ms) CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); start = gettime(); - lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); + ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); ms->lu_solves_time += gettime() - start; - ms->lu_solves_total++; + ms->lu_solves_count++; CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); tmpv = ms->coeffs; @@ -506,6 +846,11 @@ int msa_maximal_solve(MaximalSlicingContext *ms) ms->mat = ms->bicgstab.k; ms->bicgstab.k = tmpm; + if (ret == 1) { + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + ms->coeffs[0] = 1.0; + } + if (ms->cl_queue) { cl_event events[2]; clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), @@ -518,6 +863,10 @@ int msa_maximal_solve(MaximalSlicingContext *ms) ms->steps_since_inverse = 0; } + for (int i = 0; i < N; i++) + ms->coeffs[i] *= ms->coeff_scale[i]; + + //fprintf(stderr, "solve %g %g\n", ms->gh->cctk_time, ms->coeffs[0]); return ret; } |