diff options
Diffstat (limited to 'src/solve.c')
-rw-r--r-- | src/solve.c | 636 |
1 files changed, 0 insertions, 636 deletions
diff --git a/src/solve.c b/src/solve.c deleted file mode 100644 index 0bd56e2..0000000 --- a/src/solve.c +++ /dev/null @@ -1,636 +0,0 @@ - -#include <math.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <cblas.h> -#include <lapacke.h> - -#include <cl.h> -#include <clBLAS.h> - -#include "qms.h" - -static int construct_matrix(MaximalSlicingContext *ms) -{ - double *mat = ms->mat; - int idx_coeff, idx_grid; - -#pragma omp parallel for - for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++) - for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) { - const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; - - mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] + - ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] + - ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] + - ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] + - ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] + - ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx]; - } - - return 0; -} - -/* interpolate the cactus gridfunctions onto the pseudospectral grid */ -static int interp_geometry(MaximalSlicingContext *ms) -{ - int ret; - - ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params, - ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL, - (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices, - ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values_prefilter); - if (ret < 0) - CCTK_WARN(0, "Error interpolating"); - - CCTK_TimerStart("MaximalSlicingAxi_filter_input"); - for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) { - cblas_dgemv(CblasColMajor, CblasNoTrans, ms->nb_colloc_points, ms->nb_colloc_points, 1.0, - ms->input_filter, ms->nb_colloc_points, ms->interp_values_prefilter[i], 1, 0.0, - ms->interp_values[i], 1); - } - CCTK_TimerStop("MaximalSlicingAxi_filter_input"); - - return 0; -} - -static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max) -{ - double rhs_max = 0.0; -//#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max) - for (int i = 0; i < ms->nb_colloc_points; i++) { - const double x = ms->interp_coords[0][i]; - const double z = ms->interp_coords[2][i]; - const int zaxis = x <= EPS; - - double Am[3][3], K[3][3], Km[3][3], Ku[3][3], gtu[3][3]; - double k2, kij_dij_alpha, k_kdot, k3; - - const double gtxx = ms->interp_values[I_GTXX][i]; - const double gtyy = ms->interp_values[I_GTYY][i]; - const double gtzz = ms->interp_values[I_GTZZ][i]; - const double gtxy = ms->interp_values[I_GTXY][i]; - const double gtxz = ms->interp_values[I_GTXZ][i]; - const double gtyz = ms->interp_values[I_GTYZ][i]; - - const double gt[3][3] = {{ gtxx, gtxy, gtxz }, - { gtxy, gtyy, gtyz }, - { gtxz, gtyz, gtzz }}; - - const double Atxx = ms->interp_values[I_ATXX][i]; - const double Atyy = ms->interp_values[I_ATYY][i]; - const double Atzz = ms->interp_values[I_ATZZ][i]; - const double Atxy = ms->interp_values[I_ATXY][i]; - const double Atxz = ms->interp_values[I_ATXZ][i]; - const double Atyz = ms->interp_values[I_ATYZ][i]; - - const double phi = ms->interp_values[I_PHI][i]; - - const double phidot = ms->interp_values[I_PHIDOT][i]; - const double phidot_dx = ms->interp_values[I_PHIDOT_DX][i]; - const double phidot_dz = ms->interp_values[I_PHIDOT_DZ][i]; - - const double At[3][3] = {{ Atxx, Atxy, Atxz }, - { Atxy, Atyy, Atyz }, - { Atxz, Atyz, Atzz }}; - - const double trK = ms->interp_values[I_K][i]; - const double kdot_xx = ms->interp_values[I_KDOT_XX][i]; - const double kdot_yy = ms->interp_values[I_KDOT_YY][i]; - const double kdot_zz = ms->interp_values[I_KDOT_ZZ][i]; - const double kdot_xy = ms->interp_values[I_KDOT_XY][i]; - const double kdot_xz = ms->interp_values[I_KDOT_XZ][i]; - const double kdot_yz = ms->interp_values[I_KDOT_YZ][i]; - - const double kdot[3][3] = {{ kdot_xx, kdot_xy, kdot_xz }, - { kdot_xy, kdot_yy, kdot_yz }, - { kdot_xz, kdot_yz, kdot_zz }}; - - const double alpha = ms->interp_values[I_ALPHA][i]; - const double dx_alpha = ms->interp_values[I_ALPHA_DX][i]; - const double dz_alpha = ms->interp_values[I_ALPHA_DZ][i]; - const double dxx_alpha = ms->interp_values[I_ALPHA_DXX][i]; - const double dzz_alpha = ms->interp_values[I_ALPHA_DZZ][i]; - const double dxz_alpha = ms->interp_values[I_ALPHA_DXZ][i]; - - const double dij_alpha[3][3] = {{ dxx_alpha, 0, dxz_alpha }, - { 0, zaxis ? dxx_alpha : dx_alpha / x, 0 }, - { dxz_alpha, 0, dzz_alpha }}; - - const double Xtx = ms->interp_values[I_XTX][i]; - const double Xtz = ms->interp_values[I_XTZ][i]; - - const double Xtdot_x = ms->interp_values[I_XTDOT_X][i]; - const double Xtdot_z = ms->interp_values[I_XTDOT_Z][i]; - - const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); - - // \tilde{γ}^{ij} - gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; - gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; - gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; - gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; - gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; - gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; - gtu[1][0] = gtu[0][1]; - gtu[2][0] = gtu[0][2]; - gtu[2][1] = gtu[1][2]; - - // K_{ij} - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) - K[j][k] = At[j][k] / SQR(phi) + gt[j][k] * trK; - - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) { - double val = 0.0; - for (int l = 0; l < 3; l++) - val += SQR(phi) * gtu[j][l] * K[l][k]; - Km[j][k] = val; - } - - // K^{ij} - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) { - double val = 0.0; - for (int l = 0; l < 3; l++) - val += SQR(phi) * gtu[j][l] * Km[k][l]; - Ku[j][k] = val; - } - - // \tilde{A}_{i}^j - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) { - double val = 0.0; - for (int l = 0; l < 3; l++) - val += gtu[j][l] * At[l][k]; - Am[j][k] = val; - } - - kij_dij_alpha = 0.0; - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) - kij_dij_alpha += Ku[j][k] * dij_alpha[j][k]; - - k_kdot = 0.0; - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) - k_kdot += kdot[j][k] * Ku[j][k]; - - k3 = 0.0; - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) { - double val = 0.0; - for (int l = 0; l < 3; l++) - val += Km[k][l] * Ku[l][j]; - k3 += val * K[j][k]; - } - - // K_{ij} K^{ij} - k2 = 0.0; - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) - k2 += Km[j][k] * Km[k][j]; - - { - const double gtuxx = gtu[0][0]; - const double gtuyy = gtu[1][1]; - const double gtuzz = gtu[2][2]; - const double gtuxz = gtu[0][2]; - - const double phi_dx = ms->interp_values[I_PHI_DX][i]; - const double phi_dz = ms->interp_values[I_PHI_DZ][i]; - - const double Xtx = ms->interp_values[I_XTX][i]; - const double Xtz = ms->interp_values[I_XTZ][i]; - - //const double k2 = a2 + SQR(trK) / 3.; - - const double trk_dx = ms->interp_values[I_K_DX][i]; - const double trk_dz = ms->interp_values[I_K_DZ][i]; - - const double betax = ms->interp_values[I_BETAX][i]; - const double betaz = ms->interp_values[I_BETAZ][i]; - - const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); - const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - - const double Xdot_x = 2 * phi * phidot * Xtx + SQR(phi) * Xtdot_x + phi * (phidot_dx * gtuxx + phidot_dz * gtuxz) - - phidot * (phi_dx * gtuxx + phi_dz * gtuxz) + 2 * alpha * (phi_dx * Ku[0][0] + phi_dz * Ku[0][2]) / phi; - const double Xdot_z = 2 * phi * phidot * Xtz + SQR(phi) * Xtdot_z + phi * (phidot_dz * gtuzz + phidot_dx * gtuxz) - - phidot * (phi_dz * gtuzz + phi_dx * gtuxz) + 2 * alpha * (phi_dz * Ku[2][2] + phi_dx * Ku[0][2]) / phi; - - ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0)); - ms->eq_coeff_02[i] = SQR(phi) * gtuzz; - ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2; - ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0); - ms->eq_coeff_01[i] = -Xz; - ms->eq_coeff_00[i] = -k2; - - ms->rhs[i] = -2 * alpha * kij_dij_alpha + Xdot_x * dx_alpha + Xdot_z * dz_alpha + - 2 * (k_kdot + 2 * alpha * k3) * alpha; - - rhs_max = MAX(rhs_max, fabs(ms->rhs[i])); - } - } - - *prhs_max = rhs_max; - - return 0; -} - -// based on the wikipedia article -// and http://www.netlib.org/templates/matlab/bicgstab.m -static int solve_bicgstab(BiCGSTABContext *ctx, const int N, - double *mat, double *rhs, double *x) -{ - const double rhs_norm = cblas_dnrm2(N, rhs, 1); - - double rho, rho_prev = 1.0; - double omega = 1.0; - double alpha = 1.0; - - double err; - int i; - - double *k = ctx->k; - double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; - double *res = ctx->res, *res0 = ctx->res0; - - // initialize the residual - memcpy(res, rhs, N * sizeof(*res)); - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, - mat, N, x, 1, 1.0, res, 1); - - memcpy(res0, res, N * sizeof(*res0)); - memcpy(p, res, N * sizeof(*p)); - -#define MAXITER 16 -#define TOL (1e-15) - for (i = 0; i < MAXITER; i++) { - rho = cblas_ddot(N, res, 1, res0, 1); - - if (i) { - double beta = (rho / rho_prev) * (alpha / omega); - - cblas_daxpy(N, -omega, v, 1, p, 1); - cblas_dscal(N, beta, p, 1); - cblas_daxpy(N, 1, res, 1, p, 1); - } - - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - k, N, p, 1, 0.0, y, 1); - - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - mat, N, y, 1, 0.0, v, 1); - - alpha = rho / cblas_ddot(N, res0, 1, v, 1); - - cblas_daxpy(N, -alpha, v, 1, res, 1); - - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - k, N, res, 1, 0.0, z, 1); - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - mat, N, z, 1, 0.0, t, 1); - - omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); - - cblas_daxpy(N, alpha, y, 1, x, 1); - cblas_daxpy(N, omega, z, 1, x, 1); - - cblas_daxpy(N, -omega, t, 1, res, 1); - - err = cblas_dnrm2(N, res, 1) / rhs_norm; - if (err < TOL) - break; - - rho_prev = rho; - } - if (i == MAXITER) - return -1; - - return i; -} - -static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, - const int N, double *mat, double *rhs, cl_mem ocl_x) -{ - const double rhs_norm = cblas_dnrm2(N, rhs, 1); - - double rho, rho_prev = 1.0; - double omega[2] = { 1.0 }; - double alpha = 1.0; - - double err; - int i; - - cl_event events[8]; - - // the matrix, rhs, k and x are assumed to be already uploaded - clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); - clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); - - // initialize the residual - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, - ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, - 1, &cl_q, 2, events, &events[2]); - clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), - 1, &events[2], &events[3]); - clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), - 1, &events[2], &events[4]); - - clWaitForEvents(5, events); - // BARRIER - -#define MAXITER 16 -#define TOL (1e-15) - for (i = 0; i < MAXITER; i++) { - clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, - ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]); - clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, - 1, &events[0], NULL); - // BARRIER - - if (i) { - double beta = (rho / rho_prev) * (alpha / omega[0]); - - clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - clblasDscal(N, beta, ctx->cl_p, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, - 1, &cl_q, 1, &events[1], &events[0]); - clWaitForEvents(1, &events[0]); - // BARRIER - } - - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - - clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, - ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]); - clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, - 1, &events[0], NULL); - // BARRIER - - alpha = rho / alpha; - - clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, - 1, &cl_q, 1, &events[1], &events[0]); - - clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, - ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]); - clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, - ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]); - - clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, - 2, &events[1], NULL); - // BARRIER - - omega[0] /= omega[1]; - - clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - - clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, - 1, &cl_q, 1, &events[0], &events[2]); - clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, - 1, &events[2], NULL); - clWaitForEvents(1, &events[1]); - // BARRIER - - if (err < TOL) - break; - - rho_prev = rho; - } - if (i == MAXITER) - return -1; - - return i; -} - -static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) -{ - char equed = 'N'; - double cond, ferr, berr, rpivot; - - double *mat_f, *x; - int ret = 0; -#if 1 - LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, - mat, N, ipiv, rhs, N); - LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); -#else - mat_f = malloc(SQR(N) * sizeof(*mat_f)); - x = malloc(N * sizeof(*x)); - - //{ - // int i, j; - // for (i = 0; i < N; i++) { - // for (j = 0; j < N; j++) - // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); - // fprintf(stderr, "\n"); - // } - //} - //{ - // double *mat_copy = malloc(SQR(N) * sizeof(double)); - // double *svd = malloc(N * sizeof(double)); - // double *rhs_copy = malloc(N * sizeof(double)); - // int rank; - - // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); - // memcpy(rhs_copy, rhs, N * sizeof(double)); - - // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, - // svd, 1e-13, &rank); - - // free(mat_copy); - // for (int i = 0; i < N; i++) { - // if (i > 5 && i < N - 5) - // continue; - - // fprintf(stderr, "%g\t", svd[i]); - // } - // fprintf(stderr, "\n rank %d\n", rank); - // free(svd); - // free(rhs_copy); - - // if (rank < N) - // ret = 1; - //} - - //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, - // mat, N, ipiv, rhs, N); - LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, - mat, N, mat_f, N, ipiv, &equed, NULL, NULL, - rhs, N, x, N, &cond, &ferr, &berr, &rpivot); - LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); - memcpy(rhs, x, N * sizeof(double)); - memcpy(mat, mat_f, SQR(N) * sizeof(double)); - - fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " - "condition number %16.16g; forward error %16.16g backward error %16.16g\n", - N, N, cond, ferr, berr); - - free(mat_f); - free(x); -#endif - - return ret; -} - -/* - * Solve the equation - * D²α - KᵢⱼKⁱʲα = -K - * for the coefficients of spectral approximation of α: - * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z) - * where i = { 0, ... , ms->nb_coeffs_x }; - * j = { 0, ... , ms->nb_coeffs_z }; - * Tᵢ(x) are defined by ms->basis. - */ -int qms_maximal_solve(MaximalSlicingContext *ms) -{ - const int N = ms->nb_coeffs; - double rhs_max; - int64_t start; - - int ret = 0; - - /* interpolate the metric values and construct the quantities we'll need */ - CCTK_TimerStart("MaximalSlicingAxi_interp_geometry"); - start = gettime(); - ret = interp_geometry(ms); - ms->interp_geometry_time += gettime() - start; - ms->interp_geometry_count++; - CCTK_TimerStop("MaximalSlicingAxi_interp_geometry"); - if (ret < 0) - return ret; - - CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs"); - start = gettime(); - ret = calc_eq_coeffs(ms, &rhs_max); - ms->calc_eq_coeffs_time += gettime() - start; - ms->calc_eq_coeffs_count++; - CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs"); - if (ret < 0) - return ret; - - /* fill the matrix */ - CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); - start = gettime(); - ret = construct_matrix(ms); - ms->construct_matrix_time += gettime() - start; - ms->construct_matrix_count++; - CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); - if (ret < 0) - return ret; - -#if 1 - if (rhs_max < EPS) { - fprintf(stderr, "zero rhs\n"); - memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); - if (ms->cl_queue) { - clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), - ms->coeffs, 0, NULL, NULL); - } - return 0; - } -#endif - - /* solve for the coeffs */ - if (ms->steps_since_inverse < 1024) { - BiCGSTABContext *b = &ms->bicgstab; - int64_t start = gettime(); - - CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB"); - if (ms->cl_queue) { - ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs); - clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N, - ms->coeffs, 0, NULL, NULL); - } else - ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs); - CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); - - if (ret >= 0) { - ms->cg_time_total += gettime() - start; - ms->cg_solve_count++; - ms->cg_iter_count += ret + 1; - ms->steps_since_inverse++; - -#if 0 - { - double min, max; - gsl_vector_memcpy(b->y, ms->rhs); - cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0, - ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1); - gsl_vector_minmax(b->y, &min, &max); - if (fabs(min) > 1e-11 || fabs(max) > 1e-11) - abort(); - } -#endif - } - } else - ret = -1; - - if (ret < 0) { - double *tmpv; - double *tmpm; - int64_t start; - - CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); - start = gettime(); - - ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); - ms->lu_solves_time += gettime() - start; - ms->lu_solves_count++; - CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); - - tmpv = ms->coeffs; - ms->coeffs = ms->rhs; - ms->rhs = tmpv; - - tmpm = ms->mat; - ms->mat = ms->bicgstab.k; - ms->bicgstab.k = tmpm; - - if (ret == 1) { - memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); - ms->coeffs[0] = 1.0; - } - - if (ms->cl_queue) { - cl_event events[2]; - clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), - ms->bicgstab.k, 0, NULL, &events[0]); - clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double), - ms->coeffs, 0, NULL, &events[1]); - clWaitForEvents(2, events); - } - - ms->steps_since_inverse = 0; - } - - for (int i = 0; i < N; i++) - ms->coeffs[i] *= ms->coeff_scale[i]; - - return ret; -} |