summary refs log tree commit diff
path: root/src/solve.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/solve.c')
-rw-r--r-- src/solve.c 523
1 files changed, 523 insertions, 0 deletions
diff --git a/src/solve.c b/src/solve.c
new file mode 100644
index 0000000..610dc0d
--- /dev/null
+++ b/src/solve.c
@@ -0,0 +1,523 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include <cl.h>
+#include <clBLAS.h>
+
+#include "maximal_slicing_axi.h"
+
+/*
+ * Assemble the linear system for the spectral coefficients.
+ *
+ * For every collocation point (row idx_grid) and every basis function
+ * (column idx_coeff) the differential operator applied to that basis
+ * function is evaluated and written to mat, which is stored column-major
+ * with ms->nb_colloc_points rows.
+ *
+ * rhs receives one entry per collocation point and *prhs_max the largest
+ * absolute value among those entries.
+ *
+ * Always returns 0.
+ */
+static int construct_matrix(MaximalSlicingContext *ms, double *mat,
+                            double *rhs, double *prhs_max)
+{
+    double rhs_max = 0.0;
+
+/* these expand using the loop counters of the enclosing loops below */
+#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+#define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+
+    /*
+     * Every loop counter is declared in its own for statement so that each
+     * OpenMP thread works on a private copy. Counters declared at function
+     * scope would be shared between the threads of the parallel region and
+     * raced on.
+     */
+#pragma omp parallel for reduction(max : rhs_max)
+    for (int idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) {
+        for (int idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) {
+            const CCTK_REAL x_physical = ms->grid_x[idx_grid_x];
+            const int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+            /* inverse conformal metric components, as stored by calc_geometry() */
+            const double gtuxx = ms->metric_u[0][idx_grid];
+            const double gtuyy = ms->metric_u[1][idx_grid];
+            const double gtuzz = ms->metric_u[2][idx_grid];
+            const double gtuxz = ms->metric_u[4][idx_grid];
+
+            const double phi    = ms->interp_values[I_PHI][idx_grid];
+            const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid];
+            const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid];
+
+            const double Xtx = ms->interp_values[I_XTX][idx_grid];
+            const double Xtz = ms->interp_values[I_XTZ][idx_grid];
+
+            const double k2  = ms->kij_kij[idx_grid];
+            const double trk = ms->interp_values[I_K][idx_grid];
+
+            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+            /*
+             * coeff_ab multiplies the a-th x-derivative times the b-th
+             * z-derivative of the basis. The gtuyy contribution appears as a
+             * first derivative divided by x away from the axis and is folded
+             * into the second-derivative coefficient for x <= EPS, which
+             * avoids the division by x there.
+             */
+            const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy);
+            const double coeff_02 = SQR(phi) * gtuzz;
+            const double coeff_11 = SQR(phi) * gtuxz * 2;
+            const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical;
+            const double coeff_01 = -Xz;
+            const double coeff_00 = -k2;
+
+            for (int idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) {
+                for (int idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                    const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                    mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z +
+                                                                       coeff_10 * DBASIS_X * BASIS_Z +
+                                                                       coeff_01 * BASIS_X * DBASIS_Z +
+                                                                       coeff_11 * DBASIS_X * DBASIS_Z +
+                                                                       coeff_20 * D2BASIS_X * BASIS_Z +
+                                                                       coeff_02 * BASIS_X * D2BASIS_Z;
+                }
+            }
+
+            /* the shift advection term betax * trk_dx + betaz * trk_dz is
+             * currently not included in the right-hand side */
+            rhs[idx_grid] = k2 + trk;
+            rhs_max = MAX(rhs_max, fabs(rhs[idx_grid]));
+        }
+    }
+
+    *prhs_max = rhs_max;
+
+    return 0;
+}
+
+
+/*
+ * Interpolate the grid functions listed in ms->interp_vars_indices onto the
+ * collocation points and derive from them, for every point,
+ *   - the inverse of the conformal metric, written to ms->metric_u[0..5] in
+ *     the order xx, yy, zz, xy, xz, yz;
+ *   - the scalar stored in ms->kij_kij, i.e. the trace of the square of the
+ *     mixed-index conformal traceless extrinsic curvature plus K^2 / 3.
+ *
+ * An interpolation failure is reported through CCTK_WARN at level 0.
+ * Always returns 0.
+ */
+static int calc_geometry(MaximalSlicingContext *ms)
+{
+    const int interp_ret =
+        CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params,
+                              ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL,
+                              (const void * const *)ms->interp_coords,
+                              ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices,
+                              ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes,
+                              (void * const *)ms->interp_values);
+    if (interp_ret < 0)
+        CCTK_WARN(0, "Error interpolating");
+
+#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x)
+    for (int pt = 0; pt < ms->nb_colloc_points; pt++) {
+        /* conformal metric, lower indices */
+        const CCTK_REAL gtxx = ms->interp_values[I_GTXX][pt];
+        const CCTK_REAL gtyy = ms->interp_values[I_GTYY][pt];
+        const CCTK_REAL gtzz = ms->interp_values[I_GTZZ][pt];
+        const CCTK_REAL gtxy = ms->interp_values[I_GTXY][pt];
+        const CCTK_REAL gtxz = ms->interp_values[I_GTXZ][pt];
+        const CCTK_REAL gtyz = ms->interp_values[I_GTYZ][pt];
+
+        /* conformal traceless extrinsic curvature, lower indices */
+        const CCTK_REAL At[3][3] = {
+            { ms->interp_values[I_ATXX][pt], ms->interp_values[I_ATXY][pt], ms->interp_values[I_ATXZ][pt] },
+            { ms->interp_values[I_ATXY][pt], ms->interp_values[I_ATYY][pt], ms->interp_values[I_ATYZ][pt] },
+            { ms->interp_values[I_ATXZ][pt], ms->interp_values[I_ATYZ][pt], ms->interp_values[I_ATZZ][pt] },
+        };
+
+        const CCTK_REAL trK = ms->interp_values[I_K][pt];
+
+        const CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+        /* \tilde{γ}^{ij}: cofactors of the symmetric 3x3 metric over its determinant */
+        const CCTK_REAL inv_xx =  (gtyy * gtzz - SQR(gtyz)) / det;
+        const CCTK_REAL inv_yy =  (gtxx * gtzz - SQR(gtxz)) / det;
+        const CCTK_REAL inv_zz =  (gtxx * gtyy - SQR(gtxy)) / det;
+        const CCTK_REAL inv_xy = -(gtxy * gtzz - gtyz * gtxz) / det;
+        const CCTK_REAL inv_xz =  (gtxy * gtyz - gtyy * gtxz) / det;
+        const CCTK_REAL inv_yz = -(gtxx * gtyz - gtxy * gtxz) / det;
+
+        const CCTK_REAL gtu[3][3] = {
+            { inv_xx, inv_xy, inv_xz },
+            { inv_xy, inv_yy, inv_yz },
+            { inv_xz, inv_yz, inv_zz },
+        };
+
+        CCTK_REAL Am[3][3];
+        CCTK_REAL a2 = 0.0;
+
+        /* mixed-index curvature: Am[row][col] = Σ_l \tilde{γ}^{row l} \tilde{A}_{l col} */
+        for (int row = 0; row < 3; row++) {
+            for (int col = 0; col < 3; col++) {
+                double acc = 0.0;
+                for (int l = 0; l < 3; l++)
+                    acc += gtu[row][l] * At[l][col];
+                Am[row][col] = acc;
+            }
+        }
+
+        /* trace of Am squared, i.e. \tilde{A}_{ij} \tilde{A}^{ij} */
+        for (int row = 0; row < 3; row++)
+            for (int col = 0; col < 3; col++)
+                a2 += Am[row][col] * Am[col][row];
+
+        ms->metric_u[0][pt] = gtu[0][0];
+        ms->metric_u[1][pt] = gtu[1][1];
+        ms->metric_u[2][pt] = gtu[2][2];
+        ms->metric_u[3][pt] = gtu[0][1];
+        ms->metric_u[4][pt] = gtu[0][2];
+        ms->metric_u[5][pt] = gtu[1][2];
+
+        /* add the trace part to get the full K_{ij} K^{ij} */
+        ms->kij_kij[pt] = a2 + SQR(trK) / 3.;
+    }
+
+    return 0;
+}
+
+/*
+ * Preconditioned BiCGSTAB on the CPU, following the wikipedia article and
+ * http://www.netlib.org/templates/matlab/bicgstab.m
+ *
+ * Iteratively improves x towards the solution of mat * x = rhs, where mat is
+ * an N x N column-major matrix. ctx->k is applied as the preconditioning
+ * matrix and the remaining ctx vectors serve as scratch space. x must hold
+ * the initial guess on entry and is updated in place.
+ *
+ * Returns the zero-based index of the iteration at which the relative
+ * residual dropped below TOL (ctx->iter_total is then increased by the number
+ * of iterations done), or -1 if that did not happen within MAXITER
+ * iterations.
+ */
+static int solve_bicgstab(BiCGSTABContext *ctx, const int N,
+                          double *mat, double *rhs, double *x)
+{
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double *const precond = ctx->k;
+    double *const dir     = ctx->p;     // search direction
+    double *const dir_pc  = ctx->y;     // precond * dir
+    double *const mat_dir = ctx->v;     // mat * dir_pc
+    double *const res_pc  = ctx->z;     // precond * resid
+    double *const mat_res = ctx->t;     // mat * res_pc
+    double *const resid   = ctx->res;   // current residual
+    double *const shadow  = ctx->res0;  // initial residual, kept fixed
+
+    double rho_prev = 1.0;
+    double omega    = 1.0;
+    double alpha    = 1.0;
+
+    // resid = rhs - mat * x
+    memcpy(resid, rhs, N * sizeof(*resid));
+    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                mat, N, x, 1, 1.0, resid, 1);
+
+    memcpy(shadow, resid, N * sizeof(*shadow));
+    memcpy(dir,    resid, N * sizeof(*dir));
+
+#define MAXITER 16
+#define TOL (1e-15)
+    for (int iter = 0; iter < MAXITER; iter++) {
+        const double rho = cblas_ddot(N, resid, 1, shadow, 1);
+
+        if (iter > 0) {
+            // dir = resid + beta * (dir - omega * mat_dir)
+            const double beta = (rho / rho_prev) * (alpha / omega);
+
+            cblas_daxpy(N, -omega, mat_dir, 1, dir, 1);
+            cblas_dscal(N, beta, dir, 1);
+            cblas_daxpy(N, 1.0, resid, 1, dir, 1);
+        }
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    precond, N, dir, 1, 0.0, dir_pc, 1);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, dir_pc, 1, 0.0, mat_dir, 1);
+
+        alpha = rho / cblas_ddot(N, shadow, 1, mat_dir, 1);
+
+        // intermediate residual
+        cblas_daxpy(N, -alpha, mat_dir, 1, resid, 1);
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    precond, N, resid, 1, 0.0, res_pc, 1);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, res_pc, 1, 0.0, mat_res, 1);
+
+        omega = cblas_ddot(N, mat_res, 1, resid, 1) / cblas_ddot(N, mat_res, 1, mat_res, 1);
+
+        // x += alpha * dir_pc + omega * res_pc
+        cblas_daxpy(N, alpha, dir_pc, 1, x, 1);
+        cblas_daxpy(N, omega, res_pc, 1, x, 1);
+
+        cblas_daxpy(N, -omega, mat_res, 1, resid, 1);
+
+        if (cblas_dnrm2(N, resid, 1) / rhs_norm < TOL) {
+            ctx->iter_total += iter + 1;
+            return iter;
+        }
+
+        rho_prev = rho;
+    }
+
+    // no convergence within MAXITER iterations
+    return -1;
+}
+
+/*
+ * Preconditioned BiCGSTAB running on an OpenCL device through clBLAS; the
+ * sequence of operations mirrors solve_bicgstab().
+ *
+ * mat (N x N, column-major) and rhs are host arrays and are uploaded here.
+ * The preconditioner ctx->cl_k and the initial guess ocl_x are assumed to be
+ * present on the device already; ocl_x is updated in place. Scalars produced
+ * on the device (dot products, norms) are fetched with blocking reads, which
+ * are the synchronisation points marked BARRIER.
+ *
+ * Returns the zero-based index of the iteration at which the relative
+ * residual dropped below TOL (ctx->iter_total is then increased by the number
+ * of iterations done), or -1 if that did not happen within MAXITER
+ * iterations.
+ *
+ * NOTE(review): the return codes of the OpenCL/clBLAS calls are not checked
+ * and the events they hand back are never passed to clReleaseEvent().
+ */
+static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
+                             const int N, double *mat, double *rhs, cl_mem ocl_x)
+{
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega[2] = { 1.0 };
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    cl_event events[8];
+
+    // upload the matrix and the RHS to the GPU
+    // k and x are assumed to be already uploaded
+    clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double),
+                         rhs, 0, NULL, &events[0]);
+    clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double),
+                         mat, 0, NULL, &events[1]);
+
+    // initialize the residual
+    // clBLAS has its own order/transpose enums whose values differ from the
+    // CBLAS ones, so the cl* constants must be used here
+    clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, -1.0,
+                ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+                1, &cl_q, 2, events, &events[2]);
+    clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[3]);
+    clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[4]);
+
+    clWaitForEvents(5, events);
+    // BARRIER
+
+#define MAXITER 16
+#define TOL (1e-15)
+    for (i = 0; i < MAXITER; i++) {
+        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+                   ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]);
+        clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        if (i) {
+            // p = res + beta * (p - omega * v)
+            double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+                        1, &cl_q, 0, NULL, &events[0]);
+            clblasDscal(N, beta, ctx->cl_p, 0, 1,
+                        1, &cl_q, 1, &events[0], &events[1]);
+            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+                        1, &cl_q, 1, &events[1], &events[0]);
+            clWaitForEvents(1, &events[0]);
+            // BARRIER
+        }
+
+        // y = k * p
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+
+        // v = mat * y
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+                    1, &cl_q, 1, &events[0], &events[1]);
+
+        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+                   ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]);
+        clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        // what was read back is (res0 . v), turn it into the step length
+        alpha = rho / alpha;
+
+        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+
+        // z = k * res, t = mat * z
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+                    1, &cl_q, 1, &events[0], &events[1]);
+        clblasDgemv(clblasColumnMajor, clblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+                    1, &cl_q, 1, &events[1], &events[0]);
+
+        // (t . res) and (t . t) land in consecutive elements of cl_omega and
+        // are fetched together
+        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                   ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]);
+        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+                   ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]);
+
+        clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+                            2, &events[1], NULL);
+        // BARRIER
+
+        omega[0] /= omega[1];
+
+        // x += alpha * y + omega * z
+        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1,
+                    1, &cl_q, 1, &events[0], &events[1]);
+
+        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+                    1, &cl_q, 1, &events[0], &events[2]);
+        clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+                            1, &events[2], NULL);
+        clWaitForEvents(1, &events[1]);
+        // BARRIER
+
+        // convergence is judged on the relative residual, same as in
+        // solve_bicgstab()
+        err /= rhs_norm;
+        if (err < TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == MAXITER)
+        return -1;
+
+    ctx->iter_total += i + 1;
+
+    return i;
+}
+
+/*
+ * Solve mat * x = rhs by LU decomposition and invert mat, both in place.
+ *
+ * mat is an N x N column-major matrix; on success it is overwritten with its
+ * inverse and rhs with the solution x. ipiv must have room for N pivot
+ * indices.
+ *
+ * Returns 0 on success and -1 if either LAPACK routine reports a failure
+ * (invalid argument or singular matrix). In that case the contents of mat
+ * and rhs must not be used.
+ */
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+    int info;
+
+    info = LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+                         mat, N, ipiv, rhs, N);
+    if (info) {
+        /* without a valid factorisation there is nothing to invert */
+        fprintf(stderr, "LAPACKE_dgesv failed: %d\n", info);
+        return -1;
+    }
+
+    /* mat now holds the LU factors, turn them into the inverse */
+    info = LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+    if (info) {
+        fprintf(stderr, "LAPACKE_dgetri failed: %d\n", info);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Solve the equation
+ * D²α - KᵢⱼKⁱʲα = -K
+ * for the coefficients of spectral approximation of α:
+ * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z)
+ * where i = { 0, ... , ms->nb_coeffs_x - 1 };
+ * j = { 0, ... , ms->nb_coeffs_z - 1 };
+ * Tᵢ(x) are defined by ms->basis.
+ *
+ * The system is first attempted with BiCGSTAB (on the OpenCL device when
+ * ms->cl_queue is set, on the CPU otherwise), starting from the current
+ * contents of ms->coeffs. If that does not converge, or once 128 consecutive
+ * iterative solves have been done, the system is solved by LU decomposition
+ * instead and the matrix inverse obtained on the way becomes the new
+ * BiCGSTAB preconditioner.
+ *
+ * Returns 0 when the right-hand side is below EPS everywhere (the
+ * coefficients are then zeroed), the BiCGSTAB iteration index when the
+ * iterative solve converged, or a negative value otherwise.
+ *
+ * NOTE(review): ret is not reset after the LU fallback, so -1 is returned
+ * also when the LU solve was used -- confirm that callers expect this.
+ */
+int msa_maximal_solve(MaximalSlicingContext *ms)
+{
+ const int N = ms->nb_coeffs;
+ double rhs_max;
+
+ int ret = 0;
+
+ /* interpolate the metric values and construct the quantities we'll need */
+ CCTK_TimerStart("MaximalSlicingAxi_calc_geometry");
+ ret = calc_geometry(ms);
+ CCTK_TimerStop("MaximalSlicingAxi_calc_geometry");
+ if (ret < 0)
+ return ret;
+
+ /* fill the matrix */
+ CCTK_TimerStart("MaximalSlicingAxi_construct_matrix");
+ ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max);
+ CCTK_TimerStop("MaximalSlicingAxi_construct_matrix");
+ if (ret < 0)
+ return ret;
+
+ /* a vanishing right-hand side means all coefficients are zero, skip the
+ * solve and keep the device copy of the coefficients in sync */
+ if (rhs_max < EPS) {
+ memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+ if (ms->cl_queue) {
+ clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
+ ms->coeffs, 0, NULL, NULL);
+ }
+ return 0;
+ }
+
+ /* solve for the coeffs */
+ /* the iterative solver is only tried while fewer than 128 solves have
+ * been done since the preconditioner was last recomputed */
+ if (ms->steps_since_inverse < 128) {
+ BiCGSTABContext *b = &ms->bicgstab;
+ int64_t start = gettime();
+
+ CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB");
+ if (ms->cl_queue) {
+ ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs);
+ /* blocking read: fetch the device result into the host array */
+ clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N,
+ ms->coeffs, 0, NULL, NULL);
+ } else
+ ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs);
+ CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB");
+
+ if (ret >= 0) {
+ b->time_total += gettime() - start;
+ b->solve_total++;
+ ms->steps_since_inverse++;
+
+ /* print the solver statistics on every 128th successful solve */
+ if (!(b->solve_total & 127)) {
+ fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n",
+ b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total);
+ fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total);
+ }
+#if 0
+ {
+ double min, max;
+ gsl_vector_memcpy(b->y, ms->rhs);
+ cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0,
+ ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1);
+ gsl_vector_minmax(b->y, &min, &max);
+ if (fabs(min) > 1e-11 || fabs(max) > 1e-11)
+ abort();
+ }
+#endif
+ }
+ } else
+ ret = -1;
+
+ /* direct solve: used when BiCGSTAB failed or was skipped above */
+ if (ret < 0) {
+ double *tmpv;
+ double *tmpm;
+ int64_t start;
+
+ CCTK_TimerStart("MaximalSlicingAxi_solve_LU");
+ start = gettime();
+
+ /* the return value of lu_invert() is not checked here */
+ lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
+ ms->lu_solves_time += gettime() - start;
+ ms->lu_solves_total++;
+ CCTK_TimerStop("MaximalSlicingAxi_solve_LU");
+
+ /* lu_invert() works in place: ms->rhs now holds the solution, so
+ * exchange the buffers instead of copying */
+ tmpv = ms->coeffs;
+ ms->coeffs = ms->rhs;
+ ms->rhs = tmpv;
+
+ /* likewise ms->mat now holds the inverse, which becomes the BiCGSTAB
+ * preconditioner; the old preconditioner buffer is reused for the
+ * next matrix */
+ tmpm = ms->mat;
+ ms->mat = ms->bicgstab.k;
+ ms->bicgstab.k = tmpm;
+
+ /* upload the new preconditioner and solution to the device and wait
+ * until both transfers are done */
+ if (ms->cl_queue) {
+ cl_event events[2];
+ clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double),
+ ms->bicgstab.k, 0, NULL, &events[0]);
+ clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double),
+ ms->coeffs, 0, NULL, &events[1]);
+ clWaitForEvents(2, events);
+ }
+
+ ms->steps_since_inverse = 0;
+ }
+
+
+ return ret;
+}