From b581dc077be26ff6ca8e541cfdcadb9417a9ca49 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 13 Mar 2015 16:29:53 +0100 Subject: Split off the solver core into a separate file. --- src/make.code.defn | 2 +- src/maximal_slicing_axi.c | 715 +--------------------------------------------- src/maximal_slicing_axi.h | 203 +++++++++++++ src/solve.c | 523 +++++++++++++++++++++++++++++++++ 4 files changed, 728 insertions(+), 715 deletions(-) create mode 100644 src/solve.c diff --git a/src/make.code.defn b/src/make.code.defn index 629817b..d11b536 100644 --- a/src/make.code.defn +++ b/src/make.code.defn @@ -1,7 +1,7 @@ # Main make.code.defn file for thorn MaximalSlicingAxi # Source files in this directory -SRCS = basis.c maximal_slicing_axi.c register.c +SRCS = basis.c maximal_slicing_axi.c register.c solve.c # Subdirectories containing source files SUBDIRS = diff --git a/src/maximal_slicing_axi.c b/src/maximal_slicing_axi.c index 10661f6..48bb4c9 100644 --- a/src/maximal_slicing_axi.c +++ b/src/maximal_slicing_axi.c @@ -24,23 +24,6 @@ #define ACC_TEST 0 -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) > (y) ? (y) : (x)) -#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) - -/* - * small number to avoid r=0 singularities - */ -#define EPS 1E-08 - -#include -int64_t gettime(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; -} - /* * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) * SB(x, n) = sin((n + 1) arccot(|x| / L)) @@ -50,81 +33,6 @@ int64_t gettime(void) double scale_factor; -/* indices (in our code, not cactus structs) of the grid functions which we'll need to - * interpolate on the pseudospectral grid */ -enum MetricVars { - GTXX = 0, - GTYY, - GTZZ, - GTXY, - GTXZ, - GTYZ, - PHI, - ATXX, - ATYY, - ATZZ, - ATXY, - ATXZ, - ATYZ, - K, - XTX, - XTY, - XTZ, - BETAX, - BETAY, - BETAZ, - NB_METRIC_VARS, -}; - -/* indices of the interpolated values of the above grid functions and their derivatives */ -enum InterpMetricVars { - I_GTXX = 0, - I_GTXX_DX, - I_GTXX_DY, - I_GTXX_DZ, - I_GTYY, - I_GTYY_DX, - I_GTYY_DY, - I_GTYY_DZ, - I_GTZZ, - I_GTZZ_DX, - I_GTZZ_DY, - I_GTZZ_DZ, - I_GTXY, - I_GTXY_DX, - I_GTXY_DY, - I_GTXY_DZ, - I_GTXZ, - I_GTXZ_DX, - I_GTXZ_DY, - I_GTXZ_DZ, - I_GTYZ, - I_GTYZ_DX, - I_GTYZ_DY, - I_GTYZ_DZ, - I_PHI, - I_PHI_DX, - I_PHI_DY, - I_PHI_DZ, - I_ATXX, - I_ATYY, - I_ATZZ, - I_ATXY, - I_ATXZ, - I_ATYZ, - I_K, - I_K_DX, - I_K_DY, - I_K_DZ, - I_XTX, - I_XTY, - I_XTZ, - I_BETAX, - I_BETAY, - I_BETAZ, - NB_INTERP_VARS, -}; - /* mapping between our indices and thorn names */ static const char *metric_vars[] = { [GTXX] = "ML_BSSN::gt11", @@ -245,627 +153,6 @@ static const CCTK_INT interp_operation_codes[] = { [I_BETAZ] = 0, }; -/* precomputed values for a given refined grid */ -typedef struct CoordPatch { - CCTK_REAL origin[3]; - CCTK_REAL delta[3]; - CCTK_INT size[3]; - - // basis values on the grid - double *basis_val_r; - double *basis_val_z; - - double *transform_z; - double *one; -} CoordPatch; - -typedef struct ExpansionThreadData { - struct MaximalSlicingContext *ms; - CoordPatch *cp; - double *alp; - double *vec_tmp; - int start, end; -} ExpansionThreadData; - -/* state and scratch storage for the BiCGSTAB solver */ -typedef struct BiCGSTABContext { - double *p, *v, *y, *z, *t; - double *res, *res0; - double *k; - - cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; - cl_mem cl_res, cl_res0; - cl_mem cl_k, cl_mat; - cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; - cl_mem cl_tmp, cl_tmp1; - - int64_t solve_total; - int64_t iter_total; - int64_t time_total; -} BiCGSTABContext; - -typedef struct MaximalSlicingContext { - cGH *gh; - const BasisSet *basis; - - BiCGSTABContext bicgstab; - int steps_since_inverse; - - int64_t lu_solves_total; - int64_t lu_solves_time; - - // the grid of collocation points - CCTK_REAL *grid_x; - CCTK_REAL *grid_z; - - // interpolation parameters - int coord_system; - int interp_operator; - int interp_params; - - CCTK_REAL *interp_coords[3]; - - int interp_vars_indices[NB_METRIC_VARS]; - - CCTK_REAL *interp_values[NB_INTERP_VARS]; - CCTK_INT interp_value_codes[NB_INTERP_VARS]; - - CCTK_REAL *metric_u[6]; - - CCTK_REAL *kij_kij; - CCTK_REAL *trk; - - int nb_coeffs_x; - int nb_coeffs_z; - int nb_coeffs; - - int nb_colloc_points_x; - int nb_colloc_points_z; - int nb_colloc_points; - - int colloc_grid_order_x; - int colloc_grid_order_z; - - double *mat; - double *mat_f; - double *rhs; - double *coeffs; - int *ipiv; - double *basis_x_val; - double *basis_x_dval; - double *basis_x_d2val; - - double *basis_z_val; - double *basis_z_dval; - double *basis_z_d2val; - - double *basis_val_00; - double *basis_val_20; - double *basis_val_02; - double *basis_val_11; - double *basis_val_10; - double *basis_val_01; - - CoordPatch *patches; - int nb_patches; - - // OpenCL / CLBLAS stuff - cl_context cl_ctx; - cl_command_queue cl_queue; - - cl_mem ocl_coeffs; -} MaximalSlicingContext; - -static int construct_matrix(MaximalSlicingContext *ms, double *mat, - double *rhs, double *prhs_max) -{ - int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; - double rhs_max = 0.0; - -#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) -#define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) -#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) -#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) -#define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) -#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) - - //memset(mat, 0, sizeof(*mat) * ms->nb_coeffs * ms->nb_colloc_points); - -#pragma omp parallel for reduction(max : rhs_max) - for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) { - for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) { - CCTK_REAL x_physical = ms->grid_x[idx_grid_x]; - int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; - - const double gtuxx = ms->metric_u[0][idx_grid]; - const double gtuyy = ms->metric_u[1][idx_grid]; - const double gtuzz = ms->metric_u[2][idx_grid]; - const double gtuxz = ms->metric_u[4][idx_grid]; - - const double phi = ms->interp_values[I_PHI][idx_grid]; - const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid]; - const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid]; - - const double Xtx = ms->interp_values[I_XTX][idx_grid]; - const double Xtz = ms->interp_values[I_XTZ][idx_grid]; - - const double k2 = ms->kij_kij[idx_grid]; - const double trk = ms->interp_values[I_K][idx_grid]; - - const double trk_dx = ms->interp_values[I_K_DX][idx_grid]; - const double trk_dz = ms->interp_values[I_K_DZ][idx_grid]; - - const double betax = ms->interp_values[I_BETAX][idx_grid]; - const double betaz = ms->interp_values[I_BETAZ][idx_grid]; - - const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); - const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - - const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); - const double coeff_02 = SQR(phi) * gtuzz; - const double coeff_11 = SQR(phi) * gtuxz * 2; - const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical; - const double coeff_01 = -Xz; - const double coeff_00 = -k2; - -#if 1 - for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) - for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { - const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; - - //double d2alpha = gtuxx * D2BASIS_X * BASIS_Z - // + gtuzz * BASIS_X * D2BASIS_Z - // + 2 * gtuxz * DBASIS_X * DBASIS_Z; - //if (x_physical > EPS) - // d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; - //else - // d2alpha += gtuyy * D2BASIS_X * BASIS_Z; - - //double curv_term = Xx * DBASIS_X * BASIS_Z + Xz * BASIS_X * DBASIS_Z; - - - //double D2alpha = SQR(phi) * d2alpha - curv_term; - - //mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; - mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + - coeff_10 * DBASIS_X * BASIS_Z + - coeff_01 * BASIS_X * DBASIS_Z + - coeff_11 * DBASIS_X * DBASIS_Z + - coeff_20 * D2BASIS_X * BASIS_Z + - coeff_02 * BASIS_X * D2BASIS_Z; - } -#else - - const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); - const double coeff_02 = SQR(phi) * gtuzz; - const double coeff_11 = SQR(phi) * gtuxz * 2; - const double coeff_10 = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi + (x_physical > EPS) * gtuyy); - const double coeff_01 = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - const double coeff_00 = -k2; - cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); - cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); - cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); - cblas_daxpy(ms->nb_coeffs, coeff_10, ms->basis_val_10 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); - cblas_daxpy(ms->nb_coeffs, coeff_01, ms->basis_val_01 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); - cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); -#endif - - rhs[idx_grid] = k2 + trk ;// betax * trk_dx + betaz * trk_dz; - //rhs[idx_grid] = k2; - rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); - //rhs_max = fabs(rhs[idx_grid]); - } - } - - //memcpy(rhs, ms->kij_kij, sizeof(*rhs) * ms->nb_colloc_points); - //cblas_daxpy(ms->nb_colloc_points, 1.0, ms->interp_values[I_K], 1, rhs, 1); - //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAX], 1, ms->interp_values[I_K_DX], 1, 1.0, rhs, 1); - //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAZ], 1, ms->interp_values[I_K_DZ], 1, 1.0, rhs, 1); - - //*prhs_max = rhs[cblas_idamax(ms->nb_colloc_points, rhs, 1)]; - *prhs_max = rhs_max; - - return 0; -} - -// based on the wikipedia article -// and http://www.netlib.org/templates/matlab/bicgstab.m -static int solve_bicgstab(BiCGSTABContext *ctx, const int N, - double *mat, double *rhs, double *x) -{ - const double rhs_norm = cblas_dnrm2(N, rhs, 1); - - double rho, rho_prev = 1.0; - double omega = 1.0; - double alpha = 1.0; - - double err; - int i; - - double *k = ctx->k; - double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; - double *res = ctx->res, *res0 = ctx->res0; - - // initialize the residual - memcpy(res, rhs, N * sizeof(*res)); - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, - mat, N, x, 1, 1.0, res, 1); - - memcpy(res0, res, N * sizeof(*res0)); - memcpy(p, res, N * sizeof(*p)); - -#define MAXITER 16 -#define TOL (1e-15) - for (i = 0; i < MAXITER; i++) { - rho = cblas_ddot(N, res, 1, res0, 1); - - if (i) { - double beta = (rho / rho_prev) * (alpha / omega); - - cblas_daxpy(N, -omega, v, 1, p, 1); - cblas_dscal(N, beta, p, 1); - cblas_daxpy(N, 1, res, 1, p, 1); - } - - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - k, N, p, 1, 0.0, y, 1); - - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - mat, N, y, 1, 0.0, v, 1); - - alpha = rho / cblas_ddot(N, res0, 1, v, 1); - - cblas_daxpy(N, -alpha, v, 1, res, 1); - - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - k, N, res, 1, 0.0, z, 1); - cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - mat, N, z, 1, 0.0, t, 1); - - omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); - - cblas_daxpy(N, alpha, y, 1, x, 1); - cblas_daxpy(N, omega, z, 1, x, 1); - - cblas_daxpy(N, -omega, t, 1, res, 1); - - err = cblas_dnrm2(N, res, 1) / rhs_norm; - if (err < TOL) - break; - - rho_prev = rho; - } - if (i == MAXITER) - return -1; - - ctx->iter_total += i + 1; - - return i; -} - -static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, - const int N, double *mat, double *rhs, cl_mem ocl_x) -{ - const double rhs_norm = cblas_dnrm2(N, rhs, 1); - - double rho, rho_prev = 1.0; - double omega[2] = { 1.0 }; - double alpha = 1.0; - - double err; - int i; - - cl_event events[8]; - - // upload the matrix and the RHS to the GPU - // k and x are assumed to be already uploaded - clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), - rhs, 0, NULL, &events[0]); - clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), - mat, 0, NULL, &events[1]); - - // initialize the residual - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, - ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, - 1, &cl_q, 2, events, &events[2]); - clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), - 1, &events[2], &events[3]); - clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), - 1, &events[2], &events[4]); - - clWaitForEvents(5, events); - // BARRIER - -#define MAXITER 16 -#define TOL (1e-15) - for (i = 0; i < MAXITER; i++) { - clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, - ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]); - clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, - 1, &events[0], NULL); - // BARRIER - - if (i) { - double beta = (rho / rho_prev) * (alpha / omega[0]); - - clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - clblasDscal(N, beta, ctx->cl_p, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, - 1, &cl_q, 1, &events[1], &events[0]); - clWaitForEvents(1, &events[0]); - // BARRIER - } - - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - - clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, - ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]); - clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, - 1, &events[0], NULL); - // BARRIER - - alpha = rho / alpha; - - clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, - ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, - 1, &cl_q, 1, &events[1], &events[0]); - - clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, - ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]); - clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, - ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]); - - clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, - 2, &events[1], NULL); - // BARRIER - - omega[0] /= omega[1]; - - clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1, - 1, &cl_q, 1, &events[0], &events[1]); - - clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, - 1, &cl_q, 0, NULL, &events[0]); - clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, - 1, &cl_q, 1, &events[0], &events[2]); - clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, - 1, &events[2], NULL); - clWaitForEvents(1, &events[1]); - // BARRIER - - if (err < TOL) - break; - - rho_prev = rho; - } - if (i == MAXITER) - return -1; - - ctx->iter_total += i + 1; - - return i; -} - -static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) -{ - LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, - mat, N, ipiv, rhs, N); - LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); - - return 0; -} - -static int calc_geometry(MaximalSlicingContext *ms) -{ - int ret; - - ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params, - ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL, - (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices, - ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values); - if (ret < 0) - CCTK_WARN(0, "Error interpolating"); - -#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) - for (int i = 0; i < ms->nb_colloc_points; i++) { - CCTK_REAL Am[3][3], gtu[3][3]; - CCTK_REAL a2; - - CCTK_REAL gtxx = ms->interp_values[I_GTXX][i]; - CCTK_REAL gtyy = ms->interp_values[I_GTYY][i]; - CCTK_REAL gtzz = ms->interp_values[I_GTZZ][i]; - CCTK_REAL gtxy = ms->interp_values[I_GTXY][i]; - CCTK_REAL gtxz = ms->interp_values[I_GTXZ][i]; - CCTK_REAL gtyz = ms->interp_values[I_GTYZ][i]; - - CCTK_REAL Atxx = ms->interp_values[I_ATXX][i]; - CCTK_REAL Atyy = ms->interp_values[I_ATYY][i]; - CCTK_REAL Atzz = ms->interp_values[I_ATZZ][i]; - CCTK_REAL Atxy = ms->interp_values[I_ATXY][i]; - CCTK_REAL Atxz = ms->interp_values[I_ATXZ][i]; - CCTK_REAL Atyz = ms->interp_values[I_ATYZ][i]; - - CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz }, - { Atxy, Atyy, Atyz }, - { Atxz, Atyz, Atzz }}; - - CCTK_REAL trK = ms->interp_values[I_K][i]; - - CCTK_REAL Xtx = ms->interp_values[I_XTX][i]; - CCTK_REAL Xtz = ms->interp_values[I_XTZ][i]; - - CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); - - // \tilde{γ}^{ij} - gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; - gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; - gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; - gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; - gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; - gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; - gtu[1][0] = gtu[0][1]; - gtu[2][0] = gtu[0][2]; - gtu[2][1] = gtu[1][2]; - - // \tilde{A}_{i}^j - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) { - double val = 0.0; - for (int l = 0; l < 3; l++) - val += gtu[j][l] * At[l][k]; - Am[j][k] = val; - } - - // K_{ij} K^{ij} - a2 = 0.0; - for (int j = 0; j < 3; j++) - for (int k = 0; k < 3; k++) - a2 += Am[j][k] * Am[k][j]; - - ms->metric_u[0][i] = gtu[0][0]; - ms->metric_u[1][i] = gtu[1][1]; - ms->metric_u[2][i] = gtu[2][2]; - ms->metric_u[3][i] = gtu[0][1]; - ms->metric_u[4][i] = gtu[0][2]; - ms->metric_u[5][i] = gtu[1][2]; - - ms->kij_kij[i] = a2 + SQR(trK) / 3.; - } - - return 0; -} - -/* - * Solve the equation - * D²α - KᵢⱼKⁱʲα = -K - * for the coefficients of spectral approximation of α: - * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z) - * where i = { 0, ... , ms->nb_coeffs_x }; - * j = { 0, ... , ms->nb_coeffs_z }; - * Tᵢ(x) are defined by ms->basis. - */ -static int maximal_solve(MaximalSlicingContext *ms) -{ - const int N = ms->nb_coeffs; - double rhs_max; - - int ret = 0; - - /* interpolate the metric values and construct the quantities we'll need */ - CCTK_TimerStart("MaximalSlicingAxi_calc_geometry"); - ret = calc_geometry(ms); - CCTK_TimerStop("MaximalSlicingAxi_calc_geometry"); - if (ret < 0) - return ret; - - /* fill the matrix */ - CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); - ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max); - CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); - if (ret < 0) - return ret; - - if (rhs_max < EPS) { - memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); - if (ms->cl_queue) { - clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), - ms->coeffs, 0, NULL, NULL); - } - return 0; - } - - /* solve for the coeffs */ - if (ms->steps_since_inverse < 128) { - BiCGSTABContext *b = &ms->bicgstab; - int64_t start = gettime(); - - CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB"); - if (ms->cl_queue) { - ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs); - clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N, - ms->coeffs, 0, NULL, NULL); - } else - ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs); - CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); - - if (ret >= 0) { - b->time_total += gettime() - start; - b->solve_total++; - ms->steps_since_inverse++; - - if (!(b->solve_total & 127)) { - fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n", - b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total); - fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total); - } -#if 0 - { - double min, max; - gsl_vector_memcpy(b->y, ms->rhs); - cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0, - ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1); - gsl_vector_minmax(b->y, &min, &max); - if (fabs(min) > 1e-11 || fabs(max) > 1e-11) - abort(); - } -#endif - } - } else - ret = -1; - - if (ret < 0) { - double *tmpv; - double *tmpm; - int64_t start; - - CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); - start = gettime(); - - lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); - ms->lu_solves_time += gettime() - start; - ms->lu_solves_total++; - CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); - - tmpv = ms->coeffs; - ms->coeffs = ms->rhs; - ms->rhs = tmpv; - - tmpm = ms->mat; - ms->mat = ms->bicgstab.k; - ms->bicgstab.k = tmpm; - - if (ms->cl_queue) { - cl_event events[2]; - clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), - ms->bicgstab.k, 0, NULL, &events[0]); - clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double), - ms->coeffs, 0, NULL, &events[1]); - clWaitForEvents(2, events); - } - - ms->steps_since_inverse = 0; - } - - - return ret; -} - static void init_opencl(MaximalSlicingContext *ms) { int err, count; @@ -1254,7 +541,7 @@ void maximal_slicing_axi(CCTK_ARGUMENTS) cp = get_coord_patch(ms, x, y, z, alp); CCTK_TimerStart("MaximalSlicingAxi_Solve"); - maximal_solve(ms); + msa_maximal_solve(ms); CCTK_TimerStop("MaximalSlicingAxi_Solve"); if (export_coeffs) diff --git a/src/maximal_slicing_axi.h b/src/maximal_slicing_axi.h index b15420c..fc6c86f 100644 --- a/src/maximal_slicing_axi.h +++ b/src/maximal_slicing_axi.h @@ -1,9 +1,98 @@ +#include + +#include + +#include "cctk.h" + #define SQR(x) ((x) * (x)) #define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) > (y) ? (y) : (x)) +#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) + +/* + * small number to avoid r=0 singularities + */ +#define EPS 1E-08 #define SCALE_FACTOR scale_factor +/* indices (in our code, not cactus structs) of the grid functions which we'll need to + * interpolate on the pseudospectral grid */ +enum MetricVars { + GTXX = 0, + GTYY, + GTZZ, + GTXY, + GTXZ, + GTYZ, + PHI, + ATXX, + ATYY, + ATZZ, + ATXY, + ATXZ, + ATYZ, + K, + XTX, + XTY, + XTZ, + BETAX, + BETAY, + BETAZ, + NB_METRIC_VARS, +}; + +/* indices of the interpolated values of the above grid functions and their derivatives */ +enum InterpMetricVars { + I_GTXX = 0, + I_GTXX_DX, + I_GTXX_DY, + I_GTXX_DZ, + I_GTYY, + I_GTYY_DX, + I_GTYY_DY, + I_GTYY_DZ, + I_GTZZ, + I_GTZZ_DX, + I_GTZZ_DY, + I_GTZZ_DZ, + I_GTXY, + I_GTXY_DX, + I_GTXY_DY, + I_GTXY_DZ, + I_GTXZ, + I_GTXZ_DX, + I_GTXZ_DY, + I_GTXZ_DZ, + I_GTYZ, + I_GTYZ_DX, + I_GTYZ_DY, + I_GTYZ_DZ, + I_PHI, + I_PHI_DX, + I_PHI_DY, + I_PHI_DZ, + I_ATXX, + I_ATYY, + I_ATZZ, + I_ATXY, + I_ATXZ, + I_ATYZ, + I_K, + I_K_DX, + I_K_DY, + I_K_DZ, + I_XTX, + I_XTY, + I_XTZ, + I_BETAX, + I_BETAY, + I_BETAZ, + NB_INTERP_VARS, +}; + /* a set of basis functions */ typedef struct BasisSet { /* evaluate the idx-th basis function at the specified point*/ @@ -25,3 +114,117 @@ extern const BasisSet msa_tb_even_basis; extern const BasisSet msa_sb_even_basis; extern double scale_factor; + +/* precomputed values for a given refined grid */ +typedef struct CoordPatch { + CCTK_REAL origin[3]; + CCTK_REAL delta[3]; + CCTK_INT size[3]; + + // basis values on the grid + double *basis_val_r; + double *basis_val_z; + + double *transform_z; + double *one; +} CoordPatch; + +/* state and scratch storage for the BiCGSTAB solver */ +typedef struct BiCGSTABContext { + double *p, *v, *y, *z, *t; + double *res, *res0; + double *k; + + cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; + cl_mem cl_res, cl_res0; + cl_mem cl_k, cl_mat; + cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; + cl_mem cl_tmp, cl_tmp1; + + int64_t solve_total; + int64_t iter_total; + int64_t time_total; +} BiCGSTABContext; + +typedef struct MaximalSlicingContext { + cGH *gh; + const BasisSet *basis; + + BiCGSTABContext bicgstab; + int steps_since_inverse; + + int64_t lu_solves_total; + int64_t lu_solves_time; + + // the grid of collocation points + CCTK_REAL *grid_x; + CCTK_REAL *grid_z; + + // interpolation parameters + int coord_system; + int interp_operator; + int interp_params; + + CCTK_REAL *interp_coords[3]; + + int interp_vars_indices[NB_METRIC_VARS]; + + CCTK_REAL *interp_values[NB_INTERP_VARS]; + CCTK_INT interp_value_codes[NB_INTERP_VARS]; + + CCTK_REAL *metric_u[6]; + + CCTK_REAL *kij_kij; + CCTK_REAL *trk; + + int nb_coeffs_x; + int nb_coeffs_z; + int nb_coeffs; + + int nb_colloc_points_x; + int nb_colloc_points_z; + int nb_colloc_points; + + int colloc_grid_order_x; + int colloc_grid_order_z; + + double *mat; + double *mat_f; + double *rhs; + double *coeffs; + int *ipiv; + double *basis_x_val; + double *basis_x_dval; + double *basis_x_d2val; + + double *basis_z_val; + double *basis_z_dval; + double *basis_z_d2val; + + double *basis_val_00; + double *basis_val_20; + double *basis_val_02; + double *basis_val_11; + double *basis_val_10; + double *basis_val_01; + + CoordPatch *patches; + int nb_patches; + + // OpenCL / CLBLAS stuff + cl_context cl_ctx; + cl_command_queue cl_queue; + + cl_mem ocl_coeffs; +} MaximalSlicingContext; + +int msa_maximal_solve(MaximalSlicingContext *ms); + +#include +static inline int64_t gettime(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; +} + diff --git a/src/solve.c b/src/solve.c new file mode 100644 index 0000000..610dc0d --- /dev/null +++ b/src/solve.c @@ -0,0 +1,523 @@ + +#include +#include +#include + +#include +#include + +#include +#include + +#include "maximal_slicing_axi.h" + +static int construct_matrix(MaximalSlicingContext *ms, double *mat, + double *rhs, double *prhs_max) +{ + int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; + double rhs_max = 0.0; + +#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) +#define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) +#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) + + //memset(mat, 0, sizeof(*mat) * ms->nb_coeffs * ms->nb_colloc_points); + +#pragma omp parallel for reduction(max : rhs_max) + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) { + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) { + CCTK_REAL x_physical = ms->grid_x[idx_grid_x]; + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + const double gtuxx = ms->metric_u[0][idx_grid]; + const double gtuyy = ms->metric_u[1][idx_grid]; + const double gtuzz = ms->metric_u[2][idx_grid]; + const double gtuxz = ms->metric_u[4][idx_grid]; + + const double phi = ms->interp_values[I_PHI][idx_grid]; + const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid]; + const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid]; + + const double Xtx = ms->interp_values[I_XTX][idx_grid]; + const double Xtz = ms->interp_values[I_XTZ][idx_grid]; + + const double k2 = ms->kij_kij[idx_grid]; + const double trk = ms->interp_values[I_K][idx_grid]; + + const double trk_dx = ms->interp_values[I_K_DX][idx_grid]; + const double trk_dz = ms->interp_values[I_K_DZ][idx_grid]; + + const double betax = ms->interp_values[I_BETAX][idx_grid]; + const double betaz = ms->interp_values[I_BETAZ][idx_grid]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); + const double coeff_02 = SQR(phi) * gtuzz; + const double coeff_11 = SQR(phi) * gtuxz * 2; + const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical; + const double coeff_01 = -Xz; + const double coeff_00 = -k2; + +#if 1 + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + //double d2alpha = gtuxx * D2BASIS_X * BASIS_Z + // + gtuzz * BASIS_X * D2BASIS_Z + // + 2 * gtuxz * DBASIS_X * DBASIS_Z; + //if (x_physical > EPS) + // d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; + //else + // d2alpha += gtuyy * D2BASIS_X * BASIS_Z; + + //double curv_term = Xx * DBASIS_X * BASIS_Z + Xz * BASIS_X * DBASIS_Z; + + + //double D2alpha = SQR(phi) * d2alpha - curv_term; + + //mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + + coeff_10 * DBASIS_X * BASIS_Z + + coeff_01 * BASIS_X * DBASIS_Z + + coeff_11 * DBASIS_X * DBASIS_Z + + coeff_20 * D2BASIS_X * BASIS_Z + + coeff_02 * BASIS_X * D2BASIS_Z; + } +#else + + const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); + const double coeff_02 = SQR(phi) * gtuzz; + const double coeff_11 = SQR(phi) * gtuxz * 2; + const double coeff_10 = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi + (x_physical > EPS) * gtuyy); + const double coeff_01 = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + const double coeff_00 = -k2; + cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); + cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); + cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); + cblas_daxpy(ms->nb_coeffs, coeff_10, ms->basis_val_10 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); + cblas_daxpy(ms->nb_coeffs, coeff_01, ms->basis_val_01 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); + cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); +#endif + + rhs[idx_grid] = k2 + trk ;// betax * trk_dx + betaz * trk_dz; + //rhs[idx_grid] = k2; + rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); + //rhs_max = fabs(rhs[idx_grid]); + } + } + + //memcpy(rhs, ms->kij_kij, sizeof(*rhs) * ms->nb_colloc_points); + //cblas_daxpy(ms->nb_colloc_points, 1.0, ms->interp_values[I_K], 1, rhs, 1); + //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAX], 1, ms->interp_values[I_K_DX], 1, 1.0, rhs, 1); + //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAZ], 1, ms->interp_values[I_K_DZ], 1, 1.0, rhs, 1); + + //*prhs_max = rhs[cblas_idamax(ms->nb_colloc_points, rhs, 1)]; + *prhs_max = rhs_max; + + return 0; +} + + +static int calc_geometry(MaximalSlicingContext *ms) +{ + int ret; + + ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params, + ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL, + (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices, + ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values); + if (ret < 0) + CCTK_WARN(0, "Error interpolating"); + +#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) + for (int i = 0; i < ms->nb_colloc_points; i++) { + CCTK_REAL Am[3][3], gtu[3][3]; + CCTK_REAL a2; + + CCTK_REAL gtxx = ms->interp_values[I_GTXX][i]; + CCTK_REAL gtyy = ms->interp_values[I_GTYY][i]; + CCTK_REAL gtzz = ms->interp_values[I_GTZZ][i]; + CCTK_REAL gtxy = ms->interp_values[I_GTXY][i]; + CCTK_REAL gtxz = ms->interp_values[I_GTXZ][i]; + CCTK_REAL gtyz = ms->interp_values[I_GTYZ][i]; + + CCTK_REAL Atxx = ms->interp_values[I_ATXX][i]; + CCTK_REAL Atyy = ms->interp_values[I_ATYY][i]; + CCTK_REAL Atzz = ms->interp_values[I_ATZZ][i]; + CCTK_REAL Atxy = ms->interp_values[I_ATXY][i]; + CCTK_REAL Atxz = ms->interp_values[I_ATXZ][i]; + CCTK_REAL Atyz = ms->interp_values[I_ATYZ][i]; + + CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz }, + { Atxy, Atyy, Atyz }, + { Atxz, Atyz, Atzz }}; + + CCTK_REAL trK = ms->interp_values[I_K][i]; + + CCTK_REAL Xtx = ms->interp_values[I_XTX][i]; + CCTK_REAL Xtz = ms->interp_values[I_XTZ][i]; + + CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); + + // \tilde{γ}^{ij} + gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; + gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; + gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; + gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; + gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; + gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; + gtu[1][0] = gtu[0][1]; + gtu[2][0] = gtu[0][2]; + gtu[2][1] = gtu[1][2]; + + // \tilde{A}_{i}^j + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += gtu[j][l] * At[l][k]; + Am[j][k] = val; + } + + // K_{ij} K^{ij} + a2 = 0.0; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + a2 += Am[j][k] * Am[k][j]; + + ms->metric_u[0][i] = gtu[0][0]; + ms->metric_u[1][i] = gtu[1][1]; + ms->metric_u[2][i] = gtu[2][2]; + ms->metric_u[3][i] = gtu[0][1]; + ms->metric_u[4][i] = gtu[0][2]; + ms->metric_u[5][i] = gtu[1][2]; + + ms->kij_kij[i] = a2 + SQR(trK) / 3.; + } + + return 0; +} + +// based on the wikipedia article +// and http://www.netlib.org/templates/matlab/bicgstab.m +static int solve_bicgstab(BiCGSTABContext *ctx, const int N, + double *mat, double *rhs, double *x) +{ + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega = 1.0; + double alpha = 1.0; + + double err; + int i; + + double *k = ctx->k; + double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; + double *res = ctx->res, *res0 = ctx->res0; + + // initialize the residual + memcpy(res, rhs, N * sizeof(*res)); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + mat, N, x, 1, 1.0, res, 1); + + memcpy(res0, res, N * sizeof(*res0)); + memcpy(p, res, N * sizeof(*p)); + +#define MAXITER 16 +#define TOL (1e-15) + for (i = 0; i < MAXITER; i++) { + rho = cblas_ddot(N, res, 1, res0, 1); + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega); + + cblas_daxpy(N, -omega, v, 1, p, 1); + cblas_dscal(N, beta, p, 1); + cblas_daxpy(N, 1, res, 1, p, 1); + } + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, p, 1, 0.0, y, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, y, 1, 0.0, v, 1); + + alpha = rho / cblas_ddot(N, res0, 1, v, 1); + + cblas_daxpy(N, -alpha, v, 1, res, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, res, 1, 0.0, z, 1); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, z, 1, 0.0, t, 1); + + omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); + + cblas_daxpy(N, alpha, y, 1, x, 1); + cblas_daxpy(N, omega, z, 1, x, 1); + + cblas_daxpy(N, -omega, t, 1, res, 1); + + err = cblas_dnrm2(N, res, 1) / rhs_norm; + if (err < TOL) + break; + + rho_prev = rho; + } + if (i == MAXITER) + return -1; + + ctx->iter_total += i + 1; + + return i; +} + +static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, + const int N, double *mat, double *rhs, cl_mem ocl_x) +{ + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega[2] = { 1.0 }; + double alpha = 1.0; + + double err; + int i; + + cl_event events[8]; + + // upload the matrix and the RHS to the GPU + // k and x are assumed to be already uploaded + clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), + rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), + mat, 0, NULL, &events[1]); + + // initialize the residual + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, + 1, &cl_q, 2, events, &events[2]); + clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), + 1, &events[2], &events[3]); + clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), + 1, &events[2], &events[4]); + + clWaitForEvents(5, events); + // BARRIER + +#define MAXITER 16 +#define TOL (1e-15) + for (i = 0; i < MAXITER; i++) { + clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, + ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]); + clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, + 1, &events[0], NULL); + // BARRIER + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega[0]); + + clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + clblasDscal(N, beta, ctx->cl_p, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, + 1, &cl_q, 1, &events[1], &events[0]); + clWaitForEvents(1, &events[0]); + // BARRIER + } + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + + clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, + ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]); + clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, + 1, &events[0], NULL); + // BARRIER + + alpha = rho / alpha; + + clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, + 1, &cl_q, 1, &events[1], &events[0]); + + clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]); + clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, + ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]); + + clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, + 2, &events[1], NULL); + // BARRIER + + omega[0] /= omega[1]; + + clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + + clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, + 1, &cl_q, 1, &events[0], &events[2]); + clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, + 1, &events[2], NULL); + clWaitForEvents(1, &events[1]); + // BARRIER + + if (err < TOL) + break; + + rho_prev = rho; + } + if (i == MAXITER) + return -1; + + ctx->iter_total += i + 1; + + return i; +} + +static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) +{ + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); + + return 0; +} + +/* + * Solve the equation + * D²α - KᵢⱼKⁱʲα = -K + * for the coefficients of spectral approximation of α: + * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z) + * where i = { 0, ... , ms->nb_coeffs_x }; + * j = { 0, ... , ms->nb_coeffs_z }; + * Tᵢ(x) are defined by ms->basis. + */ +int msa_maximal_solve(MaximalSlicingContext *ms) +{ + const int N = ms->nb_coeffs; + double rhs_max; + + int ret = 0; + + /* interpolate the metric values and construct the quantities we'll need */ + CCTK_TimerStart("MaximalSlicingAxi_calc_geometry"); + ret = calc_geometry(ms); + CCTK_TimerStop("MaximalSlicingAxi_calc_geometry"); + if (ret < 0) + return ret; + + /* fill the matrix */ + CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); + ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max); + CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); + if (ret < 0) + return ret; + + if (rhs_max < EPS) { + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } + + /* solve for the coeffs */ + if (ms->steps_since_inverse < 128) { + BiCGSTABContext *b = &ms->bicgstab; + int64_t start = gettime(); + + CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB"); + if (ms->cl_queue) { + ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs); + clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N, + ms->coeffs, 0, NULL, NULL); + } else + ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs); + CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); + + if (ret >= 0) { + b->time_total += gettime() - start; + b->solve_total++; + ms->steps_since_inverse++; + + if (!(b->solve_total & 127)) { + fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n", + b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total); + fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total); + } +#if 0 + { + double min, max; + gsl_vector_memcpy(b->y, ms->rhs); + cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0, + ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1); + gsl_vector_minmax(b->y, &min, &max); + if (fabs(min) > 1e-11 || fabs(max) > 1e-11) + abort(); + } +#endif + } + } else + ret = -1; + + if (ret < 0) { + double *tmpv; + double *tmpm; + int64_t start; + + CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); + start = gettime(); + + lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); + ms->lu_solves_time += gettime() - start; + ms->lu_solves_total++; + CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); + + tmpv = ms->coeffs; + ms->coeffs = ms->rhs; + ms->rhs = tmpv; + + tmpm = ms->mat; + ms->mat = ms->bicgstab.k; + ms->bicgstab.k = tmpm; + + if (ms->cl_queue) { + cl_event events[2]; + clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), + ms->bicgstab.k, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double), + ms->coeffs, 0, NULL, &events[1]); + clWaitForEvents(2, events); + } + + ms->steps_since_inverse = 0; + } + + + return ret; +} -- cgit v1.2.3