From efce90efb4d4e38fdd44a96ccbf1424c8b5ab94e Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 7 Apr 2018 09:54:54 +0200 Subject: Large rewrite. Split into separate subsystems. --- interface.ccl | 2 +- param.ccl | 10 + schedule.ccl | 10 +- src/basis.c | 236 ++++++++++++++-- src/basis.h | 49 ++++ src/bicgstab.c | 420 ++++++++++++++++++++++++++++ src/bicgstab.h | 60 ++++ src/common.h | 30 ++ src/make.code.defn | 2 +- src/maximal_slicing_axi.c | 631 ++++++++++++----------------------------- src/maximal_slicing_axi.h | 209 +------------- src/ms_solve.c | 697 ++++++++++++++++++++++++++++++++++++++++++++++ src/ms_solve.h | 52 ++++ src/pssolve.c | 389 ++++++++++++++++++++++++++ src/pssolve.h | 116 ++++++++ src/solve.c | 463 ++++++++++++++++++++++++++---- 16 files changed, 2645 insertions(+), 731 deletions(-) create mode 100644 src/basis.h create mode 100644 src/bicgstab.c create mode 100644 src/bicgstab.h create mode 100644 src/common.h create mode 100644 src/ms_solve.c create mode 100644 src/ms_solve.h create mode 100644 src/pssolve.c create mode 100644 src/pssolve.h diff --git a/interface.ccl b/interface.ccl index cec39a3..1a2c245 100644 --- a/interface.ccl +++ b/interface.ccl @@ -1,7 +1,7 @@ # Interface definition for thorn MaximalSlicingAxi implements: MaximalSlicingAxi -INHERITS: ADMBase grid CoordBase MethodOfLines +INHERITS: ADMBase grid CoordBase MethodOfLines ML_BSSN CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx) CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx) diff --git a/param.ccl b/param.ccl index 2ca5e50..5309214 100644 --- a/param.ccl +++ b/param.ccl @@ -23,6 +23,16 @@ CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERA 1: :: "" } 40 +CCTK_REAL filter_power "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + +CCTK_REAL input_filter_power "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + CCTK_REAL scale_factor "Scaling factor L for the SB basis" STEERABLE=recover { 0: :: "" diff --git a/schedule.ccl b/schedule.ccl index 0d9c425..eafabd1 100644 --- a/schedule.ccl +++ b/schedule.ccl @@ -2,10 +2,18 @@ # if (CCTK_Equals(lapse_evolution_method, "maximal_axi")) { - SCHEDULE maximal_slicing_axi IN MoL_CalcRHS BEFORE ML_BSSN_evolCalcGroup { + SCHEDULE maximal_slicing_axi IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_RHS { LANG: C } "Maximal slicing in axisymmetry" + #SCHEDULE maximal_slicing_axi IN MoL_PostStepModify { + # LANG: C + #} "Maximal slicing in axisymmetry" + + #SCHEDULE maximal_slicing_axi IN MoL_PseudoEvolution { + # LANG: C + #} "Maximal slicing in axisymmetry" + SCHEDULE maximal_slicing_axi_register AT Startup { LANG: C } "" diff --git a/src/basis.c b/src/basis.c index 5ea043e..20e9f12 100644 --- a/src/basis.c +++ b/src/basis.c @@ -1,11 +1,34 @@ +/* + * Basis sets for pseudospectral methods + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ #include -#include "maximal_slicing_axi.h" +#include "basis.h" +#include "common.h" +/* + * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) + * SB(x, n) = sin((n + 1) arccot(|x| / L)) + * They are symmetric wrt origin and decay as 1/x in infinity. + */ static double sb_even_eval(double coord, int idx) { - double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + double val = atan2(SCALE_FACTOR, coord); idx *= 2; // even only @@ -14,20 +37,20 @@ static double sb_even_eval(double coord, int idx) static double sb_even_eval_diff1(double coord, int idx) { - double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + double val = atan2(SCALE_FACTOR, coord); idx *= 2; // even only - return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord)); + return - SCALE_FACTOR * (idx + 1) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord)); } static double sb_even_eval_diff2(double coord, int idx) { - double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + double val = atan2(SCALE_FACTOR, coord); idx *= 2; // even only - return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabs(coord) * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord)); + return SCALE_FACTOR * (idx + 1) * (2 * coord * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord)); } static double sb_even_colloc_point(int order, int idx) @@ -38,11 +61,15 @@ static double sb_even_colloc_point(int order, int idx) //order *= 2; //t = (idx + 2) * M_PI / (order + 4); +#if MS_POLAR + t = (idx + 2) * M_PI / (2 * order + 3); +#else t = (idx + 2) * M_PI / (2 * order + 2); +#endif return SCALE_FACTOR / tan(t); } -const BasisSet msa_sb_even_basis = { +const BasisSet ms_sb_even_basis = { .eval = sb_even_eval, .eval_diff1 = sb_even_eval_diff1, .eval_diff2 = sb_even_eval_diff2, @@ -87,17 +114,77 @@ static double tb_even_colloc_point(int order, int idx) //order *= 2; //t = (idx + 2) * M_PI / (order + 4); - t = (idx + 3) * M_PI / (2 * (order + 2)); + t = (idx + 2) * M_PI / (2 * order + 4); return SCALE_FACTOR / tan(t); } -const BasisSet msa_tb_even_basis = { +const BasisSet ms_tb_even_basis = { .eval = tb_even_eval, .eval_diff1 = tb_even_eval_diff1, .eval_diff2 = tb_even_eval_diff2, .colloc_point = tb_even_colloc_point, }; +static double tl_eval(double coord, int idx) +{ + double t = 2 * atan2(sqrt(SCALE_FACTOR), sqrt(fabs(coord))); + + //idx++; + + return cos(idx * t);// - 1.0; +} + +static double tl_eval_diff1(double coord, int idx) +{ + double y = fabs(coord); + double y12 = sqrt(y); + double t = 2 * atan2(sqrt(SCALE_FACTOR), y12); + + //idx++; + + if (y < 1e-10) + return -2.0 * SQR(idx) * pow(-1.0, idx) / SCALE_FACTOR; + + return idx * sqrt(SCALE_FACTOR) * sin(idx * t) / (y12 * (y + SCALE_FACTOR)); +} + +static double tl_eval_diff2(double coord, int idx) +{ + double y = fabs(coord); + double y12 = sqrt(y); + double t = 2 * atan2(sqrt(SCALE_FACTOR), y12); + + //idx++; + + if (y < 1e-10) + return 4.0 * SQR(idx) * (SQR(idx) + 2) * pow(-1.0, idx) / (3.0 * SQR(SCALE_FACTOR)); + + return -(SCALE_FACTOR * SQR(idx) * cos(idx * t) / (y * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (y12 * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (2 * y * y12 * (y + SCALE_FACTOR))); +} + +static double tl_colloc_point(int order, int idx) +{ + double t; + + if (!idx) + return 0.0; + + idx = order - idx - 1; + + order++; + idx++; + + t = (2 * idx + 1) * M_PI / (2 * order); + return SCALE_FACTOR / SQR(tan(0.5 * t)); +} + +const BasisSet ms_tl_basis = { + .eval = tl_eval, + .eval_diff1 = tl_eval_diff1, + .eval_diff2 = tl_eval_diff2, + .colloc_point = tl_colloc_point, +}; + static double full_eval(double coord, int idx) { double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); @@ -153,7 +240,7 @@ static double full_colloc_point(int order, int idx) } -const BasisSet msa_full_basis = { +const BasisSet ms_full_basis = { .eval = full_eval, .eval_diff1 = full_eval_diff1, .eval_diff2 = full_eval_diff2, @@ -162,28 +249,145 @@ const BasisSet msa_full_basis = { static double cheb_eval(double coord, int idx) { - return cos(2 * idx * acos(coord / SCALE_FACTOR)); + double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0)); + + //idx += 1; + return cos(idx * acos(x)); } static double cheb_eval_diff1(double coord, int idx) { - return 2 * idx * sin(2 * idx * acos(coord / SCALE_FACTOR)) / sqrt(SQR(SCALE_FACTOR) - SQR(coord)); + double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0)); + double t = acos(x); + + //idx += 1; + + if (fabs(x - 1.0) < 1e-8) + return (2.0 / SCALE_FACTOR) * SQR(idx); + if (fabs(x + 1.0) < 1e-8) + return -(2.0 / SCALE_FACTOR) * SQR(idx) * pow(-1.0, idx); + return (2.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t); } static double cheb_eval_diff2(double coord, int idx) { - double t = acos(coord / SCALE_FACTOR); - return 2 * idx * (cos(2 * idx * t) * 2 * idx / (SQR(SCALE_FACTOR) - SQR(coord)) + sin(2 * idx * t) * coord / pow(SQR(SCALE_FACTOR) - SQR(coord), 1.5)); + double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0)); + double t = acos(x); + double st = sin(t); + + //idx += 1; + + if (fabs(x - 1.0) < 1e-8) + return SQR(2.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.; + if (fabs(x + 1.0) < 1e-8) + return SQR(2.0 / SCALE_FACTOR) * pow(-1.0, idx) * SQR(idx) * (SQR(idx) - 1.0) / 3.; + + return SQR(2.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st))); } static double cheb_colloc_point(int order, int idx) { - return SCALE_FACTOR * cos((idx + 0.01) * M_PI / (4 * order + 4)); + double y; + + idx = order - idx - 1; + + //order += 1; + //idx += 1; + y = cos(idx * M_PI / (order - 1)); + //y = cos((2 * idx + 1) * M_PI / (2 * order)); + + return 0.5 * SCALE_FACTOR * (y + 1.0); } -const BasisSet msa_cheb_basis = { +const BasisSet ms_cheb_basis = { .eval = cheb_eval, .eval_diff1 = cheb_eval_diff1, .eval_diff2 = cheb_eval_diff2, .colloc_point = cheb_colloc_point, }; + +static double cheb_even_eval(double coord, int idx) +{ + double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0)); + + idx *= 2; + //idx += 1; + return cos(idx * acos(x)); +} + +static double cheb_even_eval_diff1(double coord, int idx) +{ + double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0)); + double t = acos(x); + + idx *= 2; + //idx += 1; + + if (fabs(fabs(x) - 1.0) < 1e-8) + return (1.0 / SCALE_FACTOR) * SQR(idx); + return (1.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t); +} + +static double cheb_even_eval_diff2(double coord, int idx) +{ + double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0)); + double t = acos(x); + double st = sin(t); + + idx *= 2; + //idx += 1; + + if (fabs(fabs(x) - 1) < 1e-8) + return SQR(1.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.; + + return SQR(1.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st))); +} + +static double cheb_even_colloc_point(int order, int idx) +{ + double y; + + idx = order - idx - 1; + + //order += 1; + //idx += 1; + order *= 2; + y = cos(idx * M_PI / (order - 1)); + y = cos((2 * idx + 2) * M_PI / (2 * order)); + + return SCALE_FACTOR * y; +} + +const BasisSet ms_cheb_even_basis = { + .eval = cheb_even_eval, + .eval_diff1 = cheb_even_eval_diff1, + .eval_diff2 = cheb_even_eval_diff2, + .colloc_point = cheb_even_colloc_point, +}; + +static double cos_even_eval(double coord, int idx) +{ + return cos(2 * idx * coord); +} + +static double cos_even_eval_diff1(double coord, int idx) +{ + return -2 * idx * sin(2 * idx * coord); +} + +static double cos_even_eval_diff2(double coord, int idx) +{ + return -4 * SQR(idx) * cos(2 * idx * coord); +} + +static double cos_even_colloc_point(int order, int idx) +{ + return M_PI * idx / (2 * order - 0); +} + +const BasisSet ms_cos_even_basis = { + .eval = cos_even_eval, + .eval_diff1 = cos_even_eval_diff1, + .eval_diff2 = cos_even_eval_diff2, + .colloc_point = cos_even_colloc_point, +}; diff --git a/src/basis.h b/src/basis.h new file mode 100644 index 0000000..c275d6f --- /dev/null +++ b/src/basis.h @@ -0,0 +1,49 @@ +/* + * Basis sets for pseudospectral methods + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef MS_BASIS_H +#define MS_BASIS_H + +/* a set of basis functions */ +typedef struct BasisSet { + /* evaluate the idx-th basis function at the specified point*/ + double (*eval) (double coord, int idx); + /* evaluate the first derivative of the idx-th basis function at the specified point*/ + double (*eval_diff1)(double coord, int idx); + /* evaluate the second derivative of the idx-th basis function at the specified point*/ + double (*eval_diff2)(double coord, int idx); + /** + * Get the idx-th collocation point for the specified order. + * idx runs from 0 to order - 1 (inclusive) + */ + double (*colloc_point)(int order, int idx); +} BasisSet; + +extern const BasisSet ms_cheb_basis; +extern const BasisSet ms_cheb_even_basis; +extern const BasisSet ms_full_basis; +extern const BasisSet ms_tb_even_basis; +extern const BasisSet ms_sb_even_basis; +extern const BasisSet ms_sb_odd_basis; +extern const BasisSet ms_tl_basis; +extern const BasisSet ms_cos_even_basis; + +#define SCALE_FACTOR scale_factor +extern double scale_factor; + +#endif /* MS_BASIS_H */ diff --git a/src/bicgstab.c b/src/bicgstab.c new file mode 100644 index 0000000..e5d782f --- /dev/null +++ b/src/bicgstab.c @@ -0,0 +1,420 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "common.h" + +#if HAVE_OPENCL +#include +#include +#endif + +#include +#include +#include +#include + +#include "bicgstab.h" + +#define BICGSTAB_MAXITER 16 +#define BICGSTAB_TOL (1e-15) + +struct BiCGStabContext { + int N; + + double *x; + double *p, *v, *y, *z, *t; + double *res, *res0; + double *k; + +#if HAVE_OPENCL + cl_context ocl_ctx; + cl_command_queue ocl_queue; + + cl_mem cl_x; + cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; + cl_mem cl_res, cl_res0; + cl_mem cl_k, cl_mat; + cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; + cl_mem cl_tmp, cl_tmp1; +#endif +}; + +#if HAVE_OPENCL +static int solve_cl(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + cl_command_queue ocl_q = ctx->ocl_queue; + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega[2] = { 1.0 }; + double alpha = 1.0; + + double err; + int i; + + cl_event events[8]; + + if (rhs_norm < BICGSTAB_TOL) { + memset(x, 0, N * sizeof(*x)); + return 0; + } + + // upload the matrix and RHS + clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); + + // initialize the residual + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, + 1, &ocl_q, 2, events, &events[2]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), + 1, &events[2], &events[3]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), + 1, &events[2], &events[4]); + + clWaitForEvents(5, events); + // BARRIER + + for (i = 0; i < MAXITER; i++) { + clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, + 1, &events[0], NULL); + // BARRIER + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega[0]); + + clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDscal(N, beta, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + clWaitForEvents(1, &events[0]); + // BARRIER + } + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, + 1, &events[0], NULL); + // BARRIER + + alpha = rho / alpha; + + clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + + clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]); + clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, + ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]); + + clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, + 2, &events[1], NULL); + // BARRIER + + omega[0] /= omega[1]; + + clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, + 1, &ocl_q, 1, &events[0], &events[2]); + clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, + 1, &events[2], NULL); + clWaitForEvents(1, &events[1]); + // BARRIER + + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N, + x, 0, NULL, NULL); + return i; +} +#endif + +// based on the wikipedia article +// and http://www.netlib.org/templates/matlab/bicgstab.m +static int solve_sw(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega = 1.0; + double alpha = 1.0; + + double err; + int i; + + double *k = ctx->k; + double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; + double *res = ctx->res, *res0 = ctx->res0; + + if (rhs_norm < BICGSTAB_TOL) { + memset(x, 0, N * sizeof(*x)); + return 0; + } + + // initialize the residual + memcpy(res, rhs, N * sizeof(*res)); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + mat, N, ctx->x, 1, 1.0, res, 1); + + memcpy(res0, res, N * sizeof(*res0)); + memcpy(p, res, N * sizeof(*p)); + + for (i = 0; i < BICGSTAB_MAXITER; i++) { + rho = cblas_ddot(N, res, 1, res0, 1); + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega); + + cblas_daxpy(N, -omega, v, 1, p, 1); + cblas_dscal(N, beta, p, 1); + cblas_daxpy(N, 1, res, 1, p, 1); + } + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, p, 1, 0.0, y, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, y, 1, 0.0, v, 1); + + alpha = rho / cblas_ddot(N, res0, 1, v, 1); + + cblas_daxpy(N, -alpha, v, 1, res, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, res, 1, 0.0, z, 1); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, z, 1, 0.0, t, 1); + + omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); + + cblas_daxpy(N, alpha, y, 1, ctx->x, 1); + cblas_daxpy(N, omega, z, 1, ctx->x, 1); + + cblas_daxpy(N, -omega, t, 1, res, 1); + + err = cblas_dnrm2(N, res, 1) / rhs_norm; + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + memcpy(x, ctx->x, sizeof(*x) * ctx->N); + + return i; +} + +int ms_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) +{ + int ret; + +#if HAVE_OPENCL + if (ctx->ocl_ctx) + ret = solve_cl(ctx, mat, rhs, x); + else +#endif + ret = solve_sw(ctx, mat, rhs, x); + if (ret < 0) + return ret; + +#if MS_VERIFY + { + int i; + double *y; + + y = malloc(sizeof(*y) * ctx->N); + memcpy(y, rhs, sizeof(*y) * ctx->N); + cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0, + mat, ctx->N, x, 1, 1.0, y, 1); + i = cblas_idamax(ctx->N, y, 1); + if (fabs(y[i]) > 1e-11) + abort(); + } +#endif + + return ret; +} + +int ms_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0) +{ +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + cl_event events[2]; + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double), + k, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double), + x0, 0, NULL, &events[1]); + clWaitForEvents(2, events); + } else +#endif + { + memcpy(ctx->x, x0, ctx->N * sizeof(*x0)); + memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k)); + } + + return 0; +} + +int ms_bicgstab_context_alloc(BiCGStabContext **pctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q) +{ + BiCGStabContext *ctx; + int ret = 0; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->N = N; + +#if HAVE_OPENCL + if (ocl_ctx) { + ctx->ocl_ctx = ocl_ctx; + ctx->ocl_queue = ocl_q; + +#define ALLOC(dst, size) \ +do { \ + ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \ + if (ret != CL_SUCCESS) \ + goto fail; \ +} while (0) + + ALLOC(cl_x, N * sizeof(double)); + ALLOC(cl_p, N * sizeof(double)); + ALLOC(cl_v, N * sizeof(double)); + ALLOC(cl_y, N * sizeof(double)); + ALLOC(cl_z, N * sizeof(double)); + ALLOC(cl_t, N * sizeof(double)); + ALLOC(cl_res, N * sizeof(double)); + ALLOC(cl_res0, N * sizeof(double)); + ALLOC(cl_tmp, N * sizeof(double)); + ALLOC(cl_tmp1, N * 2 * sizeof(double)); + + ALLOC(cl_k, N * N * sizeof(double)); + ALLOC(cl_mat, N * N * sizeof(double)); + + ALLOC(cl_rho, sizeof(double)); + ALLOC(cl_alpha, sizeof(double)); + ALLOC(cl_beta, sizeof(double)); + ALLOC(cl_omega, 2 * sizeof(double)); + ALLOC(cl_omega1, sizeof(double)); + } else +#endif + { + ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N); + } + +fail: + if (ret) { + ms_bicgstab_context_free(&ctx); + return -ENOMEM; + } + + *pctx = ctx; + return 0; +} + +void ms_bicgstab_context_free(BiCGStabContext **pctx) +{ + BiCGStabContext *ctx = *pctx; + + if (!ctx) + return; + + free(ctx->x); + free(ctx->p); + free(ctx->v); + free(ctx->y); + free(ctx->z); + free(ctx->t); + free(ctx->res); + free(ctx->res0); + free(ctx->k); + +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + clReleaseMemObject(ctx->cl_x); + clReleaseMemObject(ctx->cl_p); + clReleaseMemObject(ctx->cl_v); + clReleaseMemObject(ctx->cl_y); + clReleaseMemObject(ctx->cl_z); + clReleaseMemObject(ctx->cl_t); + clReleaseMemObject(ctx->cl_res); + clReleaseMemObject(ctx->cl_res0); + clReleaseMemObject(ctx->cl_tmp); + clReleaseMemObject(ctx->cl_tmp1); + + clReleaseMemObject(ctx->cl_k); + clReleaseMemObject(ctx->cl_mat); + + clReleaseMemObject(ctx->cl_rho); + clReleaseMemObject(ctx->cl_alpha); + clReleaseMemObject(ctx->cl_beta); + clReleaseMemObject(ctx->cl_omega); + clReleaseMemObject(ctx->cl_omega1); + } +#endif + + free(ctx); + *pctx = NULL; +} diff --git a/src/bicgstab.h b/src/bicgstab.h new file mode 100644 index 0000000..150c05c --- /dev/null +++ b/src/bicgstab.h @@ -0,0 +1,60 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef MS_BICGSTAB_H +#define MS_BICGSTAB_H + +#include "common.h" + +#if HAVE_OPENCL +#include +#else +typedef void* cl_context; +typedef void* cl_command_queue; +#endif + +typedef struct BiCGStabContext BiCGStabContext; + +/** + * Allocate and initialize the solver for the NxN system. + * + * If the OpenCL context and command queue are provided (non-NULL), the solver + * will run using clBLAS. + */ +int ms_bicgstab_context_alloc(BiCGStabContext **ctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q); + +/** + * Free the solver and all its internal state. + */ +void ms_bicgstab_context_free(BiCGStabContext **ctx); + +/** + * Initialise the solver with the given preconditioner matrix. This function + * may be any number of times on a given solver context. + */ +int ms_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0); + +/** + * Solve the linear system + * mat · x = rhs + * The result is written into x. + */ +int ms_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x); + +#endif /* MS_BICGSTAB_H */ diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..8f8f3fb --- /dev/null +++ b/src/common.h @@ -0,0 +1,30 @@ +#ifndef MS_COMMON_H +#define MS_COMMON_H + +#define HAVE_OPENCL 0 +#define MS_VERIFY 0 +#define MS_POLAR 0 +#define MS_CCZ4 0 + +#define SQR(x) ((x) * (x)) +#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) > (y) ? (y) : (x)) +#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) + +/* + * small number to avoid r=0 singularities + */ +#define EPS 1E-08 + +#include +#include +#include +static inline int64_t gettime(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; +} + +#endif /* MS_COMMON_H */ diff --git a/src/make.code.defn b/src/make.code.defn index d11b536..d74c8bb 100644 --- a/src/make.code.defn +++ b/src/make.code.defn @@ -1,7 +1,7 @@ # Main make.code.defn file for thorn MaximalSlicingAxi # Source files in this directory -SRCS = basis.c maximal_slicing_axi.c register.c solve.c +SRCS = basis.c bicgstab.c maximal_slicing_axi.c ms_solve.c pssolve.c register.c # Subdirectories containing source files SUBDIRS = diff --git a/src/maximal_slicing_axi.c b/src/maximal_slicing_axi.c index 48bb4c9..e5b0d85 100644 --- a/src/maximal_slicing_axi.c +++ b/src/maximal_slicing_axi.c @@ -21,446 +21,26 @@ #include "util_Table.h" #include "maximal_slicing_axi.h" - -#define ACC_TEST 0 - -/* - * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) - * SB(x, n) = sin((n + 1) arccot(|x| / L)) - * They are symmetric wrt origin and decay as 1/x in infinity. - */ +#include "ms_solve.h" double scale_factor; - -/* mapping between our indices and thorn names */ -static const char *metric_vars[] = { - [GTXX] = "ML_BSSN::gt11", - [GTYY] = "ML_BSSN::gt22", - [GTZZ] = "ML_BSSN::gt33", - [GTXY] = "ML_BSSN::gt12", - [GTXZ] = "ML_BSSN::gt13", - [GTYZ] = "ML_BSSN::gt23", - [ATXX] = "ML_BSSN::At11", - [ATYY] = "ML_BSSN::At22", - [ATZZ] = "ML_BSSN::At33", - [ATXY] = "ML_BSSN::At12", - [ATXZ] = "ML_BSSN::At13", - [ATYZ] = "ML_BSSN::At23", - [PHI] = "ML_BSSN::phi", - [K] = "ML_BSSN::trK", - [XTX] = "ML_BSSN::Xt1", - [XTY] = "ML_BSSN::Xt2", - [XTZ] = "ML_BSSN::Xt3", - [BETAX] = "ML_BSSN::beta1", - [BETAY] = "ML_BSSN::beta2", - [BETAZ] = "ML_BSSN::beta3", -}; - -/* mapping between the cactus grid values and interpolated values */ -static const CCTK_INT interp_operation_indices[] = { - [I_GTXX] = GTXX, - [I_GTXX_DX] = GTXX, - [I_GTXX_DY] = GTXX, - [I_GTXX_DZ] = GTXX, - [I_GTYY] = GTYY, - [I_GTYY_DX] = GTYY, - [I_GTYY_DY] = GTYY, - [I_GTYY_DZ] = GTYY, - [I_GTZZ] = GTZZ, - [I_GTZZ_DX] = GTZZ, - [I_GTZZ_DY] = GTZZ, - [I_GTZZ_DZ] = GTZZ, - [I_GTXY] = GTXY, - [I_GTXY_DX] = GTXY, - [I_GTXY_DY] = GTXY, - [I_GTXY_DZ] = GTXY, - [I_GTXZ] = GTXZ, - [I_GTXZ_DX] = GTXZ, - [I_GTXZ_DY] = GTXZ, - [I_GTXZ_DZ] = GTXZ, - [I_GTYZ] = GTYZ, - [I_GTYZ_DX] = GTYZ, - [I_GTYZ_DY] = GTYZ, - [I_GTYZ_DZ] = GTYZ, - [I_PHI] = PHI, - [I_PHI_DX] = PHI, - [I_PHI_DY] = PHI, - [I_PHI_DZ] = PHI, - [I_ATXX] = ATXX, - [I_ATYY] = ATYY, - [I_ATZZ] = ATZZ, - [I_ATXY] = ATXY, - [I_ATXZ] = ATXZ, - [I_ATYZ] = ATYZ, - [I_K] = K, - [I_K_DX] = K, - [I_K_DY] = K, - [I_K_DZ] = K, - [I_XTX] = XTX, - [I_XTY] = XTY, - [I_XTZ] = XTZ, - [I_BETAX] = BETAX, - [I_BETAY] = BETAY, - [I_BETAZ] = BETAZ, -}; - -/* the operation (plain value or x/y/z-derivative) to apply during interpolation */ -static const CCTK_INT interp_operation_codes[] = { - [I_GTXX] = 0, - [I_GTXX_DX] = 1, - [I_GTXX_DY] = 2, - [I_GTXX_DZ] = 3, - [I_GTYY] = 0, - [I_GTYY_DX] = 1, - [I_GTYY_DY] = 2, - [I_GTYY_DZ] = 3, - [I_GTZZ] = 0, - [I_GTZZ_DX] = 1, - [I_GTZZ_DY] = 2, - [I_GTZZ_DZ] = 3, - [I_GTXY] = 0, - [I_GTXY_DX] = 1, - [I_GTXY_DY] = 2, - [I_GTXY_DZ] = 3, - [I_GTXZ] = 0, - [I_GTXZ_DX] = 1, - [I_GTXZ_DY] = 2, - [I_GTXZ_DZ] = 3, - [I_GTYZ] = 0, - [I_GTYZ_DX] = 1, - [I_GTYZ_DY] = 2, - [I_GTYZ_DZ] = 3, - [I_PHI] = 0, - [I_PHI_DX] = 1, - [I_PHI_DY] = 2, - [I_PHI_DZ] = 3, - [I_ATXX] = 0, - [I_ATYY] = 0, - [I_ATZZ] = 0, - [I_ATXY] = 0, - [I_ATXZ] = 0, - [I_ATYZ] = 0, - [I_K] = 0, - [I_K_DX] = 1, - [I_K_DY] = 2, - [I_K_DZ] = 3, - [I_XTX] = 0, - [I_XTY] = 0, - [I_XTZ] = 0, - [I_BETAX] = 0, - [I_BETAY] = 0, - [I_BETAZ] = 0, -}; - -static void init_opencl(MaximalSlicingContext *ms) +/* get an approximate "main" frequency component in a basis function */ +static double calc_basis_freq(const BasisSet *b, int order) { - int err, count; - cl_platform_id platform; - cl_device_id device; - cl_context_properties props[3]; - - err = clGetPlatformIDs(1, &platform, &count); - if (err != CL_SUCCESS || count < 1) { - fprintf(stderr, "Could not get an OpenCL platform ID\n"); - return; - } - - err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &count); - if (err != CL_SUCCESS || count < 1) { - fprintf(stderr, "Could not get an OpenCL device ID\n"); - return; - } - - props[0] = CL_CONTEXT_PLATFORM; - props[1] = (cl_context_properties)platform; - props[2] = 0; - - ms->cl_ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); - if (err != CL_SUCCESS || !ms->cl_ctx) { - fprintf(stderr, "Could not create an OpenCL context\n"); - return; - } - - ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, device, 0, &err); - if (err != CL_SUCCESS || !ms->cl_queue) { - fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err); - goto fail; - } - - err = clblasSetup(); - if (err != CL_SUCCESS) { - fprintf(stderr, "Error setting up clBLAS\n"); - goto fail; - } - - ms->ocl_coeffs = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - - ms->bicgstab.cl_p = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_v = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_y = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_z = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_t = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_res = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_tmp = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err); - - ms->bicgstab.cl_k = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err); - ms->bicgstab.cl_mat = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err); - - ms->bicgstab.cl_rho = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); - ms->bicgstab.cl_alpha = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); - ms->bicgstab.cl_beta = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); - ms->bicgstab.cl_omega = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err); - ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); - - return; -fail: - if (ms->cl_queue) - clReleaseCommandQueue(ms->cl_queue); - ms->cl_queue = 0; - - if (ms->cl_ctx) - clReleaseContext(ms->cl_ctx); - ms->cl_ctx = 0; -} - -static MaximalSlicingContext *init_ms(cGH *cctkGH, - int basis_order_r, int basis_order_z, - double sf, - CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, - const int grid_size[3]) -{ - MaximalSlicingContext *ms; - int ret; - - //const double h = (x[1] - x[0]) / 8; - //fprintf(stderr, "h %g\n", h); - - //if (basis_order_r != basis_order_z) - // CCTK_WARN(0, "Different r and z basis orders are not supported."); - - ms = calloc(1, sizeof(*ms)); - - ms->gh = cctkGH; - - ms->basis = &msa_sb_even_basis; - //ms->basis = &full_basis; - //ms->basis = &cheb_basis; - - ms->nb_coeffs_x = basis_order_r; - ms->nb_coeffs_z = basis_order_z; - - ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z; - - ms->nb_colloc_points_x = basis_order_r; - ms->nb_colloc_points_z = basis_order_z; - - ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z; - - if (ms->nb_colloc_points != ms->nb_coeffs) - CCTK_WARN(0, "Non-square collocation matrix"); - - ms->colloc_grid_order_x = ms->nb_colloc_points_x; - ms->colloc_grid_order_z = ms->nb_colloc_points_z; - - ms->mat = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); - ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs); - ms->rhs = malloc(sizeof(double) * ms->nb_colloc_points); - - ms->mat_f = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); - ms->ipiv = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs); - -#if 1 - scale_factor = 1.0; - - scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 3) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1); - //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5; - fprintf(stderr, "scale factor %g\n", scale_factor); - -#else - scale_factor = sf; -#endif - - ms->grid_x = malloc(ms->nb_colloc_points_x * sizeof(*ms->grid_x)); - - for (int i = 0; i < ms->nb_colloc_points_x; i++) { -#if 0 - double target_val = ms->basis->colloc_point(ms->colloc_grid_order_x, i); - double best_diff = DBL_MAX, best_val = DBL_MAX; - - for (int j = 0; j < grid_size[0]; j++) { - int idx = CCTK_GFINDEX3D(cctkGH, j, 0, 0); - double val = x[idx]; - double diff = fabs(target_val - val); - - if (val > 0.0 && diff < best_diff) { - int k; - for (k = 0; k < i; k++) - if (ms->grid_x[k] == val) - break; - if (k == i) { - best_diff = diff; - best_val = val; - } - } - } - if (best_val == DBL_MAX) - abort(); - fprintf(stderr, "%d %g -> %g (%g)\n", i, target_val, best_val, target_val - best_val); - ms->grid_x[i] = best_val; -#elif 0 - double max = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)]; - ms->grid_x[i] = max * SQR(SQR((double)i / ms->nb_colloc_points_x)) + 0.001; -#else - ms->grid_x[i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i); -#endif - fprintf(stderr, "%d %g\n", i, ms->grid_x[i]); - } - - ms->grid_z = malloc(ms->nb_colloc_points_z * sizeof(*ms->grid_z)); - - for (int i = 0; i < ms->nb_colloc_points_z; i++) { - ms->grid_z[i] = ms->basis->colloc_point(ms->colloc_grid_order_z, i); - } - - /* precompute the basis values we will need */ - ms->basis_x_val = malloc(sizeof(*ms->basis_x_val) * ms->nb_colloc_points_x * ms->nb_coeffs_x); - ms->basis_x_dval = malloc(sizeof(*ms->basis_x_dval) * ms->nb_colloc_points_x * ms->nb_coeffs_x); - ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x); - for (int i = 0; i < ms->nb_colloc_points_x; i++) { - CCTK_REAL coord = ms->grid_x[i]; - for (int j = 0; j < ms->nb_coeffs_x; j++) { - ms->basis_x_val [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j); - ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j); - ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j); - } - } - - ms->basis_z_val = malloc(sizeof(*ms->basis_z_val) * ms->nb_colloc_points_z * ms->nb_coeffs_z); - ms->basis_z_dval = malloc(sizeof(*ms->basis_z_dval) * ms->nb_colloc_points_z * ms->nb_coeffs_z); - ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z); - for (int i = 0; i < ms->nb_colloc_points_z; i++) { - CCTK_REAL coord = ms->grid_z[i]; - for (int j = 0; j < ms->nb_coeffs_z; j++) { - ms->basis_z_val [i * ms->nb_coeffs_z + j] = ms->basis->eval(coord, j); - ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis->eval_diff1(coord, j); - ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis->eval_diff2(coord, j); - } - } - - ms->basis_val_00 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); - ms->basis_val_11 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); - ms->basis_val_10 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); - ms->basis_val_01 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); - ms->basis_val_02 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); - ms->basis_val_20 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00)); - for (int i = 0; i < ms->nb_colloc_points_z; i++) { - const double *basis_val_z = ms->basis_z_val + i * ms->nb_coeffs_z; - const double *dbasis_val_z = ms->basis_z_dval + i * ms->nb_coeffs_z; - const double *d2basis_val_z = ms->basis_z_d2val + i * ms->nb_coeffs_z; - - for (int j = 0; j < ms->nb_colloc_points_x; j++) { - const double *basis_val_x = ms->basis_x_val + j * ms->nb_coeffs_x; - const double *dbasis_val_x = ms->basis_x_dval + j * ms->nb_coeffs_x; - const double *d2basis_val_x = ms->basis_x_d2val + j * ms->nb_coeffs_x; - const int idx_grid = i * ms->nb_colloc_points_x + j; - - for (int k = 0; k < ms->nb_coeffs_z; k++) - for (int l = 0; l < ms->nb_coeffs_x; l++) { - const int idx_coeff = k * ms->nb_coeffs_x + l; - const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; - ms->basis_val_00[idx] = basis_val_x[l] * basis_val_z[k]; - ms->basis_val_11[idx] = dbasis_val_x[l] * dbasis_val_z[k]; - ms->basis_val_10[idx] = dbasis_val_x[l] * basis_val_z[k]; - ms->basis_val_01[idx] = basis_val_x[l] * dbasis_val_z[k]; - ms->basis_val_02[idx] = basis_val_x[l] * d2basis_val_z[k]; - ms->basis_val_20[idx] = d2basis_val_x[l] * basis_val_z[k]; - } - } - } - - ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0])); - ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1])); - ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2])); - for (int i = 0; i < ms->nb_colloc_points_z; i++) { - CCTK_REAL z = ms->grid_z[i]; - for (int j = 0; j < ms->nb_colloc_points_x; j++) { - CCTK_REAL x = ms->grid_x[j]; - - ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x; - ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0; - ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z; - } - } - - for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++) - ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i])); - - ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij)); - - for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) { - ms->interp_values[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i])); - ms->interp_value_codes[i] = CCTK_VARIABLE_REAL; - } - - for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) - ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]); - - ms->coord_system = CCTK_CoordSystemHandle("cart3d"); - if (ms->coord_system < 0) - CCTK_WARN(0, "Error getting the coordinate system"); - - ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation"); - if (ms->interp_operator < 0) - CCTK_WARN(0, "Error getting the interpolation operator"); - - ms->interp_params = Util_TableCreateFromString("order=2 want_global_mode=1"); - if (ms->interp_params < 0) - CCTK_WARN(0, "Error creating interpolation parameters table"); - - ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS, - interp_operation_codes, "operation_codes"); - if (ret < 0) - CCTK_WARN(0, "Error setting operation codes"); - - ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS, - interp_operation_indices, "operand_indices"); - if (ret < 0) - CCTK_WARN(0, "Error setting operand indices"); - - ms->bicgstab.p = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.v = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.y = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.z = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.t = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.res = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs); - ms->bicgstab.k = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); - - ms->steps_since_inverse = INT_MAX; - - init_opencl(ms); - - CCTK_TimerCreate("MaximalSlicingAxi_Solve"); - CCTK_TimerCreate("MaximalSlicingAxi_Expand"); - CCTK_TimerCreate("MaximalSlicingAxi_calc_geometry"); - CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix"); - CCTK_TimerCreate("MaximalSlicingAxi_solve_LU"); - CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB"); - - return ms; + return b->colloc_point(order, 1); } static CoordPatch *get_coord_patch(MaximalSlicingContext *ms, - CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, - CCTK_REAL *alp) + CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z) { cGH *cctkGH = ms->gh; + int nb_coeffs_x = ms->solver->nb_coeffs[0]; + int nb_coeffs_z = ms->solver->nb_coeffs[1]; CoordPatch *cp; int64_t grid_size; + int i; for (int i = 0; i < ms->nb_patches; i++) { cp = &ms->patches[i]; @@ -471,9 +51,9 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms, cp->size[0] == ms->gh->cctk_lsh[0] && cp->size[1] == ms->gh->cctk_lsh[1] && cp->size[2] == ms->gh->cctk_lsh[2] && - cp->delta[0] == ms->gh->cctk_delta_space[0] && - cp->delta[1] == ms->gh->cctk_delta_space[1] && - cp->delta[2] == ms->gh->cctk_delta_space[2]) + cp->delta[0] == ms->gh->cctk_levfac[0] && + cp->delta[1] == ms->gh->cctk_levfac[1] && + cp->delta[2] == ms->gh->cctk_levfac[2]) return cp; } @@ -483,10 +63,21 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms, ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1)); cp = &ms->patches[ms->nb_patches]; + memset(cp, 0, sizeof(*cp)); + memcpy(cp->origin, ms->gh->cctk_origin_space, sizeof(cp->origin)); memcpy(cp->size, ms->gh->cctk_lsh, sizeof(cp->size)); - memcpy(cp->delta, ms->gh->cctk_delta_space, sizeof(cp->delta)); + memcpy(cp->delta, ms->gh->cctk_levfac, sizeof(cp->delta)); + + for (i = 0; i < cp->size[1]; i++) + if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) { + cp->y_idx = i; + break; + } + if (i == cp->size[1]) + CCTK_WARN(0, "The grid does not include y==0"); +#if 0 posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]); for (int j = 0; j < ms->gh->cctk_lsh[1]; j++) for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) { @@ -500,23 +91,91 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms, } posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]); - for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) { CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)]; for (int j = 0; j < ms->nb_coeffs_z; j++) - cp->basis_val_z [i * ms->nb_coeffs_z + j] = ms->basis->eval(zz, j); + cp->basis_val_z [i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j); //cp->basis_val_z [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j); } - posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x); posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size); for (int i = 0; i < grid_size; i++) cp->one[i] = 1.0; +#else + posix_memalign((void**)&cp->transform_matrix, 32, sizeof(*cp->transform_matrix) * nb_coeffs_x * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * nb_coeffs_z * cp->size[0] * cp->size[2]); +#pragma omp parallel for + for (int j = 0; j < cp->size[2]; j++) { + CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, j)]; + + for (int i = 0; i < cp->size[0]; i++) { + const int idx_grid = j * cp->size[0] + i; + + double xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)]; + double rr = sqrt(SQR(xx) + SQR(zz)); + +#if MS_POLAR + double coord0 = rr; + double coord1 = atan2(zz, xx); +#else + double coord0 = xx; + double coord1 = zz; +#endif + + //for (int k = 0; k < ms->nb_coeffs_z; k++) + // for (int l = 0; l < ms->nb_coeffs_x; l++) { + // const int idx_coeff = k * ms->nb_coeffs_x + l; + // cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = ms->basis->eval(r, l) * ms->basis1->eval(phi, k); + // } + for (int k = 0; k < nb_coeffs_x; k++) { + double dx = calc_basis_freq(ms->solver->basis[0], k); + double r0 = dx * 64; + double fact = exp(-36.0 * pow(rr / r0, 10)); + fact = 1.0; + + cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = ms->solver->basis[0]->eval(coord0, k) * fact; + } + for (int k = 0; k < nb_coeffs_z; k++) { + double dx = calc_basis_freq(ms->solver->basis[1], k); + double r0 = dx * 64; + double fact = exp(-36.0 * pow(rr / r0, 10)); + fact = 1.0; + + cp->transform_matrix1[idx_grid * nb_coeffs_z + k] = ms->solver->basis[1]->eval(coord1, k) * fact; + } + } + } + posix_memalign((void**)&cp->transform_tmp, 32, sizeof(*cp->transform_tmp) * cp->size[0] * cp->size[2] * nb_coeffs_z); +#endif ms->nb_patches++; return cp; } +static int context_init(cGH *cctkGH, MaximalSlicingContext **ctx) +{ + MaximalSlicingContext *ms; + int ret; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + ms = calloc(1, sizeof(*ms)); + if (!ms) + return -ENOMEM; + + ms->gh = cctkGH; + + ret = ms_solver_init(&ms->solver, cctkGH, basis_order_r, basis_order_z, + scale_factor, filter_power, 0.0); + if (ret < 0) + return ret; + + *ctx = ms; + + return 0; +} + void maximal_slicing_axi(CCTK_ARGUMENTS) { static MaximalSlicingContext *ms; @@ -526,45 +185,97 @@ void maximal_slicing_axi(CCTK_ARGUMENTS) DECLARE_CCTK_ARGUMENTS; DECLARE_CCTK_PARAMETERS; - static double total; - static int64_t count; - int64_t timer_start; + int64_t expand_start, totaltime_start; + + double *coeffs = NULL; + int i, ret; - const int64_t grid_size = cctk_lsh[2] * cctk_lsh[1] * cctk_lsh[0]; + totaltime_start = gettime(); /* on the first run, init the solver */ - if (!ms) { - ms = init_ms(cctkGH, basis_order_r, basis_order_z, - scale_factor, x, y, z, cctk_lsh); - } + if (!ms) + context_init(cctkGH, &ms); - cp = get_coord_patch(ms, x, y, z, alp); + cp = get_coord_patch(ms, x, y, z); CCTK_TimerStart("MaximalSlicingAxi_Solve"); - msa_maximal_solve(ms); + ms_solver_solve(ms->solver); CCTK_TimerStop("MaximalSlicingAxi_Solve"); + coeffs = ms->solver->coeffs; + if (export_coeffs) - memcpy(alpha_coeffs, ms->coeffs, sizeof(*alpha_coeffs) * ms->nb_coeffs); + memcpy(alpha_coeffs, coeffs, sizeof(*alpha_coeffs) * ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]); CCTK_TimerStart("MaximalSlicingAxi_Expand"); - timer_start = gettime(); + expand_start = gettime(); +#if 0 +#pragma omp parallel for + for (int k = 0; k < cctk_lsh[2]; k++) { + for (int i = 0; i < cctk_lsh[0]; i++) { + int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k); + double xx = x[idx]; + double zz = z[idx]; + double r = sqrt(SQR(xx) + SQR(zz)); + double phi = atan2(zz, xx); + + double val = 1.0; + + for (int l = 0; l < ms->nb_coeffs_z; l++) { + double tmp = 0.0; + for (int m = 0; m < ms->nb_coeffs_x; m++) { + const int idx_coeff = l * ms->nb_coeffs_x + m; + tmp += ms->coeffs[idx_coeff] * ms->basis->eval(r, m); + } + val += tmp * ms->basis1->eval(phi, l); + } - memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp)); - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0, - ms->coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z, - 0.0, cp->transform_z, ms->nb_coeffs_x); + alp[idx] = val; + } + } +#else cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0, - cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x, - 1.0, alp, cctk_lsh[0] * cctk_lsh[1]); - - total += gettime() - timer_start; + cctk_lsh[0] * cctk_lsh[2], ms->solver->nb_coeffs[1], ms->solver->nb_coeffs[0], + 1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2], + coeffs, ms->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +#pragma omp parallel for + for (int j = 0; j < cctk_lsh[2]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + const int idx_grid = j * cctk_lsh[0] + i; + const double val = cblas_ddot(ms->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * ms->solver->nb_coeffs[1], 1, + cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); + alpha[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = 1.0 + val; + } +#endif + //memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp)); + //memset(alp, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp)); + //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + // ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0, + // coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z, + // 0.0, cp->transform_z, ms->nb_coeffs_x); + //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + // cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0, + // cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x, + // 1.0, alp, cctk_lsh[0] * cctk_lsh[1]); + + ms->grid_expand_time += gettime() - expand_start; + ms->grid_expand_count++; CCTK_TimerStop("MaximalSlicingAxi_Expand"); - count++; - if (!(count & 15)) - fprintf(stderr, "avg %g total %g\n", total / count, total); + //CCTK_TimerStart("MaximalSlicingAxi_Polish"); + //msa_sor(ms, cp); + //CCTK_TimerStop("MaximalSlicingAxi_Polish"); + + /* print stats */ + if (!(ms->grid_expand_count & 255)) { + fprintf(stderr, "Maximal slicing stats:\n"); + + ms_solver_print_stats(ms->solver); + fprintf(stderr, + "%lu evals: total time %g s, avg time per call %g ms\n", + ms->grid_expand_count, (double)ms->grid_expand_time / 1e6, + (double)ms->grid_expand_time / ms->grid_expand_count / 1e3); + + } } diff --git a/src/maximal_slicing_axi.h b/src/maximal_slicing_axi.h index fc6c86f..3bb97aa 100644 --- a/src/maximal_slicing_axi.h +++ b/src/maximal_slicing_axi.h @@ -1,124 +1,23 @@ +#ifndef MAXIMAL_SLICING_AXI_H +#define MAXIMAL_SLICING_AXI_H + #include #include #include "cctk.h" -#define SQR(x) ((x) * (x)) -#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#define MIN(x, y) ((x) > (y) ? (y) : (x)) -#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) - -/* - * small number to avoid r=0 singularities - */ -#define EPS 1E-08 +#include "ms_solve.h" #define SCALE_FACTOR scale_factor -/* indices (in our code, not cactus structs) of the grid functions which we'll need to - * interpolate on the pseudospectral grid */ -enum MetricVars { - GTXX = 0, - GTYY, - GTZZ, - GTXY, - GTXZ, - GTYZ, - PHI, - ATXX, - ATYY, - ATZZ, - ATXY, - ATXZ, - ATYZ, - K, - XTX, - XTY, - XTZ, - BETAX, - BETAY, - BETAZ, - NB_METRIC_VARS, -}; - -/* indices of the interpolated values of the above grid functions and their derivatives */ -enum InterpMetricVars { - I_GTXX = 0, - I_GTXX_DX, - I_GTXX_DY, - I_GTXX_DZ, - I_GTYY, - I_GTYY_DX, - I_GTYY_DY, - I_GTYY_DZ, - I_GTZZ, - I_GTZZ_DX, - I_GTZZ_DY, - I_GTZZ_DZ, - I_GTXY, - I_GTXY_DX, - I_GTXY_DY, - I_GTXY_DZ, - I_GTXZ, - I_GTXZ_DX, - I_GTXZ_DY, - I_GTXZ_DZ, - I_GTYZ, - I_GTYZ_DX, - I_GTYZ_DY, - I_GTYZ_DZ, - I_PHI, - I_PHI_DX, - I_PHI_DY, - I_PHI_DZ, - I_ATXX, - I_ATYY, - I_ATZZ, - I_ATXY, - I_ATXZ, - I_ATYZ, - I_K, - I_K_DX, - I_K_DY, - I_K_DZ, - I_XTX, - I_XTY, - I_XTZ, - I_BETAX, - I_BETAY, - I_BETAZ, - NB_INTERP_VARS, -}; - -/* a set of basis functions */ -typedef struct BasisSet { - /* evaluate the idx-th basis function at the specified point*/ - double (*eval) (double coord, int idx); - /* evaluate the first derivative of the idx-th basis function at the specified point*/ - double (*eval_diff1)(double coord, int idx); - /* evaluate the second derivative of the idx-th basis function at the specified point*/ - double (*eval_diff2)(double coord, int idx); - /** - * Get the idx-th collocation point for the specified order. - * idx runs from 0 to order - 1 (inclusive) - */ - double (*colloc_point)(int order, int idx); -} BasisSet; - -extern const BasisSet msa_cheb_basis; -extern const BasisSet msa_full_basis; -extern const BasisSet msa_tb_even_basis; -extern const BasisSet msa_sb_even_basis; - extern double scale_factor; /* precomputed values for a given refined grid */ typedef struct CoordPatch { CCTK_REAL origin[3]; - CCTK_REAL delta[3]; + CCTK_INT delta[3]; CCTK_INT size[3]; // basis values on the grid @@ -126,105 +25,25 @@ typedef struct CoordPatch { double *basis_val_z; double *transform_z; + double *transform_matrix; + double *transform_matrix1; + double *transform_tmp; double *one; -} CoordPatch; - -/* state and scratch storage for the BiCGSTAB solver */ -typedef struct BiCGSTABContext { - double *p, *v, *y, *z, *t; - double *res, *res0; - double *k; - - cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; - cl_mem cl_res, cl_res0; - cl_mem cl_k, cl_mat; - cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; - cl_mem cl_tmp, cl_tmp1; - int64_t solve_total; - int64_t iter_total; - int64_t time_total; -} BiCGSTABContext; + int y_idx; +} CoordPatch; typedef struct MaximalSlicingContext { + MSSolver *solver; cGH *gh; - const BasisSet *basis; - - BiCGSTABContext bicgstab; - int steps_since_inverse; - - int64_t lu_solves_total; - int64_t lu_solves_time; - - // the grid of collocation points - CCTK_REAL *grid_x; - CCTK_REAL *grid_z; - - // interpolation parameters - int coord_system; - int interp_operator; - int interp_params; - - CCTK_REAL *interp_coords[3]; - int interp_vars_indices[NB_METRIC_VARS]; - - CCTK_REAL *interp_values[NB_INTERP_VARS]; - CCTK_INT interp_value_codes[NB_INTERP_VARS]; - - CCTK_REAL *metric_u[6]; - - CCTK_REAL *kij_kij; - CCTK_REAL *trk; - - int nb_coeffs_x; - int nb_coeffs_z; - int nb_coeffs; - - int nb_colloc_points_x; - int nb_colloc_points_z; - int nb_colloc_points; - - int colloc_grid_order_x; - int colloc_grid_order_z; - - double *mat; - double *mat_f; - double *rhs; - double *coeffs; - int *ipiv; - double *basis_x_val; - double *basis_x_dval; - double *basis_x_d2val; - - double *basis_z_val; - double *basis_z_dval; - double *basis_z_d2val; - - double *basis_val_00; - double *basis_val_20; - double *basis_val_02; - double *basis_val_11; - double *basis_val_10; - double *basis_val_01; + uint64_t grid_expand_count; + uint64_t grid_expand_time; CoordPatch *patches; int nb_patches; - - // OpenCL / CLBLAS stuff - cl_context cl_ctx; - cl_command_queue cl_queue; - - cl_mem ocl_coeffs; } MaximalSlicingContext; int msa_maximal_solve(MaximalSlicingContext *ms); -#include -static inline int64_t gettime(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; -} - +#endif /* MAXIMAL_SLICING_AXI_H */ diff --git a/src/ms_solve.c b/src/ms_solve.c new file mode 100644 index 0000000..90502be --- /dev/null +++ b/src/ms_solve.c @@ -0,0 +1,697 @@ +/* + * Maximal slicing -- actual solver code + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "common.h" + +#include +#include +#include +#include +#include + +#if HAVE_OPENCL +#include +#include +#endif + +#include "cctk.h" +#include "cctk_Timers.h" +#include "util_Table.h" + +#include "basis.h" +#include "pssolve.h" +#include "ms_solve.h" + +#define NB_COEFFS(ms) (ms->nb_coeffs[0] * ms->nb_coeffs[1]) +#define NB_COLLOC_POINTS(ms) (ms->nb_colloc_points[0] * ms->nb_colloc_points[1]) + +/* indices (in our code, not cactus structs) of the grid functions which we'll need to + * interpolate on the pseudospectral grid */ +enum MetricVars { + GTXX = 0, + GTYY, + GTZZ, + GTXY, + GTXZ, + GTYZ, + PHI, + ATXX, + ATYY, + ATZZ, + ATXY, + ATXZ, + ATYZ, + K, + XTX, + XTY, + XTZ, + BETAX, + BETAY, + BETAZ, + NB_METRIC_VARS, +}; + +/* indices of the interpolated values of the above grid functions and their derivatives */ +enum InterpMetricVars { + I_GTXX = 0, + I_GTYY, + I_GTZZ, + I_GTXY, + I_GTXZ, + I_GTYZ, + I_PHI, + I_PHI_DX, + I_PHI_DY, + I_PHI_DZ, + I_ATXX, + I_ATYY, + I_ATZZ, + I_ATXY, + I_ATXZ, + I_ATYZ, + I_K, + I_XTX, + I_XTY, + I_XTZ, + I_BETAX, + I_BETAY, + I_BETAZ, + NB_INTERP_VARS, +}; + +struct MSSolverPriv { + PSSolveContext *ps_ctx; + cGH *gh; + + int colloc_grid_order[2]; + + double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB]; + double *rhs; + + double *coeff_scale; + + // interpolation parameters + int coord_system; + int interp_operator; + int interp_params; + + CCTK_REAL *interp_coords[3]; + + int interp_vars_indices[NB_METRIC_VARS]; + CCTK_REAL *interp_values[NB_INTERP_VARS]; + CCTK_INT interp_value_codes[NB_INTERP_VARS]; + +#if HAVE_OPENCL + // OpenCL / CLBLAS stuff + cl_context ocl_ctx; + cl_command_queue ocl_queue; +#endif + + uint64_t solve_count; + uint64_t solve_time; + + uint64_t interp_geometry_count; + uint64_t interp_geometry_time; + + uint64_t calc_eq_coeffs_count; + uint64_t calc_eq_coeffs_time; +}; + +/* mapping between our indices and thorn names */ +static const char *metric_vars[] = { +#if MS_CCZ4 + [GTXX] = "ML_CCZ4::gt11", + [GTYY] = "ML_CCZ4::gt22", + [GTZZ] = "ML_CCZ4::gt33", + [GTXY] = "ML_CCZ4::gt12", + [GTXZ] = "ML_CCZ4::gt13", + [GTYZ] = "ML_CCZ4::gt23", + [ATXX] = "ML_CCZ4::At11", + [ATYY] = "ML_CCZ4::At22", + [ATZZ] = "ML_CCZ4::At33", + [ATXY] = "ML_CCZ4::At12", + [ATXZ] = "ML_CCZ4::At13", + [ATYZ] = "ML_CCZ4::At23", + [PHI] = "ML_CCZ4::phi", + [K] = "ML_CCZ4::trK", + [XTX] = "ML_CCZ4::Xt1", + [XTY] = "ML_CCZ4::Xt2", + [XTZ] = "ML_CCZ4::Xt3", + [BETAX] = "ML_CCZ4::beta1", + [BETAY] = "ML_CCZ4::beta2", + [BETAZ] = "ML_CCZ4::beta3", +#else + [GTXX] = "ML_BSSN::gt11", + [GTYY] = "ML_BSSN::gt22", + [GTZZ] = "ML_BSSN::gt33", + [GTXY] = "ML_BSSN::gt12", + [GTXZ] = "ML_BSSN::gt13", + [GTYZ] = "ML_BSSN::gt23", + [ATXX] = "ML_BSSN::At11", + [ATYY] = "ML_BSSN::At22", + [ATZZ] = "ML_BSSN::At33", + [ATXY] = "ML_BSSN::At12", + [ATXZ] = "ML_BSSN::At13", + [ATYZ] = "ML_BSSN::At23", + [PHI] = "ML_BSSN::phi", + [K] = "ML_BSSN::trK", + [XTX] = "ML_BSSN::Xt1", + [XTY] = "ML_BSSN::Xt2", + [XTZ] = "ML_BSSN::Xt3", + [BETAX] = "ML_BSSN::beta1", + [BETAY] = "ML_BSSN::beta2", + [BETAZ] = "ML_BSSN::beta3", +#endif +}; + +/* mapping between the cactus grid values and interpolated values */ +static const CCTK_INT interp_operation_indices[] = { + [I_GTXX] = GTXX, + [I_GTYY] = GTYY, + [I_GTZZ] = GTZZ, + [I_GTXY] = GTXY, + [I_GTXZ] = GTXZ, + [I_GTYZ] = GTYZ, + [I_PHI] = PHI, + [I_PHI_DX] = PHI, + [I_PHI_DY] = PHI, + [I_PHI_DZ] = PHI, + [I_ATXX] = ATXX, + [I_ATYY] = ATYY, + [I_ATZZ] = ATZZ, + [I_ATXY] = ATXY, + [I_ATXZ] = ATXZ, + [I_ATYZ] = ATYZ, + [I_K] = K, + [I_XTX] = XTX, + [I_XTY] = XTY, + [I_XTZ] = XTZ, + [I_BETAX] = BETAX, + [I_BETAY] = BETAY, + [I_BETAZ] = BETAZ, +}; + +/* the operation (plain value or x/y/z-derivative) to apply during interpolation */ +static const CCTK_INT interp_operation_codes[] = { + [I_GTXX] = 0, + [I_GTYY] = 0, + [I_GTZZ] = 0, + [I_GTXY] = 0, + [I_GTXZ] = 0, + [I_GTYZ] = 0, + [I_PHI] = 0, + [I_PHI_DX] = 1, + [I_PHI_DY] = 2, + [I_PHI_DZ] = 3, + [I_ATXX] = 0, + [I_ATYY] = 0, + [I_ATZZ] = 0, + [I_ATXY] = 0, + [I_ATXZ] = 0, + [I_ATYZ] = 0, + [I_K] = 0, + [I_XTX] = 0, + [I_XTY] = 0, + [I_XTZ] = 0, + [I_BETAX] = 0, + [I_BETAY] = 0, + [I_BETAZ] = 0, +}; + +/* interpolate the cactus gridfunctions onto the pseudospectral grid */ +static int interp_geometry(MSSolver *ctx) +{ + MSSolverPriv *s = ctx->priv; + int ret; + + ret = CCTK_InterpGridArrays(s->gh, 3, s->interp_operator, s->interp_params, + s->coord_system, NB_COLLOC_POINTS(ctx), CCTK_VARIABLE_REAL, + (const void * const *)s->interp_coords, ARRAY_ELEMS(s->interp_vars_indices), s->interp_vars_indices, + ARRAY_ELEMS(s->interp_values), s->interp_value_codes, (void * const *)s->interp_values); + if (ret < 0) + CCTK_WARN(0, "Error interpolating"); + + return 0; +} + +/* evaluate the equation coefficients at the collocation points */ +static int calc_eq_coeffs(MSSolver *ctx) +{ + MSSolverPriv *s = ctx->priv; + +#pragma omp parallel for + for (int i = 0; i < NB_COLLOC_POINTS(ctx); i++) { + CCTK_REAL Am[3][3], gtu[3][3]; + CCTK_REAL a2; + + CCTK_REAL gtxx = s->interp_values[I_GTXX][i]; + CCTK_REAL gtyy = s->interp_values[I_GTYY][i]; + CCTK_REAL gtzz = s->interp_values[I_GTZZ][i]; + CCTK_REAL gtxy = s->interp_values[I_GTXY][i]; + CCTK_REAL gtxz = s->interp_values[I_GTXZ][i]; + CCTK_REAL gtyz = s->interp_values[I_GTYZ][i]; + + CCTK_REAL Atxx = s->interp_values[I_ATXX][i]; + CCTK_REAL Atyy = s->interp_values[I_ATYY][i]; + CCTK_REAL Atzz = s->interp_values[I_ATZZ][i]; + CCTK_REAL Atxy = s->interp_values[I_ATXY][i]; + CCTK_REAL Atxz = s->interp_values[I_ATXZ][i]; + CCTK_REAL Atyz = s->interp_values[I_ATYZ][i]; + + CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz }, + { Atxy, Atyy, Atyz }, + { Atxz, Atyz, Atzz }}; + + CCTK_REAL trK = s->interp_values[I_K][i]; + + CCTK_REAL Xtx = s->interp_values[I_XTX][i]; + CCTK_REAL Xtz = s->interp_values[I_XTZ][i]; + + CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); + + // \tilde{γ}^{ij} + gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; + gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; + gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; + gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; + gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; + gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; + gtu[1][0] = gtu[0][1]; + gtu[2][0] = gtu[0][2]; + gtu[2][1] = gtu[1][2]; + + // \tilde{A}_{i}^j + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += gtu[j][l] * At[l][k]; + Am[j][k] = val; + } + + // K_{ij} K^{ij} + a2 = 0.0; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + a2 += Am[j][k] * Am[k][j]; + + { + double x = s->interp_coords[0][i]; + double z = s->interp_coords[2][i]; + + const double gtuxx = gtu[0][0]; + const double gtuyy = gtu[1][1]; + const double gtuzz = gtu[2][2]; + const double gtuxz = gtu[0][2]; + + const double phi = s->interp_values[I_PHI][i]; + const double phi_dx = s->interp_values[I_PHI_DX][i]; + const double phi_dz = s->interp_values[I_PHI_DZ][i]; + + const double Xtx = s->interp_values[I_XTX][i]; + const double Xtz = s->interp_values[I_XTZ][i]; + + const double k2 = a2 + SQR(trK) / 3.; + + const double betax = s->interp_values[I_BETAX][i]; + const double betaz = s->interp_values[I_BETAZ][i]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + s->eq_coeffs[PSSOLVE_DIFF_ORDER_20][i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0)); + s->eq_coeffs[PSSOLVE_DIFF_ORDER_02][i] = SQR(phi) * gtuzz; + s->eq_coeffs[PSSOLVE_DIFF_ORDER_11][i] = SQR(phi) * gtuxz * 2; + s->eq_coeffs[PSSOLVE_DIFF_ORDER_10][i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0); + s->eq_coeffs[PSSOLVE_DIFF_ORDER_01][i] = -Xz; + s->eq_coeffs[PSSOLVE_DIFF_ORDER_00][i] = -k2; + s->rhs[i] = k2 + trK; + } + } + + return 0; +} + +int ms_solver_solve(MSSolver *ctx) +{ + MSSolverPriv *s = ctx->priv; + int ret; + int64_t start, totaltime_start; + + totaltime_start = gettime(); + + /* interpolate the metric values and construct the quantities we'll need */ + CCTK_TimerStart("MaximalSlicing_interp_geometry"); + start = gettime(); + + ret = interp_geometry(ctx); + + s->interp_geometry_time += gettime() - start; + s->interp_geometry_count++; + CCTK_TimerStop("MaximalSlicing_interp_geometry"); + if (ret < 0) + return ret; + + CCTK_TimerStart("MaximalSlicing_calc_eq_coeffs"); + start = gettime(); + + ret = calc_eq_coeffs(ctx); + + s->calc_eq_coeffs_time += gettime() - start; + s->calc_eq_coeffs_count++; + CCTK_TimerStop("MaximalSlicing_calc_eq_coeffs"); + if (ret < 0) + return ret; + + ret = ms_pssolve_solve(s->ps_ctx, (const double * const *)s->eq_coeffs, + s->rhs, ctx->coeffs); + if (ret < 0) + return ret; + + for (int i = 0; i < NB_COEFFS(ctx); i++) + ctx->coeffs[i] *= s->coeff_scale[i]; + + s->solve_count++; + s->solve_time += gettime() - totaltime_start; + + return 0; +} + +void ms_solver_print_stats(MSSolver *ctx) +{ + MSSolverPriv *s = ctx->priv; + + fprintf(stderr, + "%g%% interpolate geometry: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)s->interp_geometry_time * 100 / s->solve_time, + s->interp_geometry_count, (double)s->interp_geometry_time / 1e6, + (double)s->interp_geometry_time / s->interp_geometry_count / 1e3); + fprintf(stderr, + "%g%% calc equation coefficients: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)s->calc_eq_coeffs_time * 100 / s->solve_time, + s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6, + (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3); + fprintf(stderr, + "%g%% pseudospectral matrix construction: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time, + s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6, + (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3); + fprintf(stderr, + "%g%% BiCGSTAB %lu solves, " + "%lu iterations, total time %g s, " + "avg iterations per solve %g, avg time per solve %g ms, " + "avg time per iteration %g ms\n", + (double)s->ps_ctx->cg_time_total * 100 / s->solve_time, + s->ps_ctx->cg_solve_count, s->ps_ctx->cg_iter_count, (double)s->ps_ctx->cg_time_total / 1e6, + (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count, + (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3, + (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3); + fprintf(stderr, + "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n", + (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time, + s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6, + (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3); +} + +static void init_opencl(MSSolver *ctx) +#if HAVE_OPENCL +{ + MSSolverPriv *s = ctx->priv; + int err, count; + cl_platform_id platform; + cl_context_properties props[3]; + cl_device_id ocl_device; + + err = clGetPlatformIDs(1, &platform, &count); + if (err != CL_SUCCESS || count < 1) { + fprintf(stderr, "Could not get an OpenCL platform ID\n"); + return; + } + + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count); + if (err != CL_SUCCESS || count < 1) { + fprintf(stderr, "Could not get an OpenCL device ID\n"); + return; + } + + props[0] = CL_CONTEXT_PLATFORM; + props[1] = (cl_context_properties)platform; + props[2] = 0; + + s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err); + if (err != CL_SUCCESS || !s->ocl_ctx) { + fprintf(stderr, "Could not create an OpenCL context\n"); + return; + } + + s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err); + if (err != CL_SUCCESS || !s->ocl_queue) { + fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err); + goto fail; + } + + err = clblasSetup(); + if (err != CL_SUCCESS) { + fprintf(stderr, "Error setting up clBLAS\n"); + goto fail; + } + + return; +fail: + if (s->ocl_queue) + clReleaseCommandQueue(s->ocl_queue); + s->ocl_queue = 0; + + if (s->ocl_ctx) + clReleaseContext(s->ocl_ctx); + s->ocl_ctx = 0; +} +#else +{ +} +#endif + +int ms_solver_init(MSSolver **pctx, + cGH *cctkGH, + int basis_order_r, int basis_order_z, + double sf, double filter_power, double input_filter_power) +{ + MSSolver *ctx; + MSSolverPriv *s; + int ret; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + s = ctx->priv; + + s->gh = cctkGH; + + ctx->basis[0] = &ms_sb_even_basis; +#if MS_POLAR + ctx->basis[1] = &ms_cos_even_basis; +#else + ctx->basis[1] = &ms_sb_even_basis; +#endif + + ctx->nb_coeffs[0] = basis_order_r; + ctx->nb_coeffs[1] = basis_order_z; + + ctx->nb_colloc_points[0] = basis_order_r; + ctx->nb_colloc_points[1] = basis_order_z; + + if (NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx)) + CCTK_WARN(0, "Non-square collocation matrix"); + + s->colloc_grid_order[0] = ctx->nb_colloc_points[0]; + s->colloc_grid_order[1] = ctx->nb_colloc_points[1]; + + ret = posix_memalign((void**)&ctx->coeffs, 32, sizeof(*ctx->coeffs) * NB_COEFFS(ctx)); + ret |= posix_memalign((void**)&s->rhs, 32, sizeof(*s->rhs) * NB_COLLOC_POINTS(ctx)); + if (ret) + goto fail; + + //FIXME + scale_factor = 1.0; + scale_factor = (64.0 / ctx->basis[0]->colloc_point(s->colloc_grid_order[0], ctx->nb_colloc_points[0] - 1)); + fprintf(stderr, "scale factor %16.16g\n", scale_factor); + + init_opencl(ctx); + + ret = ms_pssolve_context_alloc(&s->ps_ctx); + if (ret < 0) + CCTK_WARN(0, "Error allocating the pseudospectral solver"); + + s->ps_ctx->basis[0] = ctx->basis[0]; + s->ps_ctx->basis[1] = ctx->basis[1]; + s->ps_ctx->solve_order[0] = basis_order_r; + s->ps_ctx->solve_order[1] = basis_order_z; +#if HAVE_OPENCL + s->ps_ctx->ocl_ctx = s->ocl_ctx; + s->ps_ctx->ocl_queue = s->ocl_queue; +#endif + + ret = ms_pssolve_context_init(s->ps_ctx); + if (ret < 0) + CCTK_WARN(0, "Error initializing the pseudospectral solver"); + + for (int i = 0; i < MAX(s->ps_ctx->solve_order[0], s->ps_ctx->solve_order[1]); i++) { + fprintf(stderr, "%d ", i); + if (i < s->ps_ctx->solve_order[0]) + fprintf(stderr, "%g\t", s->ps_ctx->colloc_grid[0][i]); + else + fprintf(stderr, "\t\t"); + if (i < s->ps_ctx->solve_order[1]) + fprintf(stderr, "%g\t", s->ps_ctx->colloc_grid[1][i]); + fprintf(stderr, "\n"); + } + + for (int i = 0; i < ARRAY_ELEMS(s->eq_coeffs); i++) { + ret = posix_memalign((void**)&s->eq_coeffs[i], 32, + NB_COLLOC_POINTS(ctx) * sizeof(*s->eq_coeffs[i])); + if (ret) + goto fail; + } + + for (int i = 0; i < ARRAY_ELEMS(s->interp_coords); i++) { + ret |= posix_memalign((void**)&s->interp_coords[i], 32, + NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_coords[i])); + } + if (ret) + goto fail; + + for (int i = 0; i < ctx->nb_colloc_points[1]; i++) { + for (int j = 0; j < ctx->nb_colloc_points[0]; j++) { +#if MS_POLAR + double phi = s->ps_ctx->colloc_grid[1][i]; + double r = s->ps_ctx->colloc_grid[0][j]; + + double x = r * cos(phi); + double z = r * sin(phi); +#else + double x = s->ps_ctx->colloc_grid[0][j]; + double z = s->ps_ctx->colloc_grid[1][i]; +#endif + + s->interp_coords[0][i * ctx->nb_colloc_points[0] + j] = x; + s->interp_coords[1][i * ctx->nb_colloc_points[0] + j] = 0; + s->interp_coords[2][i * ctx->nb_colloc_points[0] + j] = z; + } + } + + ret = posix_memalign((void**)&s->coeff_scale, 32, NB_COEFFS(ctx) * sizeof(*s->coeff_scale)); + if (ret) + goto fail; + for (int j = 0; j < ctx->nb_coeffs[1]; j++) + for (int i = 0; i < ctx->nb_coeffs[0]; i++) { + s->coeff_scale[j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) * + exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power)); + } + + for (int i = 0; i < ARRAY_ELEMS(s->interp_values); i++) { + ret = posix_memalign((void**)&s->interp_values[i], 32, + NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_values[i])); + if (ret) + goto fail; + s->interp_value_codes[i] = CCTK_VARIABLE_REAL; + } + + for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) { + s->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]); + if (s->interp_vars_indices[i] < 0) + CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]); + } + + s->coord_system = CCTK_CoordSystemHandle("cart3d"); + if (s->coord_system < 0) + CCTK_WARN(0, "Error getting the coordinate system"); + + s->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)"); + if (s->interp_operator < 0) + CCTK_WARN(0, "Error getting the interpolation operator"); + + s->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1"); + if (s->interp_params < 0) + CCTK_WARN(0, "Error creating interpolation parameters table"); + + ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS, + interp_operation_codes, "operation_codes"); + if (ret < 0) + CCTK_WARN(0, "Error setting operation codes"); + + ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS, + interp_operation_indices, "operand_indices"); + if (ret < 0) + CCTK_WARN(0, "Error setting operand indices"); + + CCTK_TimerCreate("MaximalSlicing_Solve"); + CCTK_TimerCreate("MaximalSlicing_Expand"); + CCTK_TimerCreate("MaximalSlicing_interp_geometry"); + CCTK_TimerCreate("MaximalSlicing_calc_eq_coeffs"); + CCTK_TimerCreate("MaximalSlicing_construct_matrix"); + CCTK_TimerCreate("MaximalSlicing_solve_LU"); + CCTK_TimerCreate("MaximalSlicing_solve_BiCGSTAB"); + + *pctx = ctx; + return 0; +fail: + ms_solver_free(&ctx); + return -ENOMEM; +} + +void ms_solver_free(MSSolver **pctx) +{ + MSSolver *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_coords); i++) + free(ctx->priv->interp_coords[i]); + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_values); i++) + free(ctx->priv->interp_values[i]); + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->eq_coeffs); i++) + free(ctx->priv->eq_coeffs[i]); + free(ctx->priv->rhs); + free(ctx->priv->coeff_scale); + + ms_pssolve_context_free(&ctx->priv->ps_ctx); + +#if HAVE_OPENCL + if (ctx->priv->ocl_queue) + clReleaseCommandQueue(ctx->priv->ocl_queue); + if (ctx->priv->ocl_ctx) + clReleaseContext(ctx->priv->ocl_ctx); +#endif + } + + free(ctx->priv); + + free(ctx->coeffs); + + free(ctx); + *pctx = NULL; +} diff --git a/src/ms_solve.h b/src/ms_solve.h new file mode 100644 index 0000000..2ebf1e1 --- /dev/null +++ b/src/ms_solve.h @@ -0,0 +1,52 @@ +/* + * Maximal slicing -- actual solver code + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef MS_SOLVE_H +#define MS_SOLVE_H + +#include "common.h" + +#include "cctk.h" + +#include "basis.h" + +typedef struct MSSolverPriv MSSolverPriv; + +typedef struct MSSolver { + MSSolverPriv *priv; + + const BasisSet *basis[2]; + + int nb_coeffs[2]; + int nb_colloc_points[2]; + + double *coeffs; +} MSSolver; + +int ms_solver_init(MSSolver **ctx, + cGH *cctkGH, + int basis_order_r, int basis_order_z, + double sf, double filter_power, double input_filter_power); + +void ms_solver_free(MSSolver **ctx); + +int ms_solver_solve(MSSolver *ctx); + +void ms_solver_print_stats(MSSolver *ctx); + +#endif /* MS_SOLVE_H */ diff --git a/src/pssolve.c b/src/pssolve.c new file mode 100644 index 0000000..9d6124c --- /dev/null +++ b/src/pssolve.c @@ -0,0 +1,389 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bicgstab.h" +#include "pssolve.h" + +#define NB_COEFFS(priv) (priv->nb_coeffs[0] * priv->nb_coeffs[1]) +#define NB_COLLOC_POINTS(priv) (priv->nb_colloc_points[0] * priv->nb_colloc_points[1]) + +struct PSSolvePriv { + BiCGStabContext *bicgstab; + int steps_since_inverse; + + int nb_coeffs[2]; + int nb_colloc_points[2]; + int colloc_grid_order[2]; + + double *basis_val[PSSOLVE_DIFF_ORDER_NB]; + + int *ipiv; + double *mat; +}; + +static int construct_matrix(PSSolveContext *ctx, + const double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB]) +{ + double *mat = ctx->priv->mat; + int idx_coeff, idx_grid; + +#pragma omp parallel for simd + for (idx_coeff = 0; idx_coeff < NB_COEFFS(ctx->priv); idx_coeff++) + for (idx_grid = 0; idx_grid < NB_COLLOC_POINTS(ctx->priv); idx_grid++) { + const int idx = idx_grid + NB_COLLOC_POINTS(ctx->priv) * idx_coeff; + double val = 0.0; + + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++) + val += eq_coeffs[i][idx_grid] * ctx->priv->basis_val[i][idx]; + + mat[idx] = val; + } + + return 0; +} + +static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) +{ + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; +#if 0 + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + //{ + // double *mat_copy = malloc(SQR(N) * sizeof(double)); + // double *svd = malloc(N * sizeof(double)); + // double *rhs_copy = malloc(N * sizeof(double)); + // int rank; + + // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + // memcpy(rhs_copy, rhs, N * sizeof(double)); + + // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + // svd, 1e-13, &rank); + + // free(mat_copy); + // for (int i = 0; i < N; i++) { + // if (i > 5 && i < N - 5) + // continue; + + // fprintf(stderr, "%g\t", svd[i]); + // } + // fprintf(stderr, "\n rank %d\n", rank); + // free(svd); + // free(rhs_copy); + + // if (rank < N) + // ret = 1; + //} + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; +} + +int ms_pssolve_solve(PSSolveContext *ctx, + const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs) +{ + PSSolvePriv *s = ctx->priv; + const int N = NB_COEFFS(s); + double rhs_max; + int64_t start; + + int ret = 0; + + /* fill the matrix */ + CCTK_TimerStart("QuasiMaximalSlicing_construct_matrix"); + start = gettime(); + ret = construct_matrix(ctx, eq_coeffs); + ctx->construct_matrix_time += gettime() - start; + ctx->construct_matrix_count++; + CCTK_TimerStop("QuasiMaximalSlicing_construct_matrix"); + if (ret < 0) + return ret; + +#if 0 + if (rhs_max < EPS) { + fprintf(stderr, "zero rhs\n"); + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } +#endif + + /* solve for the coeffs */ + if (s->steps_since_inverse < 1024) { + int64_t start; + + start = gettime(); + + CCTK_TimerStart("QuasiMaximalSlicing_solve_BiCGSTAB"); + ret = ms_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs); + CCTK_TimerStop("QuasiMaximalSlicing_solve_BiCGSTAB"); + + if (ret >= 0) { + ctx->cg_time_total += gettime() - start; + ctx->cg_solve_count++; + ctx->cg_iter_count += ret + 1; + s->steps_since_inverse++; + + } + } else + ret = -1; + + if (ret < 0) { + int64_t start; + + CCTK_TimerStart("QuasiMaximalSlicing_solve_LU"); + start = gettime(); + + memcpy(coeffs, rhs, N * sizeof(*rhs)); + + ret = lu_invert(N, s->mat, coeffs, s->ipiv); + ctx->lu_solves_time += gettime() - start; + ctx->lu_solves_count++; + CCTK_TimerStop("QuasiMaximalSlicing_solve_LU"); + + ret = ms_bicgstab_init(s->bicgstab, s->mat, coeffs); + + s->steps_since_inverse = 0; + } + + return ret; +} + +int ms_pssolve_context_init(PSSolveContext *ctx) +{ + PSSolvePriv *s = ctx->priv; + double *basis_val[2][3] = { { NULL } }; + + int ret = 0; + + if (ctx->solve_order[0] <= 0 || ctx->solve_order[1] <= 0) + return -EINVAL; + s->nb_coeffs[0] = ctx->solve_order[0]; + s->nb_coeffs[1] = ctx->solve_order[1]; + s->nb_colloc_points[0] = ctx->solve_order[0]; + s->nb_colloc_points[1] = ctx->solve_order[1]; + s->colloc_grid_order[0] = ctx->solve_order[0]; + s->colloc_grid_order[1] = ctx->solve_order[1]; + + s->steps_since_inverse = INT_MAX; + + /* init the BiCGStab solver */ + ret = ms_bicgstab_context_alloc(&s->bicgstab, NB_COEFFS(s), ctx->ocl_ctx, + ctx->ocl_queue); + if (ret < 0) + return ret; + + /* compute the collocation grid */ + posix_memalign((void**)&ctx->colloc_grid[0], 32, s->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[0])); + posix_memalign((void**)&ctx->colloc_grid[1], 32, s->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[1])); + if (!ctx->colloc_grid[0] || !ctx->colloc_grid[1]) + return -ENOMEM; + + for (int i = 0; i < s->nb_colloc_points[0]; i++) + ctx->colloc_grid[0][i] = ctx->basis[0]->colloc_point(s->colloc_grid_order[0], i); + for (int i = 0; i < s->nb_colloc_points[1]; i++) + ctx->colloc_grid[1][i] = ctx->basis[1]->colloc_point(s->colloc_grid_order[1], i); + + /* precompute the basis values we will need */ + for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) { + for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) { + int ret = posix_memalign((void**)&basis_val[i][j], 32, + sizeof(*basis_val[i][j]) * s->nb_coeffs[i] * s->nb_colloc_points[i]); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int j = 0; j < s->nb_colloc_points[i]; j++) { + double coord = ctx->colloc_grid[i][j]; + for (int k = 0; k < s->nb_coeffs[i]; k++) { + basis_val[i][0][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval (coord, k); + basis_val[i][1][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff1(coord, k); + basis_val[i][2][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff2(coord, k); + } + } + } + + for (int i = 0; i < ARRAY_ELEMS(s->basis_val); i++) { + ret = posix_memalign((void**)&s->basis_val[i], 32, NB_COLLOC_POINTS(s) * NB_COEFFS(s) * sizeof(*s->basis_val[i])); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int i = 0; i < s->nb_colloc_points[1]; i++) { + const double *basis_z = basis_val[1][0] + i * s->nb_coeffs[1]; + const double *dbasis_z = basis_val[1][1] + i * s->nb_coeffs[1]; + const double *d2basis_z = basis_val[1][2] + i * s->nb_coeffs[1]; + + for (int j = 0; j < s->nb_colloc_points[0]; j++) { + const double *basis_x = basis_val[0][0] + j * s->nb_coeffs[0]; + const double *dbasis_x = basis_val[0][1] + j * s->nb_coeffs[0]; + const double *d2basis_x = basis_val[0][2] + j * s->nb_coeffs[0]; + const int idx_grid = i * s->nb_colloc_points[0] + j; + +#if MS_POLAR + double r = ctx->colloc_grid[0][j]; + double theta = ctx->colloc_grid[1][i]; + + double x = r * cos(theta); + double z = r * sin(theta); +#else + double x = ctx->colloc_grid[0][j]; + double z = ctx->colloc_grid[1][i]; +#endif + + for (int k = 0; k < s->nb_coeffs[1]; k++) + for (int l = 0; l < s->nb_coeffs[0]; l++) { + const int idx_coeff = k * s->nb_coeffs[0] + l; + const int idx = idx_grid + NB_COLLOC_POINTS(s) * idx_coeff; + s->basis_val[PSSOLVE_DIFF_ORDER_00][idx] = basis_x[l] * basis_z[k]; +#if MS_POLAR + s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k] + + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k] + + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k] + - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] = ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k] + + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k] + - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k] + + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] = ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k] + - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k] + - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k] + + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); +#else + s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] = dbasis_x[l] * basis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] = basis_x[l] * dbasis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = d2basis_x[l] * basis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] = basis_x[l] * d2basis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] = dbasis_x[l] * dbasis_z[k]; +#endif + } + } + } + + ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * NB_COEFFS(s)); + ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * NB_COEFFS(s) * NB_COLLOC_POINTS(s)); + if (ret) { + ret = -ENOMEM; + goto fail; + } + +fail: + for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) + for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) + free(basis_val[i][j]); + + return ret; +} + +int ms_pssolve_context_alloc(PSSolveContext **pctx) +{ + PSSolveContext *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) + return -ENOMEM; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + + *pctx = ctx; + return 0; +fail: + ms_pssolve_context_free(&ctx); + return -ENOMEM; +} + +void ms_pssolve_context_free(PSSolveContext **pctx) +{ + PSSolveContext *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++) + free(ctx->priv->basis_val[i]); + + free(ctx->priv->ipiv); + free(ctx->priv->mat); + + ms_bicgstab_context_free(&ctx->priv->bicgstab); + } + + free(ctx->priv); + + free(ctx->colloc_grid[0]); + free(ctx->colloc_grid[1]); + + free(ctx); + *pctx = NULL; +} diff --git a/src/pssolve.h b/src/pssolve.h new file mode 100644 index 0000000..544988b --- /dev/null +++ b/src/pssolve.h @@ -0,0 +1,116 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef MS_PSSOLVE_H +#define MS_PSSOLVE_H + +#include "common.h" + +#if HAVE_OPENCL +#include +#else +typedef void* cl_context; +typedef void* cl_command_queue; +#endif + +#include + +#include "basis.h" + +enum PSSolveDiffOrder { + PSSOLVE_DIFF_ORDER_00, + PSSOLVE_DIFF_ORDER_10, + PSSOLVE_DIFF_ORDER_01, + PSSOLVE_DIFF_ORDER_11, + PSSOLVE_DIFF_ORDER_20, + PSSOLVE_DIFF_ORDER_02, + PSSOLVE_DIFF_ORDER_NB, +}; + +typedef struct PSSolvePriv PSSolvePriv; + +typedef struct PSSolveContext { + /** + * Solver private data, not to be touched by the caller. + */ + PSSolvePriv *priv; + + /** + * The basis sets to be used in each direction. + * Set by the caller before ms_pssolve_context_init(). + */ + const BasisSet *basis[2]; + + /** + * Order of the solver in each direction. + * Set by the caller before ms_pssolve_context_init(). + */ + int solve_order[2]; + + /** + * Locations of the collocation points in each direction. The equation + * coefficients passed to ms_pssolve_solve() should be evaluated at those + * grid positions. + * + * Set by the solver after ms_pssolve_context_init(). Each array is + * solve_order[i]-sized. + */ + double *colloc_grid[2]; + + cl_context ocl_ctx; + cl_command_queue ocl_queue; + + uint64_t lu_solves_count; + uint64_t lu_solves_time; + + uint64_t cg_solve_count; + uint64_t cg_iter_count; + uint64_t cg_time_total; + + uint64_t construct_matrix_count; + uint64_t construct_matrix_time; +} PSSolveContext; + +/** + * Allocate a new solver. + */ +int ms_pssolve_context_alloc(PSSolveContext **ctx); + +/** + * Initialize the solver for use after all the context options have been set. + */ +int ms_pssolve_context_init(PSSolveContext *ctx); + +/** + * Free the solver and all its internal state. + */ +void ms_pssolve_context_free(PSSolveContext **ctx); + +/** + * Solve a second order linear PDE in 2D with a pseudospectral method. + * + * @param eq_coeffs the coefficients of each derivative term at the collocation + * points. + * @param rhs the right-hand side of the equation at the collocation points. + * @param coeffs the spectral coefficients of the solution will be written here. + */ +int ms_pssolve_solve(PSSolveContext *ctx, + const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs); + +#endif /* MS_PSSOLVE_H */ diff --git a/src/solve.c b/src/solve.c index 610dc0d..a1bee5e 100644 --- a/src/solve.c +++ b/src/solve.c @@ -1,4 +1,5 @@ +#include #include #include #include @@ -11,7 +12,7 @@ #include "maximal_slicing_axi.h" -static int construct_matrix(MaximalSlicingContext *ms, double *mat, +static int construct_matrix_cylindric(MaximalSlicingContext *ms, double *mat, double *rhs, double *prhs_max) { int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; @@ -29,7 +30,7 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, #pragma omp parallel for reduction(max : rhs_max) for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) { for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) { - CCTK_REAL x_physical = ms->grid_x[idx_grid_x]; + CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; const double gtuxx = ms->metric_u[0][idx_grid]; @@ -56,47 +57,48 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); + //const double tr_ric = ms->interp_values[I_TR_R][idx_grid]; + + const double coeff_20 = SQR(phi) * (gtuxx + ((x_physical <= EPS) ? gtuyy : 0.0)); const double coeff_02 = SQR(phi) * gtuzz; const double coeff_11 = SQR(phi) * gtuxz * 2; - const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical; + const double coeff_10 = -Xx + ((x_physical > EPS) ? SQR(phi) * gtuyy / x_physical : 0.0); const double coeff_01 = -Xz; + const double coeff_00 = -k2; + //const double coeff_00 = -(tr_ric + SQR(trk)); #if 1 for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; - //double d2alpha = gtuxx * D2BASIS_X * BASIS_Z - // + gtuzz * BASIS_X * D2BASIS_Z - // + 2 * gtuxz * DBASIS_X * DBASIS_Z; - //if (x_physical > EPS) - // d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; - //else - // d2alpha += gtuyy * D2BASIS_X * BASIS_Z; +#if 0 + double d2alpha = gtuxx * D2BASIS_X * BASIS_Z + + gtuzz * BASIS_X * D2BASIS_Z + + 2 * gtuxz * DBASIS_X * DBASIS_Z; + if (x_physical > EPS) + d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical; + else + d2alpha += gtuyy * D2BASIS_X * BASIS_Z; - //double curv_term = Xx * DBASIS_X * BASIS_Z + Xz * BASIS_X * DBASIS_Z; + double curv_term = (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi) * DBASIS_X * BASIS_Z + + (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi) * BASIS_X * DBASIS_Z; - //double D2alpha = SQR(phi) * d2alpha - curv_term; + double D2alpha = SQR(phi) * (d2alpha - curv_term); - //mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2; +#else mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + coeff_10 * DBASIS_X * BASIS_Z + coeff_01 * BASIS_X * DBASIS_Z + coeff_11 * DBASIS_X * DBASIS_Z + coeff_20 * D2BASIS_X * BASIS_Z + coeff_02 * BASIS_X * D2BASIS_Z; +#endif } #else - - const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy); - const double coeff_02 = SQR(phi) * gtuzz; - const double coeff_11 = SQR(phi) * gtuxz * 2; - const double coeff_10 = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi + (x_physical > EPS) * gtuyy); - const double coeff_01 = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); - const double coeff_00 = -k2; cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); @@ -105,8 +107,11 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points); #endif - rhs[idx_grid] = k2 + trk ;// betax * trk_dx + betaz * trk_dz; - //rhs[idx_grid] = k2; + rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz; + //rhs[idx_grid] = 0.0; + + //rhs[idx_grid] = tr_ric + SQR(trk) + betax * trk_dx + betaz * trk_dz; + rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); //rhs_max = fabs(rhs[idx_grid]); } @@ -123,8 +128,220 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat, return 0; } +static int construct_matrix_polar(MaximalSlicingContext *ms, double *mat, + double *rhs, double *prhs_max) +{ + int idx_coeff, idx_grid; + +#pragma omp parallel for + for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++) + for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) { + const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; + + mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] + + ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] + + ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] + + ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] + + ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] + + ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx]; + } + + //*prhs_max = ms->rhs[cblas_idamax(ms->nb_colloc_points, ms->rhs, 1)]; + + return 0; +} + +static int construct_matrix_boundary(MaximalSlicingContext *ms, double *mat, + double *rhs, double *prhs_max) +{ + int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; + double rhs_max = 0.0; + +#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) +#define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) +#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) + +#pragma omp parallel for + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + const double gtuxx = ms->metric_u[0][idx_grid]; + const double gtuyy = ms->metric_u[1][idx_grid]; + const double gtuzz = ms->metric_u[2][idx_grid]; + const double gtuxz = ms->metric_u[4][idx_grid]; + + const double phi = ms->interp_values[I_PHI][idx_grid]; + const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid]; + const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid]; + + const double Xtx = ms->interp_values[I_XTX][idx_grid]; + const double Xtz = ms->interp_values[I_XTZ][idx_grid]; + + const double k2 = ms->kij_kij[idx_grid]; + const double trk = ms->interp_values[I_K][idx_grid]; + + const double trk_dx = ms->interp_values[I_K_DX][idx_grid]; + const double trk_dz = ms->interp_values[I_K_DZ][idx_grid]; + + const double betax = ms->interp_values[I_BETAX][idx_grid]; + const double betaz = ms->interp_values[I_BETAZ][idx_grid]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + const double coeff_20 = SQR(phi) * (gtuxx + ((fabs(x_physical) <= EPS) ? gtuyy : 0)); + const double coeff_02 = SQR(phi) * gtuzz; + const double coeff_11 = SQR(phi) * gtuxz * 2; + const double coeff_10 = -Xx + ((fabs(x_physical) > EPS) ? (SQR(phi) * gtuyy / x_physical) : 0); + const double coeff_01 = -Xz; + + const double coeff_00 = -k2; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z + + coeff_10 * DBASIS_X * BASIS_Z + + coeff_01 * BASIS_X * DBASIS_Z + + coeff_11 * DBASIS_X * DBASIS_Z + + coeff_20 * D2BASIS_X * BASIS_Z + + coeff_02 * BASIS_X * D2BASIS_Z; + } + + rhs[idx_grid] = 0.0; + //rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz; + + //rhs_max = MAX(rhs_max, fabs(rhs[idx_grid])); + //rhs_max = fabs(rhs[idx_grid]); + } + } + +#if 1 + // z = 0 axis + idx_grid_z = 0; + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + int idx_grid1 = (ms->nb_colloc_points_z - 2) * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = BASIS_X * DBASIS_Z; + } + rhs[idx_grid1] = 0.0; + } + + // x = 0 axis + idx_grid_x = 0; + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + int idx_grid1 = idx_grid_z * ms->nb_colloc_points_x + ms->nb_colloc_points_x - 2; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z; + } + rhs[idx_grid1] = 0.0; + } + + // z = z_max axis + idx_grid_z = ms->nb_colloc_points_z - 1; + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z]; + + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z; //+ z_physical * BASIS_X * DBASIS_Z; + } + rhs[idx_grid] = 1.0; + } + + // x = x_max axis + idx_grid_x = ms->nb_colloc_points_x - 1; + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; + + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;// + x_physical * DBASIS_X * BASIS_Z; + } + rhs[idx_grid] = 1.0; + } + + //idx_grid_x = 0; + //idx_grid_z = 0; + //{ + // int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + // for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + // for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + // const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + // mat[idx_grid + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z + BASIS_X * DBASIS_Z; + // } + // rhs[idx_grid] = 0.0; + //} + + //idx_grid_x = ms->nb_colloc_points_x - 1; + //idx_grid_z = ms->nb_colloc_points_z - 1; + //{ + // CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z]; + // CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x]; + // CCTK_REAL r2 = SQR(z_physical) + SQR(x_physical); + // int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + // for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + // for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + // const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + // mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z + 0.5 * r2 / z_physical * BASIS_X * DBASIS_Z + 0.5 * r2 / x_physical * DBASIS_X * BASIS_Z; + // } + // rhs[idx_grid] = 1.0; + //} +#endif + + *prhs_max = rhs_max; + +#if 0 + fprintf(stderr, "matrix\n"); + for (int i = 0; i < ms->nb_colloc_points; i++) { + for (int j = 0; j < ms->nb_coeffs; j++) + fprintf(stderr, "%+08.04g ", mat[i + ms->nb_colloc_points * j]); + fprintf(stderr, "\n"); + } + + fprintf(stderr, "rhs\n"); + for (int i = 0; i < ms->nb_colloc_points; i++) + fprintf(stderr, "%+08.04g ", rhs[i]); + fprintf(stderr, "\n"); +#endif -static int calc_geometry(MaximalSlicingContext *ms) + return 0; +} + +static int construct_matrix(MaximalSlicingContext *ms) +{ + return construct_matrix_polar(ms, ms->mat, NULL, NULL); +} +/* interpolate the cactus gridfunctions onto the pseudospectral grid */ +static int interp_geometry(MaximalSlicingContext *ms) { int ret; @@ -135,7 +352,22 @@ static int calc_geometry(MaximalSlicingContext *ms) if (ret < 0) CCTK_WARN(0, "Error interpolating"); -#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) + //CCTK_TimerStart("MaximalSlicingAxi_filter_input"); + //{ + // cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + // ms->nb_colloc_points, ARRAY_ELEMS(ms->interp_values), ms->nb_colloc_points, + // 1.0, ms->input_filter, ms->nb_colloc_points, ms->interp_buffer_prefilter, ms->nb_colloc_points, + // 0.0, ms->interp_buffer, ms->nb_colloc_points); + //} + //CCTK_TimerStop("MaximalSlicingAxi_filter_input"); + + return 0; +} + +static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max) +{ + double rhs_max = 0.0; +#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max) for (int i = 0; i < ms->nb_colloc_points; i++) { CCTK_REAL Am[3][3], gtu[3][3]; CCTK_REAL a2; @@ -191,16 +423,56 @@ static int calc_geometry(MaximalSlicingContext *ms) for (int k = 0; k < 3; k++) a2 += Am[j][k] * Am[k][j]; - ms->metric_u[0][i] = gtu[0][0]; - ms->metric_u[1][i] = gtu[1][1]; - ms->metric_u[2][i] = gtu[2][2]; - ms->metric_u[3][i] = gtu[0][1]; - ms->metric_u[4][i] = gtu[0][2]; - ms->metric_u[5][i] = gtu[1][2]; + { + double x = ms->interp_coords[0][i]; + double z = ms->interp_coords[2][i]; + + const double gtuxx = gtu[0][0]; + const double gtuyy = gtu[1][1]; + const double gtuzz = gtu[2][2]; + const double gtuxz = gtu[0][2]; + + const double phi = ms->interp_values[I_PHI][i]; + const double phi_dx = ms->interp_values[I_PHI_DX][i]; + const double phi_dz = ms->interp_values[I_PHI_DZ][i]; + + const double Xtx = ms->interp_values[I_XTX][i]; + const double Xtz = ms->interp_values[I_XTZ][i]; + + const double k2 = a2 + SQR(trK) / 3.; + + const double trk_dx = ms->interp_values[I_K_DX][i]; + const double trk_dz = ms->interp_values[I_K_DZ][i]; + + const double betax = ms->interp_values[I_BETAX][i]; + const double betaz = ms->interp_values[I_BETAZ][i]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0)); + ms->eq_coeff_02[i] = SQR(phi) * gtuzz; + ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2; + ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0); + ms->eq_coeff_01[i] = -Xz; + ms->eq_coeff_00[i] = -k2; + ms->rhs[i] = k2 + trK + betax * trk_dx + betaz * trk_dz; + + rhs_max = MAX(rhs_max, fabs(ms->rhs[i])); + } + + //ms->metric_u[0][i] = gtu[0][0]; + //ms->metric_u[1][i] = gtu[1][1]; + //ms->metric_u[2][i] = gtu[2][2]; + //ms->metric_u[3][i] = gtu[0][1]; + //ms->metric_u[4][i] = gtu[0][2]; + //ms->metric_u[5][i] = gtu[1][2]; - ms->kij_kij[i] = a2 + SQR(trK) / 3.; + //ms->kij_kij[i] = a2 + SQR(trK) / 3.; } + *prhs_max = rhs_max; + return 0; } @@ -274,8 +546,6 @@ static int solve_bicgstab(BiCGSTABContext *ctx, const int N, if (i == MAXITER) return -1; - ctx->iter_total += i + 1; - return i; } @@ -293,12 +563,9 @@ static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, cl_event events[8]; - // upload the matrix and the RHS to the GPU - // k and x are assumed to be already uploaded - clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), - rhs, 0, NULL, &events[0]); - clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), - mat, 0, NULL, &events[1]); + // the matrix, rhs, k and x are assumed to be already uploaded + clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); // initialize the residual clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, @@ -393,18 +660,77 @@ static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, if (i == MAXITER) return -1; - ctx->iter_total += i + 1; - return i; } static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) { + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; +#if 1 LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, mat, N, ipiv, rhs, N); LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + { + double *mat_copy = malloc(SQR(N) * sizeof(double)); + double *svd = malloc(N * sizeof(double)); + double *rhs_copy = malloc(N * sizeof(double)); + int rank; + + memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + memcpy(rhs_copy, rhs, N * sizeof(double)); + + LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + svd, 1e-13, &rank); + + free(mat_copy); + for (int i = 0; i < N; i++) { + if (i > 5 && i < N - 5) + continue; + + fprintf(stderr, "%g\t", svd[i]); + } + fprintf(stderr, "\n rank %d\n", rank); + free(svd); + free(rhs_copy); - return 0; + if (rank < N) + ret = 1; + } + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; } /* @@ -420,23 +746,40 @@ int msa_maximal_solve(MaximalSlicingContext *ms) { const int N = ms->nb_coeffs; double rhs_max; + int64_t start; int ret = 0; /* interpolate the metric values and construct the quantities we'll need */ - CCTK_TimerStart("MaximalSlicingAxi_calc_geometry"); - ret = calc_geometry(ms); - CCTK_TimerStop("MaximalSlicingAxi_calc_geometry"); + CCTK_TimerStart("MaximalSlicingAxi_interp_geometry"); + start = gettime(); + ret = interp_geometry(ms); + ms->interp_geometry_time += gettime() - start; + ms->interp_geometry_count++; + CCTK_TimerStop("MaximalSlicingAxi_interp_geometry"); + if (ret < 0) + return ret; + + CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs"); + start = gettime(); + ret = calc_eq_coeffs(ms, &rhs_max); + ms->calc_eq_coeffs_time += gettime() - start; + ms->calc_eq_coeffs_count++; + CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs"); if (ret < 0) return ret; /* fill the matrix */ CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); - ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max); + start = gettime(); + ret = construct_matrix(ms); + ms->construct_matrix_time += gettime() - start; + ms->construct_matrix_count++; CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); if (ret < 0) return ret; +#if 1 if (rhs_max < EPS) { memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); if (ms->cl_queue) { @@ -445,9 +788,10 @@ int msa_maximal_solve(MaximalSlicingContext *ms) } return 0; } +#endif /* solve for the coeffs */ - if (ms->steps_since_inverse < 128) { + if (ms->steps_since_inverse < 1024) { BiCGSTABContext *b = &ms->bicgstab; int64_t start = gettime(); @@ -461,15 +805,11 @@ int msa_maximal_solve(MaximalSlicingContext *ms) CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); if (ret >= 0) { - b->time_total += gettime() - start; - b->solve_total++; + ms->cg_time_total += gettime() - start; + ms->cg_solve_count++; + ms->cg_iter_count += ret + 1; ms->steps_since_inverse++; - if (!(b->solve_total & 127)) { - fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n", - b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total); - fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total); - } #if 0 { double min, max; @@ -493,9 +833,9 @@ int msa_maximal_solve(MaximalSlicingContext *ms) CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); start = gettime(); - lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); + ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); ms->lu_solves_time += gettime() - start; - ms->lu_solves_total++; + ms->lu_solves_count++; CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); tmpv = ms->coeffs; @@ -506,6 +846,11 @@ int msa_maximal_solve(MaximalSlicingContext *ms) ms->mat = ms->bicgstab.k; ms->bicgstab.k = tmpm; + if (ret == 1) { + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + ms->coeffs[0] = 1.0; + } + if (ms->cl_queue) { cl_event events[2]; clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), @@ -518,6 +863,10 @@ int msa_maximal_solve(MaximalSlicingContext *ms) ms->steps_since_inverse = 0; } + for (int i = 0; i < N; i++) + ms->coeffs[i] *= ms->coeff_scale[i]; + + //fprintf(stderr, "solve %g %g\n", ms->gh->cctk_time, ms->coeffs[0]); return ret; } -- cgit v1.2.3