From a4915453f05ebdfeccd8f954026096e8f145fd06 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sun, 28 Feb 2016 10:19:06 +0100 Subject: Initial commit. --- configuration.ccl | 2 + interface.ccl | 16 + param.ccl | 31 ++ schedule.ccl | 28 ++ src/basis.c | 375 +++++++++++++++++++++ src/make.code.defn | 7 + src/qms.c | 965 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/qms.h | 301 +++++++++++++++++ src/register.c | 9 + src/solve.c | 636 +++++++++++++++++++++++++++++++++++ src/solve.cl | 28 ++ src/solve_cl.c | 28 ++ 12 files changed, 2426 insertions(+) create mode 100644 configuration.ccl create mode 100644 interface.ccl create mode 100644 param.ccl create mode 100644 schedule.ccl create mode 100644 src/basis.c create mode 100644 src/make.code.defn create mode 100644 src/qms.c create mode 100644 src/qms.h create mode 100644 src/register.c create mode 100644 src/solve.c create mode 100644 src/solve.cl create mode 100644 src/solve_cl.c diff --git a/configuration.ccl b/configuration.ccl new file mode 100644 index 0000000..b4e7a1d --- /dev/null +++ b/configuration.ccl @@ -0,0 +1,2 @@ +# Configuration definition for thorn QuasiMaximalSlicing + diff --git a/interface.ccl b/interface.ccl new file mode 100644 index 0000000..324fc5f --- /dev/null +++ b/interface.ccl @@ -0,0 +1,16 @@ +# Interface definition for thorn QuasiMaximalSlicing +implements: QuasiMaximalSlicing + +INHERITS: ADMBase grid CoordBase MethodOfLines + +CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx) +CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx) +CCTK_INT FUNCTION MoLRegisterSaveAndRestoreGroup(CCTK_INT IN idx) + +REQUIRES FUNCTION MoLRegisterConstrained +REQUIRES FUNCTION MoLRegisterSaveAndRestore +REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup + +public: +CCTK_REAL W TYPE=GF +CCTK_REAL w_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant diff --git a/param.ccl b/param.ccl new file mode 100644 index 0000000..e80c42b --- /dev/null +++ b/param.ccl @@ -0,0 +1,31 @@ +# Parameter definitions for thorn QuasiMaximalSlicing + +RESTRICTED: +CCTK_INT basis_order_r "Number of the basis functions in the radial direction" STEERABLE=recover +{ + 1: :: "" +} 40 + +CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERABLE=recover +{ + 1: :: "" +} 40 + +CCTK_REAL scale_factor "Scaling factor L for the SB basis" STEERABLE=recover +{ + 0: :: "" +} 3.0 + +CCTK_REAL filter_power "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + +CCTK_REAL input_filter_power "" STEERABLE=recover +{ + 0: :: "" +} 64.0 + +BOOLEAN export_coeffs "Export the coefficients of the spectral expansion in alpha_coeffs" STEERABLE=recover +{ +} "no" diff --git a/schedule.ccl b/schedule.ccl new file mode 100644 index 0000000..07120fc --- /dev/null +++ b/schedule.ccl @@ -0,0 +1,28 @@ +# Schedule definitions for thorn MaximalSlicingAxi +# +SCHEDULE quasimaximal_slicing_axi_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_lapse_evol { + LANG: C +} "Quasimaximal slicing eval W" + +#SCHEDULE quasimaximal_slicing_axi_solve IN ML_BSSN_evolCalcGroup BEFORE quasimaximal_slicing_axi_eval { +SCHEDULE quasimaximal_slicing_axi_solve IN MoL_PreStep { + LANG: C +} "Quasimaximal slicing solve W" + +#SCHEDULE quasimaximal_slicing_axi IN MoL_PseudoEvolution { +# LANG: C +#} "Quasimaximal slicing" + +SCHEDULE qms_register_mol IN MoL_Register { + LANG: C +} "" + +SCHEDULE qms_init IN ADMBase_InitialData { + LANG: C +} "" + +STORAGE: W + +if (export_coeffs) { + STORAGE: w_coeffs +} diff --git a/src/basis.c b/src/basis.c new file mode 100644 index 0000000..1ca61d8 --- /dev/null +++ b/src/basis.c @@ -0,0 +1,375 @@ + +#include + +#include "qms.h" + +/* + * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9) + * SB(x, n) = sin((n + 1) arccot(|x| / L)) + * They are symmetric wrt origin and decay as 1/x in infinity. + */ +static double sb_even_eval(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + + idx *= 2; // even only + + return sin((idx + 1) * val); +} + +static double sb_even_eval_diff1(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + + idx *= 2; // even only + + return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord)); +} + +static double sb_even_eval_diff2(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + + idx *= 2; // even only + + return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabs(coord) * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord)); +} + +static double sb_even_colloc_point(int order, int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); +#if POLAR + t = (idx + 2) * M_PI / (2 * order + 3); +#else + t = (idx + 2) * M_PI / (2 * order + 2); +#endif + return SCALE_FACTOR / tan(t); +} + +const BasisSet qms_sb_even_basis = { + .eval = sb_even_eval, + .eval_diff1 = sb_even_eval_diff1, + .eval_diff2 = sb_even_eval_diff2, + .colloc_point = sb_even_colloc_point, +}; + +static double tb_even_eval(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + + idx++; + idx *= 2; // even only + + return cos(idx * val) - 1.0; +} + +static double tb_even_eval_diff1(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + + idx++; + idx *= 2; // even only + + return SCALE_FACTOR * idx * SGN(coord) * sin(idx * val) / (SQR(SCALE_FACTOR) + SQR(coord)); +} + +static double tb_even_eval_diff2(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + + idx++; + idx *= 2; // even only + + return -SCALE_FACTOR * idx * SGN(coord) * (2 * fabs(coord) * sin(idx * val) + SCALE_FACTOR * idx * cos(idx * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord)); +} + +static double tb_even_colloc_point(int order, int idx) +{ + double t; + + idx = order - idx - 1; + //order *= 2; + + //t = (idx + 2) * M_PI / (order + 4); + t = (idx + 2) * M_PI / (2 * order + 4); + return SCALE_FACTOR / tan(t); +} + +const BasisSet qms_tb_even_basis = { + .eval = tb_even_eval, + .eval_diff1 = tb_even_eval_diff1, + .eval_diff2 = tb_even_eval_diff2, + .colloc_point = tb_even_colloc_point, +}; + +static double tl_eval(double coord, int idx) +{ + double t = 2 * atan2(sqrt(SCALE_FACTOR), sqrt(fabs(coord))); + + //idx++; + + return cos(idx * t);// - 1.0; +} + +static double tl_eval_diff1(double coord, int idx) +{ + double y = fabs(coord); + double y12 = sqrt(y); + double t = 2 * atan2(sqrt(SCALE_FACTOR), y12); + + //idx++; + + if (y < 1e-10) + return -2.0 * SQR(idx) * pow(-1.0, idx) / SCALE_FACTOR; + + return idx * sqrt(SCALE_FACTOR) * sin(idx * t) / (y12 * (y + SCALE_FACTOR)); +} + +static double tl_eval_diff2(double coord, int idx) +{ + double y = fabs(coord); + double y12 = sqrt(y); + double t = 2 * atan2(sqrt(SCALE_FACTOR), y12); + + //idx++; + + if (y < 1e-10) + return 4.0 * SQR(idx) * (SQR(idx) + 2) * pow(-1.0, idx) / (3.0 * SQR(SCALE_FACTOR)); + + return -(SCALE_FACTOR * SQR(idx) * cos(idx * t) / (y * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (y12 * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (2 * y * y12 * (y + SCALE_FACTOR))); +} + +static double tl_colloc_point(int order, int idx) +{ + double t; + + if (!idx) + return 0.0; + + idx = order - idx - 1; + + order++; + idx++; + + t = (2 * idx + 1) * M_PI / (2 * order); + return SCALE_FACTOR / SQR(tan(0.5 * t)); +} + +const BasisSet qms_tl_basis = { + .eval = tl_eval, + .eval_diff1 = tl_eval_diff1, + .eval_diff2 = tl_eval_diff2, + .colloc_point = tl_colloc_point, +}; + +static double full_eval(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + int flag = idx & 1; + + idx /= 2; + + if (flag) return sin((2 * idx + 1) * 4 * val); + else return cos((2 * idx + 2) * 4 * val) - 1; +} + +static double full_eval_diff1(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + int flag = idx & 1; + + idx /= 2; + + if (flag) { + idx = 2 * idx + 1; + return -4 * idx * SCALE_FACTOR * cos(idx * 4 * val) / (SQR(SCALE_FACTOR) + SQR(coord)); + } else { + idx = 2 * (idx + 1); + return 4 * idx * SCALE_FACTOR * sin(idx * 4 * val) / (SQR(SCALE_FACTOR) + SQR(coord)); + } +} + +static double full_eval_diff2(double coord, int idx) +{ + double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord)); + int flag = idx & 1; + + idx /= 2; + + if (flag) { + idx = 2 * idx + 1; + return (16 * SQR(idx * SCALE_FACTOR) * cos(idx * 4 * val) - 4 * idx * SCALE_FACTOR * sin(idx * 4 * val) * 2 * coord) / SQR(SQR(SCALE_FACTOR) + SQR(coord)); + } else { + idx = 2 * (idx + 1); + return (-16 * SQR(idx * SCALE_FACTOR) * sin(idx * 4 * val) - 4 * idx * SCALE_FACTOR * cos(idx * 4 * val) * 2 * coord) / SQR(SQR(SCALE_FACTOR) + SQR(coord)); + } +} + +static double full_colloc_point(int order, int idx) +{ + double t; + + idx = order - idx - 1; + + t = (idx + 0.5) * M_PI / (2 * order); + + return SCALE_FACTOR / tan(t); + +} + +const BasisSet qms_full_basis = { + .eval = full_eval, + .eval_diff1 = full_eval_diff1, + .eval_diff2 = full_eval_diff2, + .colloc_point = full_colloc_point, +}; + +static double cheb_eval(double coord, int idx) +{ + double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0)); + + //idx += 1; + return cos(idx * acos(x)); +} + +static double cheb_eval_diff1(double coord, int idx) +{ + double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0)); + double t = acos(x); + + //idx += 1; + + if (fabs(x - 1.0) < 1e-8) + return (2.0 / SCALE_FACTOR) * SQR(idx); + if (fabs(x + 1.0) < 1e-8) + return -(2.0 / SCALE_FACTOR) * SQR(idx) * pow(-1.0, idx); + return (2.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t); +} + +static double cheb_eval_diff2(double coord, int idx) +{ + double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0)); + double t = acos(x); + double st = sin(t); + + //idx += 1; + + if (fabs(x - 1.0) < 1e-8) + return SQR(2.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.; + if (fabs(x + 1.0) < 1e-8) + return SQR(2.0 / SCALE_FACTOR) * pow(-1.0, idx) * SQR(idx) * (SQR(idx) - 1.0) / 3.; + + return SQR(2.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st))); +} + +static double cheb_colloc_point(int order, int idx) +{ + double y; + + idx = order - idx - 1; + + //order += 1; + //idx += 1; + y = cos(idx * M_PI / (order - 1)); + //y = cos((2 * idx + 1) * M_PI / (2 * order)); + + return 0.5 * SCALE_FACTOR * (y + 1.0); +} + +const BasisSet qms_cheb_basis = { + .eval = cheb_eval, + .eval_diff1 = cheb_eval_diff1, + .eval_diff2 = cheb_eval_diff2, + .colloc_point = cheb_colloc_point, +}; + +static double cheb_even_eval(double coord, int idx) +{ + double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0)); + + idx *= 2; + //idx += 1; + return cos(idx * acos(x)); +} + +static double cheb_even_eval_diff1(double coord, int idx) +{ + double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0)); + double t = acos(x); + + idx *= 2; + //idx += 1; + + if (fabs(fabs(x) - 1.0) < 1e-8) + return (1.0 / SCALE_FACTOR) * SQR(idx); + return (1.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t); +} + +static double cheb_even_eval_diff2(double coord, int idx) +{ + double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0)); + double t = acos(x); + double st = sin(t); + + idx *= 2; + //idx += 1; + + if (fabs(fabs(x) - 1) < 1e-8) + return SQR(1.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.; + + return SQR(1.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st))); +} + +static double cheb_even_colloc_point(int order, int idx) +{ + double y; + + idx = order - idx - 1; + + //order += 1; + //idx += 1; + order *= 2; + y = cos(idx * M_PI / (order - 1)); + y = cos((2 * idx + 2) * M_PI / (2 * order)); + + return SCALE_FACTOR * y; +} + +const BasisSet qms_cheb_even_basis = { + .eval = cheb_even_eval, + .eval_diff1 = cheb_even_eval_diff1, + .eval_diff2 = cheb_even_eval_diff2, + .colloc_point = cheb_even_colloc_point, +}; + +static double cos_even_eval(double coord, int idx) +{ + return cos(2 * idx * coord); +} + +static double cos_even_eval_diff1(double coord, int idx) +{ + return -2 * idx * sin(2 * idx * coord); +} + +static double cos_even_eval_diff2(double coord, int idx) +{ + return -4 * SQR(idx) * cos(2 * idx * coord); +} + +static double cos_even_colloc_point(int order, int idx) +{ + return M_PI * idx / (2 * order - 0); +} + +const BasisSet qms_cos_even_basis = { + .eval = cos_even_eval, + .eval_diff1 = cos_even_eval_diff1, + .eval_diff2 = cos_even_eval_diff2, + .colloc_point = cos_even_colloc_point, +}; diff --git a/src/make.code.defn b/src/make.code.defn new file mode 100644 index 0000000..f31ed36 --- /dev/null +++ b/src/make.code.defn @@ -0,0 +1,7 @@ +# Main make.code.defn file for thorn MaximalSlicingAxi + +# Source files in this directory +SRCS = basis.c qms.c register.c solve.c + +# Subdirectories containing source files +SUBDIRS = diff --git a/src/qms.c b/src/qms.c new file mode 100644 index 0000000..7332017 --- /dev/null +++ b/src/qms.c @@ -0,0 +1,965 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "cctk.h" +#include "cctk_Arguments.h" +#include "cctk_Parameters.h" +#include "cctk_Timers.h" +#include "util_Table.h" + +#include "qms.h" + +#define ACC_TEST 0 + +double scale_factor; + +/* mapping between our indices and thorn names */ +static const char *metric_vars[] = { +#if CCZ4 + [GTXX] = "ML_CCZ4::gt11", + [GTYY] = "ML_CCZ4::gt22", + [GTZZ] = "ML_CCZ4::gt33", + [GTXY] = "ML_CCZ4::gt12", + [GTXZ] = "ML_CCZ4::gt13", + [GTYZ] = "ML_CCZ4::gt23", + [ATXX] = "ML_CCZ4::At11", + [ATYY] = "ML_CCZ4::At22", + [ATZZ] = "ML_CCZ4::At33", + [ATXY] = "ML_CCZ4::At12", + [ATXZ] = "ML_CCZ4::At13", + [ATYZ] = "ML_CCZ4::At23", + [PHI] = "ML_CCZ4::phi", + [K] = "ML_CCZ4::trK", + [XTX] = "ML_CCZ4::Xt1", + [XTY] = "ML_CCZ4::Xt2", + [XTZ] = "ML_CCZ4::Xt3", + [BETAX] = "ML_CCZ4::beta1", + [BETAY] = "ML_CCZ4::beta2", + [BETAZ] = "ML_CCZ4::beta3", + [ALPHA] = "ML_CCZ4::alpha", + [KDOT_XX] = "ML_CCZ4::Kdot11", + [KDOT_YY] = "ML_CCZ4::Kdot22", + [KDOT_ZZ] = "ML_CCZ4::Kdot33", + [KDOT_XY] = "ML_CCZ4::Kdot12", + [KDOT_XZ] = "ML_CCZ4::Kdot13", + [KDOT_YZ] = "ML_CCZ4::Kdot23", + [XTDOT_X] = "ML_CCZ4::Xtdot1", + [XTDOT_Y] = "ML_CCZ4::Xtdot2", + [XTDOT_Z] = "ML_CCZ4::Xtdot3", + [PHIDOT] = "ML_CCZ4::phidot", +#else + [GTXX] = "ML_BSSN::gt11", + [GTYY] = "ML_BSSN::gt22", + [GTZZ] = "ML_BSSN::gt33", + [GTXY] = "ML_BSSN::gt12", + [GTXZ] = "ML_BSSN::gt13", + [GTYZ] = "ML_BSSN::gt23", + [ATXX] = "ML_BSSN::At11", + [ATYY] = "ML_BSSN::At22", + [ATZZ] = "ML_BSSN::At33", + [ATXY] = "ML_BSSN::At12", + [ATXZ] = "ML_BSSN::At13", + [ATYZ] = "ML_BSSN::At23", + [PHI] = "ML_BSSN::phi", + [K] = "ML_BSSN::trK", + [XTX] = "ML_BSSN::Xt1", + [XTY] = "ML_BSSN::Xt2", + [XTZ] = "ML_BSSN::Xt3", + [BETAX] = "ML_BSSN::beta1", + [BETAY] = "ML_BSSN::beta2", + [BETAZ] = "ML_BSSN::beta3", + [ALPHA] = "ML_BSSN::alpha", + //[ALPHA] = "ADMBase::alp", + [KDOT_XX] = "ML_BSSN::Kdot11", + [KDOT_YY] = "ML_BSSN::Kdot22", + [KDOT_ZZ] = "ML_BSSN::Kdot33", + [KDOT_XY] = "ML_BSSN::Kdot12", + [KDOT_XZ] = "ML_BSSN::Kdot13", + [KDOT_YZ] = "ML_BSSN::Kdot23", + [XTDOT_X] = "ML_BSSN::Xtdot1", + [XTDOT_Y] = "ML_BSSN::Xtdot2", + [XTDOT_Z] = "ML_BSSN::Xtdot3", + [PHIDOT] = "ML_BSSN::phidot", +#endif +}; + +/* mapping between the cactus grid values and interpolated values */ +static const CCTK_INT interp_operation_indices[] = { + [I_GTXX] = GTXX, + [I_GTYY] = GTYY, + [I_GTZZ] = GTZZ, + [I_GTXY] = GTXY, + [I_GTXZ] = GTXZ, + [I_GTYZ] = GTYZ, + [I_PHI] = PHI, + [I_PHI_DX] = PHI, + [I_PHI_DY] = PHI, + [I_PHI_DZ] = PHI, + [I_ATXX] = ATXX, + [I_ATYY] = ATYY, + [I_ATZZ] = ATZZ, + [I_ATXY] = ATXY, + [I_ATXZ] = ATXZ, + [I_ATYZ] = ATYZ, + [I_K] = K, + [I_K_DX] = K, + [I_K_DY] = K, + [I_K_DZ] = K, + [I_XTX] = XTX, + [I_XTY] = XTY, + [I_XTZ] = XTZ, + [I_BETAX] = BETAX, + [I_BETAY] = BETAY, + [I_BETAZ] = BETAZ, + [I_ALPHA] = ALPHA, + [I_ALPHA_DX] = ALPHA, + [I_ALPHA_DY] = ALPHA, + [I_ALPHA_DZ] = ALPHA, + [I_ALPHA_DXX] = ALPHA, + [I_ALPHA_DYY] = ALPHA, + [I_ALPHA_DZZ] = ALPHA, + [I_ALPHA_DXY] = ALPHA, + [I_ALPHA_DXZ] = ALPHA, + [I_ALPHA_DYZ] = ALPHA, + [I_KDOT_XX] = KDOT_XX, + [I_KDOT_YY] = KDOT_YY, + [I_KDOT_ZZ] = KDOT_ZZ, + [I_KDOT_XY] = KDOT_XY, + [I_KDOT_XZ] = KDOT_XZ, + [I_KDOT_YZ] = KDOT_YZ, + [I_XTDOT_X] = XTDOT_X, + [I_XTDOT_Y] = XTDOT_Y, + [I_XTDOT_Z] = XTDOT_Z, + [I_PHIDOT] = PHIDOT, + [I_PHIDOT_DX] = PHIDOT, + [I_PHIDOT_DY] = PHIDOT, + [I_PHIDOT_DZ] = PHIDOT, +}; + +/* the operation (plain value or x/y/z-derivative) to apply during interpolation */ +static const CCTK_INT interp_operation_codes[] = { + [I_GTXX] = 0, + [I_GTYY] = 0, + [I_GTZZ] = 0, + [I_GTXY] = 0, + [I_GTXZ] = 0, + [I_GTYZ] = 0, + [I_PHI] = 0, + [I_PHI_DX] = 1, + [I_PHI_DY] = 2, + [I_PHI_DZ] = 3, + [I_ATXX] = 0, + [I_ATYY] = 0, + [I_ATZZ] = 0, + [I_ATXY] = 0, + [I_ATXZ] = 0, + [I_ATYZ] = 0, + [I_K] = 0, + [I_K_DX] = 1, + [I_K_DY] = 2, + [I_K_DZ] = 3, + [I_XTX] = 0, + [I_XTY] = 0, + [I_XTZ] = 0, + [I_BETAX] = 0, + [I_BETAY] = 0, + [I_BETAZ] = 0, + [I_ALPHA] = 0, + [I_ALPHA_DX] = 1, + [I_ALPHA_DY] = 2, + [I_ALPHA_DZ] = 3, + [I_ALPHA_DXX] = 11, + [I_ALPHA_DYY] = 22, + [I_ALPHA_DZZ] = 33, + [I_ALPHA_DXY] = 12, + [I_ALPHA_DXZ] = 13, + [I_ALPHA_DYZ] = 23, + [I_KDOT_XX] = 0, + [I_KDOT_YY] = 0, + [I_KDOT_ZZ] = 0, + [I_KDOT_XY] = 0, + [I_KDOT_XZ] = 0, + [I_KDOT_YZ] = 0, + [I_XTDOT_X] = 0, + [I_XTDOT_Y] = 0, + [I_XTDOT_Z] = 0, + [I_PHIDOT] = 0, + [I_PHIDOT_DX] = 1, + [I_PHIDOT_DY] = 2, + [I_PHIDOT_DZ] = 3, +}; + +static void init_opencl(MaximalSlicingContext *ms) +{ + int err, count; + cl_platform_id platform; + cl_context_properties props[3]; + + err = clGetPlatformIDs(1, &platform, &count); + if (err != CL_SUCCESS || count < 1) { + fprintf(stderr, "Could not get an OpenCL platform ID\n"); + return; + } + + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ms->ocl_device, &count); + if (err != CL_SUCCESS || count < 1) { + fprintf(stderr, "Could not get an OpenCL device ID\n"); + return; + } + + props[0] = CL_CONTEXT_PLATFORM; + props[1] = (cl_context_properties)platform; + props[2] = 0; + + ms->cl_ctx = clCreateContext(props, 1, &ms->ocl_device, NULL, NULL, &err); + if (err != CL_SUCCESS || !ms->cl_ctx) { + fprintf(stderr, "Could not create an OpenCL context\n"); + return; + } + + ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, ms->ocl_device, 0, &err); + if (err != CL_SUCCESS || !ms->cl_queue) { + fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err); + goto fail; + } + + err = clblasSetup(); + if (err != CL_SUCCESS) { + fprintf(stderr, "Error setting up clBLAS\n"); + goto fail; + } + + ms->ocl_coeffs = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + + ms->bicgstab.cl_p = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_v = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_y = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_z = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_t = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_res = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_tmp = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err); + + ms->bicgstab.cl_k = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err); + ms->bicgstab.cl_mat = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err); + + ms->bicgstab.cl_rho = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); + ms->bicgstab.cl_alpha = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); + ms->bicgstab.cl_beta = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); + ms->bicgstab.cl_omega = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err); + ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err); + + return; +fail: + if (ms->cl_queue) + clReleaseCommandQueue(ms->cl_queue); + ms->cl_queue = 0; + + if (ms->cl_ctx) + clReleaseContext(ms->cl_ctx); + ms->cl_ctx = 0; +} + +static void construct_filter_matrix(MaximalSlicingContext *ms, double filter_power) +{ + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *m, *minv, *scale, *tmp; + int *ipiv; + int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z; + int N = ms->nb_coeffs; + + ms->input_filter = malloc(sizeof(*m) * N * N); + + m = malloc(sizeof(*m) * N * N); + minv = malloc(sizeof(*m) * N * N); + scale = malloc(sizeof(*m) * N * N); + tmp = malloc(sizeof(*m) * N * N); + ipiv = malloc(sizeof(*ipiv) * N); + +#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x]) +#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z]) + for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) { + for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) { + int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x; + + for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++) + for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) { + const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x; + + minv[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z; + scale[idx_grid + ms->nb_colloc_points * idx_coeff] = (idx_grid == idx_coeff) ? + exp(-36.0 * pow((double)idx_grid_x / ms->nb_coeffs_x, filter_power)) * + exp(-36.0 * pow((double)idx_grid_z / ms->nb_coeffs_z, filter_power)) : 0.0; + + scale[idx_grid + ms->nb_colloc_points * idx_coeff] = (idx_grid == idx_coeff) ? 1.0 : 0.0; + //if (idx_coeff_z == idx_grid_z && idx_coeff_z == 0 && idx_grid_x == idx_coeff_x) + // fprintf(stderr, "%d %g\n", idx_coeff_x, scale[idx_grid + ms->nb_colloc_points * idx_coeff]); + } + } + } + + memcpy(m, minv, sizeof(*m) * N * N); + LAPACKE_dgetrf(LAPACK_COL_MAJOR, N, N, m, N, ipiv); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, m, N, ipiv); + + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + N, N, N, 1.0, scale, N, m, N, 0.0, tmp, N); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + N, N, N, 1.0, minv, N, tmp, N, 0.0, ms->input_filter, N); + + free(m); + free(minv); + free(scale); + free(tmp); + free(ipiv); +} + +static MaximalSlicingContext *init_ms(cGH *cctkGH, + int basis_order_r, int basis_order_z, + double sf, double filter_power, double input_filter_power, + CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z, + const int grid_size[3]) +{ + MaximalSlicingContext *ms; + int ret; + + ms = calloc(1, sizeof(*ms)); + + ms->gh = cctkGH; + + ms->basis = &qms_sb_even_basis; + //ms->basis = &qms_cheb_basis; + //ms->basis = &qms_cheb_even_basis; + //ms->basis = &qms_tl_basis; +#if POLAR + ms->basis1 = &qms_cos_even_basis; +#else + ms->basis1 = &qms_sb_even_basis; +#endif + + ms->nb_coeffs_x = basis_order_r; + ms->nb_coeffs_z = basis_order_z; + + ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z; + + ms->nb_colloc_points_x = basis_order_r; + ms->nb_colloc_points_z = basis_order_z; + + ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z; + + if (ms->nb_colloc_points != ms->nb_coeffs) + CCTK_WARN(0, "Non-square collocation matrix"); + + ms->colloc_grid_order_x = ms->nb_colloc_points_x; + ms->colloc_grid_order_z = ms->nb_colloc_points_z; + + ms->mat = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); + ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs); + ms->rhs = malloc(sizeof(double) * ms->nb_colloc_points); + + ms->coeffs_eval = malloc(sizeof(double) * ms->nb_coeffs); + for (int i = 0; i < ARRAY_ELEMS(ms->solution_cache); i++) { + ms->solution_cache[i].coeffs = malloc(sizeof(double) * ms->nb_coeffs); + if (!ms->solution_cache[i].coeffs) + CCTK_WARN(0, "malloc failure"); + } + + ms->mat_f = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); + ms->ipiv = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs); + +#if 1 + scale_factor = 1.0; + + //scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] * 0.75) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1); + scale_factor = (64.0 / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1)); + //scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)]); + //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5; + fprintf(stderr, "scale factor %16.16g\n", scale_factor); + +#else + scale_factor = sf; +#endif + + /* initialize the collocation grid */ + posix_memalign((void**)&ms->colloc_grid[0], 32, ms->nb_colloc_points_x * sizeof(*ms->colloc_grid[0])); + posix_memalign((void**)&ms->colloc_grid[1], 32, ms->nb_colloc_points_z * sizeof(*ms->colloc_grid[1])); + + for (int i = 0; i < ms->nb_colloc_points_x; i++) { + ms->colloc_grid[0][i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i); + fprintf(stderr, "%d %g\n", i, ms->colloc_grid[0][i]); + } + for (int i = 0; i < ms->nb_colloc_points_z; i++) { + ms->colloc_grid[1][i] = ms->basis1->colloc_point(ms->colloc_grid_order_z, i); + fprintf(stderr, "%d %g\n", i, ms->colloc_grid[1][i] / (POLAR ? M_PI : 1.0)); + } + + /* precompute the basis values we will need */ + ms->basis_x_val = malloc(sizeof(*ms->basis_x_val) * ms->nb_colloc_points_x * ms->nb_coeffs_x); + ms->basis_x_dval = malloc(sizeof(*ms->basis_x_dval) * ms->nb_colloc_points_x * ms->nb_coeffs_x); + ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x); + for (int i = 0; i < ms->nb_colloc_points_x; i++) { + CCTK_REAL coord = ms->colloc_grid[0][i]; + for (int j = 0; j < ms->nb_coeffs_x; j++) { + ms->basis_x_val [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j); + ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j); + ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j); + } + } + + ms->basis_z_val = malloc(sizeof(*ms->basis_z_val) * ms->nb_colloc_points_z * ms->nb_coeffs_z); + ms->basis_z_dval = malloc(sizeof(*ms->basis_z_dval) * ms->nb_colloc_points_z * ms->nb_coeffs_z); + ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z); + for (int i = 0; i < ms->nb_colloc_points_z; i++) { + CCTK_REAL coord = ms->colloc_grid[1][i]; + for (int j = 0; j < ms->nb_coeffs_z; j++) { + ms->basis_z_val [i * ms->nb_coeffs_z + j] = ms->basis1->eval(coord, j); + ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis1->eval_diff1(coord, j); + ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis1->eval_diff2(coord, j); + } + } + + posix_memalign((void**)&ms->basis_val_00, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00)); + posix_memalign((void**)&ms->basis_val_11, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00)); + posix_memalign((void**)&ms->basis_val_10, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00)); + posix_memalign((void**)&ms->basis_val_01, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00)); + posix_memalign((void**)&ms->basis_val_02, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00)); + posix_memalign((void**)&ms->basis_val_20, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00)); + for (int i = 0; i < ms->nb_colloc_points_z; i++) { + const double *basis_z = ms->basis_z_val + i * ms->nb_coeffs_z; + const double *dbasis_z = ms->basis_z_dval + i * ms->nb_coeffs_z; + const double *d2basis_z = ms->basis_z_d2val + i * ms->nb_coeffs_z; + + for (int j = 0; j < ms->nb_colloc_points_x; j++) { + const double *basis_x = ms->basis_x_val + j * ms->nb_coeffs_x; + const double *dbasis_x = ms->basis_x_dval + j * ms->nb_coeffs_x; + const double *d2basis_x = ms->basis_x_d2val + j * ms->nb_coeffs_x; + const int idx_grid = i * ms->nb_colloc_points_x + j; + +#if POLAR + double r = ms->colloc_grid[0][j]; + double theta = ms->colloc_grid[1][i]; + + double x = r * cos(theta); + double z = r * sin(theta); +#else + double x = ms->colloc_grid[0][j]; + double z = ms->colloc_grid[1][i]; +#endif + + for (int k = 0; k < ms->nb_coeffs_z; k++) + for (int l = 0; l < ms->nb_coeffs_x; l++) { + const int idx_coeff = k * ms->nb_coeffs_x + l; + const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; + ms->basis_val_00[idx] = basis_x[l] * basis_z[k]; +#if POLAR + ms->basis_val_10[idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0); + ms->basis_val_01[idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0); + ms->basis_val_20[idx] = ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k] + + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k] + + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k] + - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); + ms->basis_val_02[idx] = ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k] + + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k] + - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k] + + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); + ms->basis_val_11[idx] = ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k] + - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k] + - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k] + + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); +#else + ms->basis_val_10[idx] = dbasis_x[l] * basis_z[k]; + ms->basis_val_01[idx] = basis_x[l] * dbasis_z[k]; + ms->basis_val_20[idx] = d2basis_x[l] * basis_z[k]; + ms->basis_val_02[idx] = basis_x[l] * d2basis_z[k]; + ms->basis_val_11[idx] = dbasis_x[l] * dbasis_z[k]; +#endif + } + } + } + + posix_memalign((void**)&ms->eq_coeff_00, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00)); + posix_memalign((void**)&ms->eq_coeff_11, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00)); + posix_memalign((void**)&ms->eq_coeff_10, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00)); + posix_memalign((void**)&ms->eq_coeff_01, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00)); + posix_memalign((void**)&ms->eq_coeff_02, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00)); + posix_memalign((void**)&ms->eq_coeff_20, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00)); + + ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0])); + ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1])); + ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2])); + for (int i = 0; i < ms->nb_colloc_points_z; i++) { + for (int j = 0; j < ms->nb_colloc_points_x; j++) { +#if POLAR + double phi = ms->colloc_grid[1][i]; + double r = ms->colloc_grid[0][j]; + + double x = r * cos(phi); + double z = r * sin(phi); +#else + double x = ms->colloc_grid[0][j]; + double z = ms->colloc_grid[1][i]; +#endif + + ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x; + ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0; + ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z; + } + } + + for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++) + ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i])); + + ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij)); + + ms->coeff_scale = malloc(ms->nb_coeffs * sizeof(double)); + for (int j = 0; j < ms->nb_coeffs_z; j++) + for (int i = 0; i < ms->nb_coeffs_x; i++) { + //ms->coeff_scale[j * ms->nb_coeffs_x + i] = 1.0; + ms->coeff_scale[j * ms->nb_coeffs_x + i] = exp(-36.0 * pow((double)i / ms->nb_coeffs_x, filter_power)) * + exp(-36.0 * pow((double)j / ms->nb_coeffs_z, filter_power)); + //ms->coeff_scale[j * ms->nb_coeffs_x + i] = ((i < (2.0 / 3.0) * ms->nb_coeffs_x) ? 1.0 : SQR(cos((((double)i / ms->nb_coeffs_x) - (2.0 / 3.0)) * 3.0 * M_PI / 2.0))) * + // ((j < (2.0 / 3.0) * ms->nb_coeffs_z) ? 1.0 : SQR(cos((((double)j / ms->nb_coeffs_z) - (2.0 / 3.0)) * 3.0 * M_PI / 2.0))); + } + + for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) { + ms->interp_values[i] = malloc(sizeof(*ms->interp_values[i]) * ms->nb_colloc_points); + ms->interp_values_prefilter[i] = malloc(sizeof(*ms->interp_values[i]) * ms->nb_colloc_points); + if (!ms->interp_values[i] || !ms->interp_values_prefilter[i]) + CCTK_WARN(0, "Malloc failure"); + ms->interp_value_codes[i] = CCTK_VARIABLE_REAL; + } + + for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) { + ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]); + if (ms->interp_vars_indices[i] < 0) + CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]); + } + + ms->coord_system = CCTK_CoordSystemHandle("cart3d"); + if (ms->coord_system < 0) + CCTK_WARN(0, "Error getting the coordinate system"); + + ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)"); + if (ms->interp_operator < 0) + CCTK_WARN(0, "Error getting the interpolation operator"); + + ms->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1"); + if (ms->interp_params < 0) + CCTK_WARN(0, "Error creating interpolation parameters table"); + + ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS, + interp_operation_codes, "operation_codes"); + if (ret < 0) + CCTK_WARN(0, "Error setting operation codes"); + + ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS, + interp_operation_indices, "operand_indices"); + if (ret < 0) + CCTK_WARN(0, "Error setting operand indices"); + + ms->bicgstab.p = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.v = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.y = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.z = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.t = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.res = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs); + ms->bicgstab.k = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points); + + ms->steps_since_inverse = INT_MAX; + + init_opencl(ms); + + construct_filter_matrix(ms, input_filter_power); + + CCTK_TimerCreate("MaximalSlicingAxi_Solve"); + CCTK_TimerCreate("MaximalSlicingAxi_Expand"); + CCTK_TimerCreate("MaximalSlicingAxi_interp_geometry"); + CCTK_TimerCreate("MaximalSlicingAxi_calc_eq_coeffs"); + CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix"); + CCTK_TimerCreate("MaximalSlicingAxi_filter_input"); + CCTK_TimerCreate("MaximalSlicingAxi_solve_LU"); + CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB"); + CCTK_TimerCreate("MaximalSlicingAxi_Polish"); + + return ms; +} + +static CoordPatch *get_coord_patch(MaximalSlicingContext *ms, + CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z) +{ + cGH *cctkGH = ms->gh; + + CoordPatch *cp; + int64_t grid_size; + int i; + + for (int i = 0; i < ms->nb_patches; i++) { + cp = &ms->patches[i]; + + if (cp->origin[0] == ms->gh->cctk_origin_space[0] && + cp->origin[1] == ms->gh->cctk_origin_space[1] && + cp->origin[2] == ms->gh->cctk_origin_space[2] && + cp->size[0] == ms->gh->cctk_lsh[0] && + cp->size[1] == ms->gh->cctk_lsh[1] && + cp->size[2] == ms->gh->cctk_lsh[2] && + cp->delta[0] == ms->gh->cctk_levfac[0] && + cp->delta[1] == ms->gh->cctk_levfac[1] && + cp->delta[2] == ms->gh->cctk_levfac[2]) + return cp; + } + + grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2]; + + /* create a new patch */ + ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1)); + cp = &ms->patches[ms->nb_patches]; + + memset(cp, 0, sizeof(*cp)); + + memcpy(cp->origin, ms->gh->cctk_origin_space, sizeof(cp->origin)); + memcpy(cp->size, ms->gh->cctk_lsh, sizeof(cp->size)); + memcpy(cp->delta, ms->gh->cctk_levfac, sizeof(cp->delta)); + + for (i = 0; i < cp->size[1]; i++) + if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) { + cp->y_idx = i; + break; + } + if (i == cp->size[1]) + CCTK_WARN(0, "The grid does not include y==0"); + +#if 0 + posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]); + for (int j = 0; j < ms->gh->cctk_lsh[1]; j++) + for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) { + CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, j, 0)]; + CCTK_REAL yy = y[CCTK_GFINDEX3D(ms->gh, i, j, 0)]; + CCTK_REAL r = sqrt(SQR(xx) + SQR(yy)); + + for (int k = 0; k < ms->nb_coeffs_x; k++) + //cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) * ms->nb_coeffs_x + k] = ms->basis->eval(r, k); + cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) + ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0] * k] = ms->basis->eval(r, k); + } + + posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]); + for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) { + CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)]; + for (int j = 0; j < ms->nb_coeffs_z; j++) + cp->basis_val_z [i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j); + //cp->basis_val_z [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j); + } + posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x); + posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size); + for (int i = 0; i < grid_size; i++) + cp->one[i] = 1.0; +#else + posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[0]); + for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) { + CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)]; + + for (int k = 0; k < ms->nb_coeffs_x; k++) + cp->basis_val_r[i * ms->nb_coeffs_x + k] = ms->basis->eval(fabs(xx), k); + } + + posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]); + for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) { + CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)]; + for (int j = 0; j < ms->nb_coeffs_z; j++) + cp->basis_val_z[i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j); + } + + posix_memalign((void**)&cp->transform_matrix, 32, sizeof(*cp->transform_matrix) * ms->nb_coeffs_x * cp->size[0] * cp->size[2]); + posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * ms->nb_coeffs_z * cp->size[0] * cp->size[2]); +#pragma omp parallel for + for (int j = 0; j < cp->size[2]; j++) { + CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, j)]; + + for (int i = 0; i < cp->size[0]; i++) { + const int idx_grid = j * cp->size[0] + i; + + CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)]; + +#if POLAR + double coord0 = sqrt(SQR(xx) + SQR(zz)); + double coord1 = atan2(zz, xx); +#else + double coord0 = xx; + double coord1 = zz; +#endif + + //for (int k = 0; k < ms->nb_coeffs_z; k++) + // for (int l = 0; l < ms->nb_coeffs_x; l++) { + // const int idx_coeff = k * ms->nb_coeffs_x + l; + // cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = ms->basis->eval(r, l) * ms->basis1->eval(phi, k); + // } + for (int k = 0; k < ms->nb_coeffs_x; k++) + cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = ms->basis->eval(coord0, k); + for (int k = 0; k < ms->nb_coeffs_z; k++) + cp->transform_matrix1[idx_grid * ms->nb_coeffs_z + k] = ms->basis1->eval(coord1, k); + } + } + posix_memalign((void**)&cp->transform_tmp, 32, sizeof(*cp->transform_tmp) * cp->size[0] * cp->size[2] * ms->nb_coeffs_z); +#endif + + ms->nb_patches++; + return cp; +} + +static MaximalSlicingContext *ms_context; + +void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS) +{ + MaximalSlicingContext *ms; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + double time; + + if (!ms_context) { + ms_context = init_ms(cctkGH, basis_order_r, basis_order_z, + scale_factor, filter_power, input_filter_power, x, y, z, cctk_lsh); + } + ms = ms_context; + + time = cctkGH->cctk_time / ms->gh->cctk_delta_time; + + if (ms->gh->cctk_levfac[0] != 1 || + fabs(time - ceilf(time)) > 1e-8) + return; + + fprintf(stderr, "qms solve: time %g %g\n", ms->gh->cctk_time, time); + + CCTK_TimerStart("MaximalSlicingAxi_Solve"); + qms_maximal_solve(ms); + CCTK_TimerStop("MaximalSlicingAxi_Solve"); + + if (export_coeffs) + memcpy(w_coeffs, ms->coeffs, sizeof(*w_coeffs) * ms->nb_coeffs); + + if (1) { + double *tmp; + if (ms->nb_solutions == ARRAY_ELEMS(ms->solution_cache)) { + tmp = ms->solution_cache[0].coeffs; + memmove(ms->solution_cache, ms->solution_cache + 1, sizeof(ms->solution_cache[0]) * (ARRAY_ELEMS(ms->solution_cache) - 1)); + } else { + ms->nb_solutions++; + tmp = ms->solution_cache[ms->nb_solutions - 1].coeffs; + } + ms->solution_cache[ms->nb_solutions - 1].coeffs = ms->coeffs; + ms->solution_cache[ms->nb_solutions - 1].time = ms->gh->cctk_time; + + ms->coeffs = tmp; + } +} + +void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS) +{ + MaximalSlicingContext *ms; + + CoordPatch *cp; + + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + int64_t expand_start, totaltime_start; + + double *coeffs = NULL; + int i, ret; + + totaltime_start = gettime(); + + /* on the first run, init the solver */ + if (!ms_context) { + ms_context = init_ms(cctkGH, basis_order_r, basis_order_z, + scale_factor, filter_power, input_filter_power, x, y, z, cctk_lsh); + } + ms = ms_context; + + cp = get_coord_patch(ms, x, y, z); + +#if 0 + coeffs = ms->coeffs; +#else + coeffs = ms->coeffs_eval; + + if (ms->nb_solutions < 2) { + memset(coeffs, 0, sizeof(*coeffs) * ms->nb_coeffs); + //fprintf(stderr, "qms eval: time %g zero\n", ms->gh->cctk_time); + } else { + double *coeffs0 = ms->solution_cache[ms->nb_solutions - 2].coeffs; + double *coeffs1 = ms->solution_cache[ms->nb_solutions - 1].coeffs; + double time0 = ms->solution_cache[ms->nb_solutions - 2].time; + double time1 = ms->solution_cache[ms->nb_solutions - 1].time; + double time = ms->gh->cctk_time; + + //fprintf(stderr, "qms eval: time %g interp from %g %g\n", ms->gh->cctk_time, time0, time1); + + for (int i = 0; i < ms->nb_coeffs; i++) + coeffs[i] = coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1); + + } +#endif + + + CCTK_TimerStart("MaximalSlicingAxi_Expand"); + expand_start = gettime(); +#if 0 +#pragma omp parallel for + for (int k = 0; k < cctk_lsh[2]; k++) { + for (int i = 0; i < cctk_lsh[0]; i++) { + int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k); + double xx = x[idx]; + double zz = z[idx]; + double r = sqrt(SQR(xx) + SQR(zz)); + double phi = atan2(zz, xx); + + double val = 1.0; + + for (int l = 0; l < ms->nb_coeffs_z; l++) { + double tmp = 0.0; + for (int m = 0; m < ms->nb_coeffs_x; m++) { + const int idx_coeff = l * ms->nb_coeffs_x + m; + tmp += ms->coeffs[idx_coeff] * ms->basis->eval(r, m); + } + val += tmp * ms->basis1->eval(phi, l); + } + + alp[idx] = val; + } + } +#else + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + cctk_lsh[0] * cctk_lsh[2], ms->nb_coeffs_z, ms->nb_coeffs_x, + 1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2], + coeffs, ms->nb_coeffs_x, 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]); +#pragma omp parallel for + for (int j = 0; j < cctk_lsh[2]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + const int idx_grid = j * cctk_lsh[0] + i; + const double val = cblas_ddot(ms->nb_coeffs_z, cp->transform_matrix1 + idx_grid * ms->nb_coeffs_z, 1, + cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]); + W[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val; + } +#endif + //memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp)); + //memset(alp, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp)); + //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + // ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0, + // coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z, + // 0.0, cp->transform_z, ms->nb_coeffs_x); + //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + // cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0, + // cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x, + // 1.0, alp, cctk_lsh[0] * cctk_lsh[1]); + + ms->grid_expand_time += gettime() - expand_start; + ms->grid_expand_count++; + + CCTK_TimerStop("MaximalSlicingAxi_Expand"); + + ms->solve_time += gettime() - totaltime_start; + ms->solve_count++; + + /* print stats */ + if (!(ms->solve_count & 255)) { + fprintf(stderr, + "maximal slicing solves: %lu, " + "total time %g s, avg time per call %g ms\n", + ms->solve_count, (double)ms->solve_time / 1e6, + (double)ms->solve_time / ms->solve_count / 1e3); + fprintf(stderr, + "%g%% interpolate geometry: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)ms->interp_geometry_time * 100 / ms->solve_time, + ms->interp_geometry_count, (double)ms->interp_geometry_time / 1e6, + (double)ms->interp_geometry_time / ms->interp_geometry_count / 1e3); + fprintf(stderr, + "%g%% calc equation coefficients: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)ms->calc_eq_coeffs_time * 100 / ms->solve_time, + ms->calc_eq_coeffs_count, (double)ms->calc_eq_coeffs_time / 1e6, + (double)ms->calc_eq_coeffs_time / ms->calc_eq_coeffs_count / 1e3); + fprintf(stderr, + "%g%% pseudospectral matrix construction: %lu, " + "total time %g s, avg time per call %g ms\n", + (double)ms->construct_matrix_time * 100 / ms->solve_time, + ms->construct_matrix_count, (double)ms->construct_matrix_time / 1e6, + (double)ms->construct_matrix_time / ms->construct_matrix_count / 1e3); + fprintf(stderr, + "%g%% BiCGSTAB %lu solves, " + "%lu iterations, total time %g s, " + "avg iterations per solve %g, avg time per solve %g ms, " + "avg time per iteration %g ms\n", + (double)ms->cg_time_total * 100 / ms->solve_time, + ms->cg_solve_count, ms->cg_iter_count, (double)ms->cg_time_total / 1e6, + (double)ms->cg_iter_count / ms->cg_solve_count, + (double)ms->cg_time_total / ms->cg_solve_count / 1e3, + (double)ms->cg_time_total / ms->cg_iter_count / 1e3); + fprintf(stderr, + "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n", + (double)ms->lu_solves_time * 100 / ms->solve_time, + ms->lu_solves_count, (double)ms->lu_solves_time / 1e6, + (double)ms->lu_solves_time / ms->lu_solves_count / 1e3); + fprintf(stderr, + "%g%% grid expansion: %lu, total time %g s, avg time per call %g ms\n", + (double)ms->grid_expand_time * 100 / ms->solve_time, + ms->grid_expand_count, (double)ms->grid_expand_time / 1e6, + (double)ms->grid_expand_time / ms->grid_expand_count / 1e3); + } +} + +void qms_init(CCTK_ARGUMENTS) +{ + DECLARE_CCTK_ARGUMENTS; + DECLARE_CCTK_PARAMETERS; + + double *Kdot11 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot11"); + double *Kdot22 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot22"); + double *Kdot33 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot33"); + double *Kdot12 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot12"); + double *Kdot13 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot13"); + double *Kdot23 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot23"); + + double *Xtdot1 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Xtdot1"); + double *Xtdot2 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Xtdot2"); + double *Xtdot3 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Xtdot3"); + + double *phidot = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::phidot"); + + for (int k = 0; k < cctk_lsh[2]; k++) + for (int j = 0; j < cctk_lsh[1]; j++) + for (int i = 0; i < cctk_lsh[0]; i++) { + int idx = CCTK_GFINDEX3D(cctkGH, i, j, k); + + Kdot11[idx] = 0.0; + Kdot22[idx] = 0.0; + Kdot33[idx] = 0.0; + Kdot12[idx] = 0.0; + Kdot13[idx] = 0.0; + Kdot23[idx] = 0.0; + + Xtdot1[idx] = 0.0; + Xtdot2[idx] = 0.0; + Xtdot3[idx] = 0.0; + + phidot[idx] = 0.0; + } +} diff --git a/src/qms.h b/src/qms.h new file mode 100644 index 0000000..f88c0a8 --- /dev/null +++ b/src/qms.h @@ -0,0 +1,301 @@ + +#include + +#include + +#include "cctk.h" + +#define POLAR (1) + +#define CCZ4 (0) + +#define SQR(x) ((x) * (x)) +#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define MIN(x, y) ((x) > (y) ? (y) : (x)) +#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr)) + +/* + * small number to avoid r=0 singularities + */ +#define EPS 1E-08 + +#define SCALE_FACTOR scale_factor + +/* indices (in our code, not cactus structs) of the grid functions which we'll need to + * interpolate on the pseudospectral grid */ +enum MetricVars { + GTXX = 0, + GTYY, + GTZZ, + GTXY, + GTXZ, + GTYZ, + PHI, + ATXX, + ATYY, + ATZZ, + ATXY, + ATXZ, + ATYZ, + K, + XTX, + XTY, + XTZ, + BETAX, + BETAY, + BETAZ, + ALPHA, + KDOT_XX, + KDOT_YY, + KDOT_ZZ, + KDOT_XY, + KDOT_XZ, + KDOT_YZ, + XTDOT_X, + XTDOT_Y, + XTDOT_Z, + PHIDOT, + NB_METRIC_VARS, +}; + +/* indices of the interpolated values of the above grid functions and their derivatives */ +enum InterpMetricVars { + I_GTXX = 0, + I_GTYY, + I_GTZZ, + I_GTXY, + I_GTXZ, + I_GTYZ, + I_PHI, + I_PHI_DX, + I_PHI_DY, + I_PHI_DZ, + I_ATXX, + I_ATYY, + I_ATZZ, + I_ATXY, + I_ATXZ, + I_ATYZ, + I_K, + I_K_DX, + I_K_DY, + I_K_DZ, + I_XTX, + I_XTY, + I_XTZ, + I_BETAX, + I_BETAY, + I_BETAZ, + I_ALPHA, + I_ALPHA_DX, + I_ALPHA_DY, + I_ALPHA_DZ, + I_ALPHA_DXX, + I_ALPHA_DYY, + I_ALPHA_DZZ, + I_ALPHA_DXY, + I_ALPHA_DXZ, + I_ALPHA_DYZ, + I_KDOT_XX, + I_KDOT_YY, + I_KDOT_ZZ, + I_KDOT_XY, + I_KDOT_XZ, + I_KDOT_YZ, + I_XTDOT_X, + I_XTDOT_Y, + I_XTDOT_Z, + I_PHIDOT, + I_PHIDOT_DX, + I_PHIDOT_DY, + I_PHIDOT_DZ, + NB_INTERP_VARS, +}; + +/* a set of basis functions */ +typedef struct BasisSet { + /* evaluate the idx-th basis function at the specified point*/ + double (*eval) (double coord, int idx); + /* evaluate the first derivative of the idx-th basis function at the specified point*/ + double (*eval_diff1)(double coord, int idx); + /* evaluate the second derivative of the idx-th basis function at the specified point*/ + double (*eval_diff2)(double coord, int idx); + /** + * Get the idx-th collocation point for the specified order. + * idx runs from 0 to order - 1 (inclusive) + */ + double (*colloc_point)(int order, int idx); +} BasisSet; + +extern const BasisSet qms_cheb_basis; +extern const BasisSet qms_cheb_even_basis; +extern const BasisSet qms_full_basis; +extern const BasisSet qms_tb_even_basis; +extern const BasisSet qms_sb_even_basis; +extern const BasisSet qms_tl_basis; +extern const BasisSet qms_cos_even_basis; + +extern double scale_factor; + +typedef struct MGContext MGContext; +typedef struct SORContext SORContext; + +/* precomputed values for a given refined grid */ +typedef struct CoordPatch { + CCTK_REAL origin[3]; + CCTK_INT delta[3]; + CCTK_INT size[3]; + + // basis values on the grid + double *basis_val_r; + double *basis_val_z; + + double *transform_z; + double *transform_matrix; + double *transform_matrix1; + double *transform_tmp; + double *one; + + int y_idx; + + MGContext *mg; + SORContext *sor; +} CoordPatch; + +/* state and scratch storage for the BiCGSTAB solver */ +typedef struct BiCGSTABContext { + double *p, *v, *y, *z, *t; + double *res, *res0; + double *k; + + cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; + cl_mem cl_res, cl_res0; + cl_mem cl_k, cl_mat; + cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; + cl_mem cl_tmp, cl_tmp1; + +} BiCGSTABContext; + +typedef struct MaximalSlicingContext { + cGH *gh; + const BasisSet *basis; + const BasisSet *basis1; + + struct { + double time; + double *coeffs; + } solution_cache[8]; + int nb_solutions; + + double *coeffs_eval; + + BiCGSTABContext bicgstab; + int steps_since_inverse; + + uint64_t solve_count; + uint64_t solve_time; + + uint64_t lu_solves_count; + uint64_t lu_solves_time; + + uint64_t cg_solve_count; + uint64_t cg_iter_count; + uint64_t cg_time_total; + + uint64_t interp_geometry_count; + uint64_t interp_geometry_time; + + uint64_t calc_eq_coeffs_count; + uint64_t calc_eq_coeffs_time; + + uint64_t construct_matrix_count; + uint64_t construct_matrix_time; + + uint64_t grid_expand_count; + uint64_t grid_expand_time; + + // the grid of collocation points + double *colloc_grid[2]; + + // interpolation parameters + int coord_system; + int interp_operator; + int interp_params; + + CCTK_REAL *interp_coords[3]; + + int interp_vars_indices[NB_METRIC_VARS]; + + CCTK_REAL *interp_values[NB_INTERP_VARS]; + CCTK_REAL *interp_values_prefilter[NB_INTERP_VARS]; + CCTK_INT interp_value_codes[NB_INTERP_VARS]; + + CCTK_REAL *metric_u[6]; + + CCTK_REAL *kij_kij; + CCTK_REAL *trk; + + int nb_coeffs_x; + int nb_coeffs_z; + int nb_coeffs; + + int nb_colloc_points_x; + int nb_colloc_points_z; + int nb_colloc_points; + + int colloc_grid_order_x; + int colloc_grid_order_z; + + double *mat; + double *mat_f; + double *rhs; + double *coeffs; + int *ipiv; + double *basis_x_val; + double *basis_x_dval; + double *basis_x_d2val; + + double *basis_z_val; + double *basis_z_dval; + double *basis_z_d2val; + + double *basis_val_00; + double *basis_val_20; + double *basis_val_02; + double *basis_val_11; + double *basis_val_10; + double *basis_val_01; + + double *eq_coeff_00; + double *eq_coeff_20; + double *eq_coeff_02; + double *eq_coeff_11; + double *eq_coeff_10; + double *eq_coeff_01; + + double *coeff_scale; + + double *input_filter; + + CoordPatch *patches; + int nb_patches; + + // OpenCL / CLBLAS stuff + cl_context cl_ctx; + cl_command_queue cl_queue; + cl_device_id ocl_device; + + cl_mem ocl_coeffs; +} MaximalSlicingContext; + +int qms_maximal_solve(MaximalSlicingContext *ms); + +#include +static inline int64_t gettime(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; +} + diff --git a/src/register.c b/src/register.c new file mode 100644 index 0000000..65c2d21 --- /dev/null +++ b/src/register.c @@ -0,0 +1,9 @@ + +#include "cctk.h" +#include "cctk_Arguments.h" +#include "Slicing.h" + +void qms_register_mol(CCTK_ARGUMENTS) +{ + MoLRegisterConstrained(CCTK_VarIndex("QuasiMaximalSlicing::W")); +} diff --git a/src/solve.c b/src/solve.c new file mode 100644 index 0000000..0bd56e2 --- /dev/null +++ b/src/solve.c @@ -0,0 +1,636 @@ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "qms.h" + +static int construct_matrix(MaximalSlicingContext *ms) +{ + double *mat = ms->mat; + int idx_coeff, idx_grid; + +#pragma omp parallel for + for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++) + for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) { + const int idx = idx_grid + ms->nb_colloc_points * idx_coeff; + + mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] + + ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] + + ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] + + ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] + + ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] + + ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx]; + } + + return 0; +} + +/* interpolate the cactus gridfunctions onto the pseudospectral grid */ +static int interp_geometry(MaximalSlicingContext *ms) +{ + int ret; + + ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params, + ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL, + (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices, + ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values_prefilter); + if (ret < 0) + CCTK_WARN(0, "Error interpolating"); + + CCTK_TimerStart("MaximalSlicingAxi_filter_input"); + for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) { + cblas_dgemv(CblasColMajor, CblasNoTrans, ms->nb_colloc_points, ms->nb_colloc_points, 1.0, + ms->input_filter, ms->nb_colloc_points, ms->interp_values_prefilter[i], 1, 0.0, + ms->interp_values[i], 1); + } + CCTK_TimerStop("MaximalSlicingAxi_filter_input"); + + return 0; +} + +static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max) +{ + double rhs_max = 0.0; +//#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max) + for (int i = 0; i < ms->nb_colloc_points; i++) { + const double x = ms->interp_coords[0][i]; + const double z = ms->interp_coords[2][i]; + const int zaxis = x <= EPS; + + double Am[3][3], K[3][3], Km[3][3], Ku[3][3], gtu[3][3]; + double k2, kij_dij_alpha, k_kdot, k3; + + const double gtxx = ms->interp_values[I_GTXX][i]; + const double gtyy = ms->interp_values[I_GTYY][i]; + const double gtzz = ms->interp_values[I_GTZZ][i]; + const double gtxy = ms->interp_values[I_GTXY][i]; + const double gtxz = ms->interp_values[I_GTXZ][i]; + const double gtyz = ms->interp_values[I_GTYZ][i]; + + const double gt[3][3] = {{ gtxx, gtxy, gtxz }, + { gtxy, gtyy, gtyz }, + { gtxz, gtyz, gtzz }}; + + const double Atxx = ms->interp_values[I_ATXX][i]; + const double Atyy = ms->interp_values[I_ATYY][i]; + const double Atzz = ms->interp_values[I_ATZZ][i]; + const double Atxy = ms->interp_values[I_ATXY][i]; + const double Atxz = ms->interp_values[I_ATXZ][i]; + const double Atyz = ms->interp_values[I_ATYZ][i]; + + const double phi = ms->interp_values[I_PHI][i]; + + const double phidot = ms->interp_values[I_PHIDOT][i]; + const double phidot_dx = ms->interp_values[I_PHIDOT_DX][i]; + const double phidot_dz = ms->interp_values[I_PHIDOT_DZ][i]; + + const double At[3][3] = {{ Atxx, Atxy, Atxz }, + { Atxy, Atyy, Atyz }, + { Atxz, Atyz, Atzz }}; + + const double trK = ms->interp_values[I_K][i]; + const double kdot_xx = ms->interp_values[I_KDOT_XX][i]; + const double kdot_yy = ms->interp_values[I_KDOT_YY][i]; + const double kdot_zz = ms->interp_values[I_KDOT_ZZ][i]; + const double kdot_xy = ms->interp_values[I_KDOT_XY][i]; + const double kdot_xz = ms->interp_values[I_KDOT_XZ][i]; + const double kdot_yz = ms->interp_values[I_KDOT_YZ][i]; + + const double kdot[3][3] = {{ kdot_xx, kdot_xy, kdot_xz }, + { kdot_xy, kdot_yy, kdot_yz }, + { kdot_xz, kdot_yz, kdot_zz }}; + + const double alpha = ms->interp_values[I_ALPHA][i]; + const double dx_alpha = ms->interp_values[I_ALPHA_DX][i]; + const double dz_alpha = ms->interp_values[I_ALPHA_DZ][i]; + const double dxx_alpha = ms->interp_values[I_ALPHA_DXX][i]; + const double dzz_alpha = ms->interp_values[I_ALPHA_DZZ][i]; + const double dxz_alpha = ms->interp_values[I_ALPHA_DXZ][i]; + + const double dij_alpha[3][3] = {{ dxx_alpha, 0, dxz_alpha }, + { 0, zaxis ? dxx_alpha : dx_alpha / x, 0 }, + { dxz_alpha, 0, dzz_alpha }}; + + const double Xtx = ms->interp_values[I_XTX][i]; + const double Xtz = ms->interp_values[I_XTZ][i]; + + const double Xtdot_x = ms->interp_values[I_XTDOT_X][i]; + const double Xtdot_z = ms->interp_values[I_XTDOT_Z][i]; + + const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz); + + // \tilde{γ}^{ij} + gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det; + gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det; + gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det; + gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det; + gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det; + gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det; + gtu[1][0] = gtu[0][1]; + gtu[2][0] = gtu[0][2]; + gtu[2][1] = gtu[1][2]; + + // K_{ij} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + K[j][k] = At[j][k] / SQR(phi) + gt[j][k] * trK; + + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += SQR(phi) * gtu[j][l] * K[l][k]; + Km[j][k] = val; + } + + // K^{ij} + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += SQR(phi) * gtu[j][l] * Km[k][l]; + Ku[j][k] = val; + } + + // \tilde{A}_{i}^j + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += gtu[j][l] * At[l][k]; + Am[j][k] = val; + } + + kij_dij_alpha = 0.0; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + kij_dij_alpha += Ku[j][k] * dij_alpha[j][k]; + + k_kdot = 0.0; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + k_kdot += kdot[j][k] * Ku[j][k]; + + k3 = 0.0; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) { + double val = 0.0; + for (int l = 0; l < 3; l++) + val += Km[k][l] * Ku[l][j]; + k3 += val * K[j][k]; + } + + // K_{ij} K^{ij} + k2 = 0.0; + for (int j = 0; j < 3; j++) + for (int k = 0; k < 3; k++) + k2 += Km[j][k] * Km[k][j]; + + { + const double gtuxx = gtu[0][0]; + const double gtuyy = gtu[1][1]; + const double gtuzz = gtu[2][2]; + const double gtuxz = gtu[0][2]; + + const double phi_dx = ms->interp_values[I_PHI_DX][i]; + const double phi_dz = ms->interp_values[I_PHI_DZ][i]; + + const double Xtx = ms->interp_values[I_XTX][i]; + const double Xtz = ms->interp_values[I_XTZ][i]; + + //const double k2 = a2 + SQR(trK) / 3.; + + const double trk_dx = ms->interp_values[I_K_DX][i]; + const double trk_dz = ms->interp_values[I_K_DZ][i]; + + const double betax = ms->interp_values[I_BETAX][i]; + const double betaz = ms->interp_values[I_BETAZ][i]; + + const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi); + const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi); + + const double Xdot_x = 2 * phi * phidot * Xtx + SQR(phi) * Xtdot_x + phi * (phidot_dx * gtuxx + phidot_dz * gtuxz) - + phidot * (phi_dx * gtuxx + phi_dz * gtuxz) + 2 * alpha * (phi_dx * Ku[0][0] + phi_dz * Ku[0][2]) / phi; + const double Xdot_z = 2 * phi * phidot * Xtz + SQR(phi) * Xtdot_z + phi * (phidot_dz * gtuzz + phidot_dx * gtuxz) - + phidot * (phi_dz * gtuzz + phi_dx * gtuxz) + 2 * alpha * (phi_dz * Ku[2][2] + phi_dx * Ku[0][2]) / phi; + + ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0)); + ms->eq_coeff_02[i] = SQR(phi) * gtuzz; + ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2; + ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0); + ms->eq_coeff_01[i] = -Xz; + ms->eq_coeff_00[i] = -k2; + + ms->rhs[i] = -2 * alpha * kij_dij_alpha + Xdot_x * dx_alpha + Xdot_z * dz_alpha + + 2 * (k_kdot + 2 * alpha * k3) * alpha; + + rhs_max = MAX(rhs_max, fabs(ms->rhs[i])); + } + } + + *prhs_max = rhs_max; + + return 0; +} + +// based on the wikipedia article +// and http://www.netlib.org/templates/matlab/bicgstab.m +static int solve_bicgstab(BiCGSTABContext *ctx, const int N, + double *mat, double *rhs, double *x) +{ + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega = 1.0; + double alpha = 1.0; + + double err; + int i; + + double *k = ctx->k; + double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; + double *res = ctx->res, *res0 = ctx->res0; + + // initialize the residual + memcpy(res, rhs, N * sizeof(*res)); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + mat, N, x, 1, 1.0, res, 1); + + memcpy(res0, res, N * sizeof(*res0)); + memcpy(p, res, N * sizeof(*p)); + +#define MAXITER 16 +#define TOL (1e-15) + for (i = 0; i < MAXITER; i++) { + rho = cblas_ddot(N, res, 1, res0, 1); + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega); + + cblas_daxpy(N, -omega, v, 1, p, 1); + cblas_dscal(N, beta, p, 1); + cblas_daxpy(N, 1, res, 1, p, 1); + } + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, p, 1, 0.0, y, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, y, 1, 0.0, v, 1); + + alpha = rho / cblas_ddot(N, res0, 1, v, 1); + + cblas_daxpy(N, -alpha, v, 1, res, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, res, 1, 0.0, z, 1); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, z, 1, 0.0, t, 1); + + omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); + + cblas_daxpy(N, alpha, y, 1, x, 1); + cblas_daxpy(N, omega, z, 1, x, 1); + + cblas_daxpy(N, -omega, t, 1, res, 1); + + err = cblas_dnrm2(N, res, 1) / rhs_norm; + if (err < TOL) + break; + + rho_prev = rho; + } + if (i == MAXITER) + return -1; + + return i; +} + +static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q, + const int N, double *mat, double *rhs, cl_mem ocl_x) +{ + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega[2] = { 1.0 }; + double alpha = 1.0; + + double err; + int i; + + cl_event events[8]; + + // the matrix, rhs, k and x are assumed to be already uploaded + clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); + + // initialize the residual + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, + 1, &cl_q, 2, events, &events[2]); + clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), + 1, &events[2], &events[3]); + clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), + 1, &events[2], &events[4]); + + clWaitForEvents(5, events); + // BARRIER + +#define MAXITER 16 +#define TOL (1e-15) + for (i = 0; i < MAXITER; i++) { + clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, + ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]); + clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, + 1, &events[0], NULL); + // BARRIER + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega[0]); + + clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + clblasDscal(N, beta, ctx->cl_p, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, + 1, &cl_q, 1, &events[1], &events[0]); + clWaitForEvents(1, &events[0]); + // BARRIER + } + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + + clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, + ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]); + clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, + 1, &events[0], NULL); + // BARRIER + + alpha = rho / alpha; + + clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, + 1, &cl_q, 1, &events[1], &events[0]); + + clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]); + clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, + ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]); + + clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, + 2, &events[1], NULL); + // BARRIER + + omega[0] /= omega[1]; + + clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1, + 1, &cl_q, 1, &events[0], &events[1]); + + clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + 1, &cl_q, 0, NULL, &events[0]); + clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, + 1, &cl_q, 1, &events[0], &events[2]); + clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, + 1, &events[2], NULL); + clWaitForEvents(1, &events[1]); + // BARRIER + + if (err < TOL) + break; + + rho_prev = rho; + } + if (i == MAXITER) + return -1; + + return i; +} + +static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) +{ + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; +#if 1 + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + //{ + // double *mat_copy = malloc(SQR(N) * sizeof(double)); + // double *svd = malloc(N * sizeof(double)); + // double *rhs_copy = malloc(N * sizeof(double)); + // int rank; + + // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + // memcpy(rhs_copy, rhs, N * sizeof(double)); + + // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + // svd, 1e-13, &rank); + + // free(mat_copy); + // for (int i = 0; i < N; i++) { + // if (i > 5 && i < N - 5) + // continue; + + // fprintf(stderr, "%g\t", svd[i]); + // } + // fprintf(stderr, "\n rank %d\n", rank); + // free(svd); + // free(rhs_copy); + + // if (rank < N) + // ret = 1; + //} + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; +} + +/* + * Solve the equation + * D²α - KᵢⱼKⁱʲα = -K + * for the coefficients of spectral approximation of α: + * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z) + * where i = { 0, ... , ms->nb_coeffs_x }; + * j = { 0, ... , ms->nb_coeffs_z }; + * Tᵢ(x) are defined by ms->basis. + */ +int qms_maximal_solve(MaximalSlicingContext *ms) +{ + const int N = ms->nb_coeffs; + double rhs_max; + int64_t start; + + int ret = 0; + + /* interpolate the metric values and construct the quantities we'll need */ + CCTK_TimerStart("MaximalSlicingAxi_interp_geometry"); + start = gettime(); + ret = interp_geometry(ms); + ms->interp_geometry_time += gettime() - start; + ms->interp_geometry_count++; + CCTK_TimerStop("MaximalSlicingAxi_interp_geometry"); + if (ret < 0) + return ret; + + CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs"); + start = gettime(); + ret = calc_eq_coeffs(ms, &rhs_max); + ms->calc_eq_coeffs_time += gettime() - start; + ms->calc_eq_coeffs_count++; + CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs"); + if (ret < 0) + return ret; + + /* fill the matrix */ + CCTK_TimerStart("MaximalSlicingAxi_construct_matrix"); + start = gettime(); + ret = construct_matrix(ms); + ms->construct_matrix_time += gettime() - start; + ms->construct_matrix_count++; + CCTK_TimerStop("MaximalSlicingAxi_construct_matrix"); + if (ret < 0) + return ret; + +#if 1 + if (rhs_max < EPS) { + fprintf(stderr, "zero rhs\n"); + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } +#endif + + /* solve for the coeffs */ + if (ms->steps_since_inverse < 1024) { + BiCGSTABContext *b = &ms->bicgstab; + int64_t start = gettime(); + + CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB"); + if (ms->cl_queue) { + ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs); + clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N, + ms->coeffs, 0, NULL, NULL); + } else + ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs); + CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB"); + + if (ret >= 0) { + ms->cg_time_total += gettime() - start; + ms->cg_solve_count++; + ms->cg_iter_count += ret + 1; + ms->steps_since_inverse++; + +#if 0 + { + double min, max; + gsl_vector_memcpy(b->y, ms->rhs); + cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0, + ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1); + gsl_vector_minmax(b->y, &min, &max); + if (fabs(min) > 1e-11 || fabs(max) > 1e-11) + abort(); + } +#endif + } + } else + ret = -1; + + if (ret < 0) { + double *tmpv; + double *tmpm; + int64_t start; + + CCTK_TimerStart("MaximalSlicingAxi_solve_LU"); + start = gettime(); + + ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv); + ms->lu_solves_time += gettime() - start; + ms->lu_solves_count++; + CCTK_TimerStop("MaximalSlicingAxi_solve_LU"); + + tmpv = ms->coeffs; + ms->coeffs = ms->rhs; + ms->rhs = tmpv; + + tmpm = ms->mat; + ms->mat = ms->bicgstab.k; + ms->bicgstab.k = tmpm; + + if (ret == 1) { + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + ms->coeffs[0] = 1.0; + } + + if (ms->cl_queue) { + cl_event events[2]; + clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double), + ms->bicgstab.k, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double), + ms->coeffs, 0, NULL, &events[1]); + clWaitForEvents(2, events); + } + + ms->steps_since_inverse = 0; + } + + for (int i = 0; i < N; i++) + ms->coeffs[i] *= ms->coeff_scale[i]; + + return ret; +} diff --git a/src/solve.cl b/src/solve.cl new file mode 100644 index 0000000..125133a --- /dev/null +++ b/src/solve.cl @@ -0,0 +1,28 @@ +#pragma OPENCL EXTENSION all : enable + +__kernel void construct_matrix_polar(__global double *mat, + __global double *coeff_00, + __global double *coeff_10, + __global double *coeff_01, + __global double *coeff_11, + __global double *coeff_20, + __global double *coeff_02, + __global double *basis_00, + __global double *basis_10, + __global double *basis_01, + __global double *basis_11, + __global double *basis_20, + __global double *basis_02, + int nb_coeffs, int nb_colloc_points) +{ + unsigned int idx_coeff = get_global_id(0); + unsigned int idx_grid = get_global_id(1); + unsigned int idx = idx_grid + nb_colloc_points * idx_coeff; + + mat[idx] = coeff_00[idx_grid] * basis_00[idx] + + coeff_10[idx_grid] * basis_10[idx] + + coeff_01[idx_grid] * basis_01[idx] + + coeff_11[idx_grid] * basis_11[idx] + + coeff_20[idx_grid] * basis_20[idx] + + coeff_02[idx_grid] * basis_02[idx]; +} diff --git a/src/solve_cl.c b/src/solve_cl.c new file mode 100644 index 0000000..f138631 --- /dev/null +++ b/src/solve_cl.c @@ -0,0 +1,28 @@ +"#pragma OPENCL EXTENSION all : enable\n" +"\n" +"__kernel void construct_matrix_polar(__global double *mat,\n" +" __global double *coeff_00,\n" +" __global double *coeff_10,\n" +" __global double *coeff_01,\n" +" __global double *coeff_11,\n" +" __global double *coeff_20,\n" +" __global double *coeff_02,\n" +" __global double *basis_00,\n" +" __global double *basis_10,\n" +" __global double *basis_01,\n" +" __global double *basis_11,\n" +" __global double *basis_20,\n" +" __global double *basis_02,\n" +" int nb_coeffs, int nb_colloc_points)\n" +"{\n" +" unsigned int idx_coeff = get_global_id(0);\n" +" unsigned int idx_grid = get_global_id(1);\n" +" unsigned int idx = idx_grid + nb_colloc_points * idx_coeff;\n" +"\n" +" mat[idx] = coeff_00[idx_grid] * basis_00[idx] +\n" +" coeff_10[idx_grid] * basis_10[idx] +\n" +" coeff_01[idx_grid] * basis_01[idx] +\n" +" coeff_11[idx_grid] * basis_11[idx] +\n" +" coeff_20[idx_grid] * basis_20[idx] +\n" +" coeff_02[idx_grid] * basis_02[idx];\n" +"}\n" -- cgit v1.2.3