summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2015-03-13 16:06:46 +0100
committerAnton Khirnov <anton@khirnov.net>2015-03-13 16:06:46 +0100
commit3d9b62a6b6fbdde922cb73c262e20db4ac0d84ad (patch)
treed572cf0c873e6e214bb99934ec84afa4683fd2eb
Initial commit.
-rw-r--r--configuration.ccl2
-rw-r--r--interface.ccl14
-rw-r--r--param.ccl33
-rw-r--r--schedule.ccl21
-rw-r--r--src/make.code.defn7
-rw-r--r--src/maximal_slicing_axi.c1503
6 files changed, 1580 insertions, 0 deletions
diff --git a/configuration.ccl b/configuration.ccl
new file mode 100644
index 0000000..c3ae23d
--- /dev/null
+++ b/configuration.ccl
@@ -0,0 +1,2 @@
+# Configuration definition for thorn MaximalSlicingAxi
+
diff --git a/interface.ccl b/interface.ccl
new file mode 100644
index 0000000..cec39a3
--- /dev/null
+++ b/interface.ccl
@@ -0,0 +1,14 @@
+# Interface definition for thorn MaximalSlicingAxi
+implements: MaximalSlicingAxi
+
+INHERITS: ADMBase grid CoordBase MethodOfLines
+
+CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestoreGroup(CCTK_INT IN idx)
+
+REQUIRES FUNCTION MoLRegisterConstrained
+REQUIRES FUNCTION MoLRegisterSaveAndRestore
+REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup
+
+CCTK_REAL alpha_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r
diff --git a/param.ccl b/param.ccl
new file mode 100644
index 0000000..2ca5e50
--- /dev/null
+++ b/param.ccl
@@ -0,0 +1,33 @@
+# Parameter definitions for thorn MaximalSlicingAxi
+
+SHARES: ADMBase
+
+EXTENDS KEYWORD lapse_evolution_method
+{
+ "maximal_axi" :: "Maximal slicing for an axisymmetric spacetime"
+}
+
+RESTRICTED:
+CCTK_REAL amplitude "Wave amplitude A."
+{
+ 0: :: ""
+} 1.0
+
+CCTK_INT basis_order_r "Number of the basis functions in the radial direction" STEERABLE=recover
+{
+ 1: :: ""
+} 40
+
+CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERABLE=recover
+{
+ 1: :: ""
+} 40
+
+CCTK_REAL scale_factor "Scaling factor L for the SB basis" STEERABLE=recover
+{
+ 0: :: ""
+} 3.0
+
+BOOLEAN export_coeffs "Export the coefficients of the spectral expansion in alpha_coeffs" STEERABLE=recover
+{
+} "no"
diff --git a/schedule.ccl b/schedule.ccl
new file mode 100644
index 0000000..0d9c425
--- /dev/null
+++ b/schedule.ccl
@@ -0,0 +1,21 @@
+# Schedule definitions for thorn MaximalSlicingAxi
+#
+if (CCTK_Equals(lapse_evolution_method, "maximal_axi")) {
+
+ SCHEDULE maximal_slicing_axi IN MoL_CalcRHS BEFORE ML_BSSN_evolCalcGroup {
+ LANG: C
+ } "Maximal slicing in axisymmetry"
+
+ SCHEDULE maximal_slicing_axi_register AT Startup {
+ LANG: C
+ } ""
+
+ SCHEDULE maximal_slicing_axi_register_mol IN MoL_Register {
+ LANG: C
+ } ""
+
+ if (export_coeffs) {
+ STORAGE: alpha_coeffs
+ }
+}
+
diff --git a/src/make.code.defn b/src/make.code.defn
new file mode 100644
index 0000000..27625ce
--- /dev/null
+++ b/src/make.code.defn
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn MaximalSlicingAxi
+
+# Source files in this directory
+SRCS = maximal_slicing_axi.c
+
+# Subdirectories containing source files
+SUBDIRS =
diff --git a/src/maximal_slicing_axi.c b/src/maximal_slicing_axi.c
new file mode 100644
index 0000000..7d3e6f4
--- /dev/null
+++ b/src/maximal_slicing_axi.c
@@ -0,0 +1,1503 @@
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include <cl.h>
+#include <clBLAS.h>
+
+#include "cctk.h"
+#include "cctk_Arguments.h"
+#include "cctk_Parameters.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+#include "Slicing.h"
+
+double ms_scalarproduct_metric_avx(int offset_r, int offset_z, const double *coeffs,
+ const double *basis_r, const double *basis_z);
+
+#define ACC_TEST 0
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) > (y) ? (y) : (x))
+#define SQR(x) ((x) * (x))
+#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
+#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr))
+
+/*
+ * small number to avoid r=0 singularities
+ */
+#define EPS 1E-08
+
+#include <sys/time.h>
+int64_t gettime(void)
+{
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+/* a set of basis functions */
+typedef struct BasisSet {
+ /* evaluate the idx-th basis function at the specified point*/
+ long double (*eval) (long double coord, int idx);
+ /* evaluate the first derivative of the idx-th basis function at the specified point*/
+ long double (*eval_diff1)(long double coord, int idx);
+ /* evaluate the second derivative of the idx-th basis function at the specified point*/
+ long double (*eval_diff2)(long double coord, int idx);
+ /**
+ * Get the idx-th collocation point for the specified order.
+ * idx runs from 0 to order - 1 (inclusive)
+ */
+ long double (*colloc_point)(int order, int idx);
+} BasisSet;
+
+/*
+ * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
+ * SB(x, n) = sin((n + 1) arccot(|x| / L))
+ * They are symmetric wrt origin and decay as 1/x in infinity.
+ */
+
+static CCTK_REAL scale_factor;
+
+#define SCALE_FACTOR scale_factor
+
+static long double sb_even_eval(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+
+ idx *= 2; // even only
+
+ return sinl((idx + 1) * val);
+}
+
+static long double sb_even_eval_diff1(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+
+ idx *= 2; // even only
+
+ return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cosl((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static long double sb_even_eval_diff2(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+
+ idx *= 2; // even only
+
+ return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabsl(coord) * cosl((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sinl((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static long double sb_even_colloc_point(int order, int idx)
+{
+ long double t;
+
+ idx = order - idx - 1;
+ //order *= 2;
+
+ //t = (idx + 2) * M_PI / (order + 4);
+ t = (idx + 2) * M_PI / (2 * order + 2);
+ return SCALE_FACTOR / tanl(t);
+}
+
+static const BasisSet sb_even_basis = {
+ .eval = sb_even_eval,
+ .eval_diff1 = sb_even_eval_diff1,
+ .eval_diff2 = sb_even_eval_diff2,
+ .colloc_point = sb_even_colloc_point,
+};
+
+static long double tb_even_eval(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+
+ idx++;
+ idx *= 2; // even only
+
+ return cosl(idx * val) - 1.0;
+}
+
+static long double tb_even_eval_diff1(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+
+ idx++;
+ idx *= 2; // even only
+
+ return SCALE_FACTOR * idx * SGN(coord) * sinl(idx * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static long double tb_even_eval_diff2(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+
+ idx++;
+ idx *= 2; // even only
+
+ return -SCALE_FACTOR * idx * SGN(coord) * (2 * fabsl(coord) * sinl(idx * val) + SCALE_FACTOR * idx * cosl(idx * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static long double tb_even_colloc_point(int order, int idx)
+{
+ long double t;
+
+ idx = order - idx - 1;
+ //order *= 2;
+
+ //t = (idx + 2) * M_PI / (order + 4);
+ t = (idx + 3) * M_PI / (2 * (order + 2));
+ return SCALE_FACTOR / tanl(t);
+}
+
+static const BasisSet tb_even_basis = {
+ .eval = tb_even_eval,
+ .eval_diff1 = tb_even_eval_diff1,
+ .eval_diff2 = tb_even_eval_diff2,
+ .colloc_point = tb_even_colloc_point,
+};
+
+static long double full_eval(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+ int flag = idx & 1;
+
+ idx /= 2;
+
+ if (flag) return sinl((2 * idx + 1) * 4 * val);
+ else return cosl((2 * idx + 2) * 4 * val) - 1;
+}
+
+static long double full_eval_diff1(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+ int flag = idx & 1;
+
+ idx /= 2;
+
+ if (flag) {
+ idx = 2 * idx + 1;
+ return -4 * idx * SCALE_FACTOR * cosl(idx * 4 * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+ } else {
+ idx = 2 * (idx + 1);
+ return 4 * idx * SCALE_FACTOR * sinl(idx * 4 * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+ }
+}
+
+static long double full_eval_diff2(long double coord, int idx)
+{
+ long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));
+ int flag = idx & 1;
+
+ idx /= 2;
+
+ if (flag) {
+ idx = 2 * idx + 1;
+ return (16 * SQR(idx * SCALE_FACTOR) * cosl(idx * 4 * val) - 4 * idx * SCALE_FACTOR * sinl(idx * 4 * val) * 2 * coord) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+ } else {
+ idx = 2 * (idx + 1);
+ return (-16 * SQR(idx * SCALE_FACTOR) * sinl(idx * 4 * val) - 4 * idx * SCALE_FACTOR * cosl(idx * 4 * val) * 2 * coord) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+ }
+}
+
+static long double full_colloc_point(int order, int idx)
+{
+ long double t;
+
+ idx = order - idx - 1;
+
+ t = (idx + 0.5) * M_PI / (2 * order);
+
+ return SCALE_FACTOR / tanl(t);
+
+}
+
+static const BasisSet full_basis = {
+ .eval = full_eval,
+ .eval_diff1 = full_eval_diff1,
+ .eval_diff2 = full_eval_diff2,
+ .colloc_point = full_colloc_point,
+};
+
+static long double cheb_eval(long double coord, int idx)
+{
+ return cosl(2 * idx * acosl(coord / SCALE_FACTOR));
+}
+
+static long double cheb_eval_diff1(long double coord, int idx)
+{
+ return 2 * idx * sinl(2 * idx * acosl(coord / SCALE_FACTOR)) / sqrtl(SQR(SCALE_FACTOR) - SQR(coord));
+}
+
+static long double cheb_eval_diff2(long double coord, int idx)
+{
+ long double t = acosl(coord / SCALE_FACTOR);
+ return 2 * idx * (cosl(2 * idx * t) * 2 * idx / (SQR(SCALE_FACTOR) - SQR(coord)) + sinl(2 * idx * t) * coord / pow(SQR(SCALE_FACTOR) - SQR(coord), 1.5));
+}
+
+static long double cheb_colloc_point(int order, int idx)
+{
+ return SCALE_FACTOR * cosl((idx + 0.01) * M_PI / (4 * order + 4));
+}
+
+static const BasisSet cheb_basis = {
+ .eval = cheb_eval,
+ .eval_diff1 = cheb_eval_diff1,
+ .eval_diff2 = cheb_eval_diff2,
+ .colloc_point = cheb_colloc_point,
+};
+
+/* indices (in our code, not cactus structs) of the grid functions which we'll need to
+ * interpolate on the pseudospectral grid */
+enum MetricVars {
+ GTXX = 0,
+ GTYY,
+ GTZZ,
+ GTXY,
+ GTXZ,
+ GTYZ,
+ PHI,
+ ATXX,
+ ATYY,
+ ATZZ,
+ ATXY,
+ ATXZ,
+ ATYZ,
+ K,
+ XTX,
+ XTY,
+ XTZ,
+ BETAX,
+ BETAY,
+ BETAZ,
+ NB_METRIC_VARS,
+};
+
+/* indices of the interpolated values of the above grid functions and their derivatives */
+enum InterpMetricVars {
+ I_GTXX = 0,
+ I_GTXX_DX,
+ I_GTXX_DY,
+ I_GTXX_DZ,
+ I_GTYY,
+ I_GTYY_DX,
+ I_GTYY_DY,
+ I_GTYY_DZ,
+ I_GTZZ,
+ I_GTZZ_DX,
+ I_GTZZ_DY,
+ I_GTZZ_DZ,
+ I_GTXY,
+ I_GTXY_DX,
+ I_GTXY_DY,
+ I_GTXY_DZ,
+ I_GTXZ,
+ I_GTXZ_DX,
+ I_GTXZ_DY,
+ I_GTXZ_DZ,
+ I_GTYZ,
+ I_GTYZ_DX,
+ I_GTYZ_DY,
+ I_GTYZ_DZ,
+ I_PHI,
+ I_PHI_DX,
+ I_PHI_DY,
+ I_PHI_DZ,
+ I_ATXX,
+ I_ATYY,
+ I_ATZZ,
+ I_ATXY,
+ I_ATXZ,
+ I_ATYZ,
+ I_K,
+ I_K_DX,
+ I_K_DY,
+ I_K_DZ,
+ I_XTX,
+ I_XTY,
+ I_XTZ,
+ I_BETAX,
+ I_BETAY,
+ I_BETAZ,
+ NB_INTERP_VARS,
+};
+
+/* mapping between our indices and thorn names */
+static const char *metric_vars[] = {
+ [GTXX] = "ML_BSSN::gt11",
+ [GTYY] = "ML_BSSN::gt22",
+ [GTZZ] = "ML_BSSN::gt33",
+ [GTXY] = "ML_BSSN::gt12",
+ [GTXZ] = "ML_BSSN::gt13",
+ [GTYZ] = "ML_BSSN::gt23",
+ [ATXX] = "ML_BSSN::At11",
+ [ATYY] = "ML_BSSN::At22",
+ [ATZZ] = "ML_BSSN::At33",
+ [ATXY] = "ML_BSSN::At12",
+ [ATXZ] = "ML_BSSN::At13",
+ [ATYZ] = "ML_BSSN::At23",
+ [PHI] = "ML_BSSN::phi",
+ [K] = "ML_BSSN::trK",
+ [XTX] = "ML_BSSN::Xt1",
+ [XTY] = "ML_BSSN::Xt2",
+ [XTZ] = "ML_BSSN::Xt3",
+ [BETAX] = "ML_BSSN::beta1",
+ [BETAY] = "ML_BSSN::beta2",
+ [BETAZ] = "ML_BSSN::beta3",
+};
+
+/* mapping between the cactus grid values and interpolated values */
+static const CCTK_INT interp_operation_indices[] = {
+ [I_GTXX] = GTXX,
+ [I_GTXX_DX] = GTXX,
+ [I_GTXX_DY] = GTXX,
+ [I_GTXX_DZ] = GTXX,
+ [I_GTYY] = GTYY,
+ [I_GTYY_DX] = GTYY,
+ [I_GTYY_DY] = GTYY,
+ [I_GTYY_DZ] = GTYY,
+ [I_GTZZ] = GTZZ,
+ [I_GTZZ_DX] = GTZZ,
+ [I_GTZZ_DY] = GTZZ,
+ [I_GTZZ_DZ] = GTZZ,
+ [I_GTXY] = GTXY,
+ [I_GTXY_DX] = GTXY,
+ [I_GTXY_DY] = GTXY,
+ [I_GTXY_DZ] = GTXY,
+ [I_GTXZ] = GTXZ,
+ [I_GTXZ_DX] = GTXZ,
+ [I_GTXZ_DY] = GTXZ,
+ [I_GTXZ_DZ] = GTXZ,
+ [I_GTYZ] = GTYZ,
+ [I_GTYZ_DX] = GTYZ,
+ [I_GTYZ_DY] = GTYZ,
+ [I_GTYZ_DZ] = GTYZ,
+ [I_PHI] = PHI,
+ [I_PHI_DX] = PHI,
+ [I_PHI_DY] = PHI,
+ [I_PHI_DZ] = PHI,
+ [I_ATXX] = ATXX,
+ [I_ATYY] = ATYY,
+ [I_ATZZ] = ATZZ,
+ [I_ATXY] = ATXY,
+ [I_ATXZ] = ATXZ,
+ [I_ATYZ] = ATYZ,
+ [I_K] = K,
+ [I_K_DX] = K,
+ [I_K_DY] = K,
+ [I_K_DZ] = K,
+ [I_XTX] = XTX,
+ [I_XTY] = XTY,
+ [I_XTZ] = XTZ,
+ [I_BETAX] = BETAX,
+ [I_BETAY] = BETAY,
+ [I_BETAZ] = BETAZ,
+};
+
+/* the operation (plain value or x/y/z-derivative) to apply during interpolation */
+static const CCTK_INT interp_operation_codes[] = {
+ [I_GTXX] = 0,
+ [I_GTXX_DX] = 1,
+ [I_GTXX_DY] = 2,
+ [I_GTXX_DZ] = 3,
+ [I_GTYY] = 0,
+ [I_GTYY_DX] = 1,
+ [I_GTYY_DY] = 2,
+ [I_GTYY_DZ] = 3,
+ [I_GTZZ] = 0,
+ [I_GTZZ_DX] = 1,
+ [I_GTZZ_DY] = 2,
+ [I_GTZZ_DZ] = 3,
+ [I_GTXY] = 0,
+ [I_GTXY_DX] = 1,
+ [I_GTXY_DY] = 2,
+ [I_GTXY_DZ] = 3,
+ [I_GTXZ] = 0,
+ [I_GTXZ_DX] = 1,
+ [I_GTXZ_DY] = 2,
+ [I_GTXZ_DZ] = 3,
+ [I_GTYZ] = 0,
+ [I_GTYZ_DX] = 1,
+ [I_GTYZ_DY] = 2,
+ [I_GTYZ_DZ] = 3,
+ [I_PHI] = 0,
+ [I_PHI_DX] = 1,
+ [I_PHI_DY] = 2,
+ [I_PHI_DZ] = 3,
+ [I_ATXX] = 0,
+ [I_ATYY] = 0,
+ [I_ATZZ] = 0,
+ [I_ATXY] = 0,
+ [I_ATXZ] = 0,
+ [I_ATYZ] = 0,
+ [I_K] = 0,
+ [I_K_DX] = 1,
+ [I_K_DY] = 2,
+ [I_K_DZ] = 3,
+ [I_XTX] = 0,
+ [I_XTY] = 0,
+ [I_XTZ] = 0,
+ [I_BETAX] = 0,
+ [I_BETAY] = 0,
+ [I_BETAZ] = 0,
+};
+
+/* precomputed values for a given refined grid */
+typedef struct CoordPatch {
+ CCTK_REAL origin[3];
+ CCTK_REAL delta[3];
+ CCTK_INT size[3];
+
+ // basis values on the grid
+ double *basis_val_r;
+ double *basis_val_z;
+
+ double *transform_z;
+ double *one;
+} CoordPatch;
+
+typedef struct ExpansionThreadData {
+ struct MaximalSlicingContext *ms;
+ CoordPatch *cp;
+ double *alp;
+ double *vec_tmp;
+ int start, end;
+} ExpansionThreadData;
+
+/* state and scratch storage for the BiCGSTAB solver */
+typedef struct BiCGSTABContext {
+ double *p, *v, *y, *z, *t;
+ double *res, *res0;
+ double *k;
+
+ cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+ cl_mem cl_res, cl_res0;
+ cl_mem cl_k, cl_mat;
+ cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
+ cl_mem cl_tmp, cl_tmp1;
+
+ int64_t solve_total;
+ int64_t iter_total;
+ int64_t time_total;
+} BiCGSTABContext;
+
+typedef struct MaximalSlicingContext {
+ cGH *gh;
+ const BasisSet *basis;
+
+ BiCGSTABContext bicgstab;
+ int steps_since_inverse;
+
+ int64_t lu_solves_total;
+ int64_t lu_solves_time;
+
+ // the grid of collocation points
+ CCTK_REAL *grid_x;
+ CCTK_REAL *grid_z;
+
+ // interpolation parameters
+ int coord_system;
+ int interp_operator;
+ int interp_params;
+
+ CCTK_REAL *interp_coords[3];
+
+ int interp_vars_indices[NB_METRIC_VARS];
+
+ CCTK_REAL *interp_values[NB_INTERP_VARS];
+ CCTK_INT interp_value_codes[NB_INTERP_VARS];
+
+ CCTK_REAL *metric_u[6];
+
+ CCTK_REAL *kij_kij;
+ CCTK_REAL *trk;
+
+ int nb_coeffs_x;
+ int nb_coeffs_z;
+ int nb_coeffs;
+
+ int nb_colloc_points_x;
+ int nb_colloc_points_z;
+ int nb_colloc_points;
+
+ int colloc_grid_order_x;
+ int colloc_grid_order_z;
+
+ double *mat;
+ double *mat_f;
+ double *rhs;
+ double *coeffs;
+ int *ipiv;
+ double *basis_x_val;
+ double *basis_x_dval;
+ double *basis_x_d2val;
+
+ double *basis_z_val;
+ double *basis_z_dval;
+ double *basis_z_d2val;
+
+ double *basis_val_00;
+ double *basis_val_20;
+ double *basis_val_02;
+ double *basis_val_11;
+ double *basis_val_10;
+ double *basis_val_01;
+
+ CoordPatch *patches;
+ int nb_patches;
+
+ // OpenCL / CLBLAS stuff
+ cl_context cl_ctx;
+ cl_command_queue cl_queue;
+
+ cl_mem ocl_coeffs;
+} MaximalSlicingContext;
+
+static int construct_matrix(MaximalSlicingContext *ms, double *mat,
+ double *rhs, double *prhs_max)
+{
+ int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
+ double rhs_max = 0.0;
+
+#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+#define DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+
+ //memset(mat, 0, sizeof(*mat) * ms->nb_coeffs * ms->nb_colloc_points);
+
+#pragma omp parallel for reduction(max : rhs_max)
+ for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) {
+ for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) {
+ CCTK_REAL x_physical = ms->grid_x[idx_grid_x];
+ int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+ const double gtuxx = ms->metric_u[0][idx_grid];
+ const double gtuyy = ms->metric_u[1][idx_grid];
+ const double gtuzz = ms->metric_u[2][idx_grid];
+ const double gtuxz = ms->metric_u[4][idx_grid];
+
+ const double phi = ms->interp_values[I_PHI][idx_grid];
+ const double phi_dx = ms->interp_values[I_PHI_DX][idx_grid];
+ const double phi_dz = ms->interp_values[I_PHI_DZ][idx_grid];
+
+ const double Xtx = ms->interp_values[I_XTX][idx_grid];
+ const double Xtz = ms->interp_values[I_XTZ][idx_grid];
+
+ const double k2 = ms->kij_kij[idx_grid];
+ const double trk = ms->interp_values[I_K][idx_grid];
+
+ const double trk_dx = ms->interp_values[I_K_DX][idx_grid];
+ const double trk_dz = ms->interp_values[I_K_DZ][idx_grid];
+
+ const double betax = ms->interp_values[I_BETAX][idx_grid];
+ const double betaz = ms->interp_values[I_BETAZ][idx_grid];
+
+ const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+ const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+ const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy);
+ const double coeff_02 = SQR(phi) * gtuzz;
+ const double coeff_11 = SQR(phi) * gtuxz * 2;
+ const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical;
+ const double coeff_01 = -Xz;
+ const double coeff_00 = -k2;
+
+#if 1
+ for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+ for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+ const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+ //double d2alpha = gtuxx * D2BASIS_X * BASIS_Z
+ // + gtuzz * BASIS_X * D2BASIS_Z
+ // + 2 * gtuxz * DBASIS_X * DBASIS_Z;
+ //if (x_physical > EPS)
+ // d2alpha += gtuyy * DBASIS_X * BASIS_Z / x_physical;
+ //else
+ // d2alpha += gtuyy * D2BASIS_X * BASIS_Z;
+
+ //double curv_term = Xx * DBASIS_X * BASIS_Z + Xz * BASIS_X * DBASIS_Z;
+
+
+ //double D2alpha = SQR(phi) * d2alpha - curv_term;
+
+ //mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2;
+ mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 * BASIS_X * BASIS_Z +
+ coeff_10 * DBASIS_X * BASIS_Z +
+ coeff_01 * BASIS_X * DBASIS_Z +
+ coeff_11 * DBASIS_X * DBASIS_Z +
+ coeff_20 * D2BASIS_X * BASIS_Z +
+ coeff_02 * BASIS_X * D2BASIS_Z;
+ }
+#else
+
+ const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy);
+ const double coeff_02 = SQR(phi) * gtuzz;
+ const double coeff_11 = SQR(phi) * gtuxz * 2;
+ const double coeff_10 = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi + (x_physical > EPS) * gtuyy);
+ const double coeff_01 = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+ const double coeff_00 = -k2;
+ cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
+ cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
+ cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
+ cblas_daxpy(ms->nb_coeffs, coeff_10, ms->basis_val_10 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
+ cblas_daxpy(ms->nb_coeffs, coeff_01, ms->basis_val_01 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
+ cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
+#endif
+
+ rhs[idx_grid] = k2 + trk ;// betax * trk_dx + betaz * trk_dz;
+ //rhs[idx_grid] = k2;
+ rhs_max = MAX(rhs_max, fabs(rhs[idx_grid]));
+ //rhs_max = fabs(rhs[idx_grid]);
+ }
+ }
+
+ //memcpy(rhs, ms->kij_kij, sizeof(*rhs) * ms->nb_colloc_points);
+ //cblas_daxpy(ms->nb_colloc_points, 1.0, ms->interp_values[I_K], 1, rhs, 1);
+ //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAX], 1, ms->interp_values[I_K_DX], 1, 1.0, rhs, 1);
+ //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAZ], 1, ms->interp_values[I_K_DZ], 1, 1.0, rhs, 1);
+
+ //*prhs_max = rhs[cblas_idamax(ms->nb_colloc_points, rhs, 1)];
+ *prhs_max = rhs_max;
+
+ return 0;
+}
+
+// based on the wikipedia article
+// and http://www.netlib.org/templates/matlab/bicgstab.m
+static int solve_bicgstab(BiCGSTABContext *ctx, const int N,
+ double *mat, double *rhs, double *x)
+{
+ const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+ double rho, rho_prev = 1.0;
+ double omega = 1.0;
+ double alpha = 1.0;
+
+ double err;
+ int i;
+
+ double *k = ctx->k;
+ double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+ double *res = ctx->res, *res0 = ctx->res0;
+
+ // initialize the residual
+ memcpy(res, rhs, N * sizeof(*res));
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+ mat, N, x, 1, 1.0, res, 1);
+
+ memcpy(res0, res, N * sizeof(*res0));
+ memcpy(p, res, N * sizeof(*p));
+
+#define MAXITER 16
+#define TOL (1e-15)
+ for (i = 0; i < MAXITER; i++) {
+ rho = cblas_ddot(N, res, 1, res0, 1);
+
+ if (i) {
+ double beta = (rho / rho_prev) * (alpha / omega);
+
+ cblas_daxpy(N, -omega, v, 1, p, 1);
+ cblas_dscal(N, beta, p, 1);
+ cblas_daxpy(N, 1, res, 1, p, 1);
+ }
+
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ k, N, p, 1, 0.0, y, 1);
+
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ mat, N, y, 1, 0.0, v, 1);
+
+ alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+ cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ k, N, res, 1, 0.0, z, 1);
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ mat, N, z, 1, 0.0, t, 1);
+
+ omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+ cblas_daxpy(N, alpha, y, 1, x, 1);
+ cblas_daxpy(N, omega, z, 1, x, 1);
+
+ cblas_daxpy(N, -omega, t, 1, res, 1);
+
+ err = cblas_dnrm2(N, res, 1) / rhs_norm;
+ if (err < TOL)
+ break;
+
+ rho_prev = rho;
+ }
+ if (i == MAXITER)
+ return -1;
+
+ ctx->iter_total += i + 1;
+
+ return i;
+}
+
+static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
+ const int N, double *mat, double *rhs, cl_mem ocl_x)
+{
+ const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+ double rho, rho_prev = 1.0;
+ double omega[2] = { 1.0 };
+ double alpha = 1.0;
+
+ double err;
+ int i;
+
+ cl_event events[8];
+
+ // upload the matrix and the RHS to the GPU
+ // k and x are assumed to be already uploaded
+ clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double),
+ rhs, 0, NULL, &events[0]);
+ clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double),
+ mat, 0, NULL, &events[1]);
+
+ // initialize the residual
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+ ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+ 1, &cl_q, 2, events, &events[2]);
+ clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+ 1, &events[2], &events[3]);
+ clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+ 1, &events[2], &events[4]);
+
+ clWaitForEvents(5, events);
+ // BARRIER
+
+#define MAXITER 16
+#define TOL (1e-15)
+ for (i = 0; i < MAXITER; i++) {
+ clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+ ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]);
+ clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+ 1, &events[0], NULL);
+ // BARRIER
+
+ if (i) {
+ double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+ clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+ 1, &cl_q, 0, NULL, &events[0]);
+ clblasDscal(N, beta, ctx->cl_p, 0, 1,
+ 1, &cl_q, 1, &events[0], &events[1]);
+ clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+ 1, &cl_q, 1, &events[1], &events[0]);
+ clWaitForEvents(1, &events[0]);
+ // BARRIER
+ }
+
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+ 1, &cl_q, 0, NULL, &events[0]);
+
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+ 1, &cl_q, 1, &events[0], &events[1]);
+
+ clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+ ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]);
+ clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+ 1, &events[0], NULL);
+ // BARRIER
+
+ alpha = rho / alpha;
+
+ clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+ 1, &cl_q, 0, NULL, &events[0]);
+
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+ 1, &cl_q, 1, &events[0], &events[1]);
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+ 1, &cl_q, 1, &events[1], &events[0]);
+
+ clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+ ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]);
+ clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+ ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]);
+
+ clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+ 2, &events[1], NULL);
+ // BARRIER
+
+ omega[0] /= omega[1];
+
+ clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1,
+ 1, &cl_q, 0, NULL, &events[0]);
+ clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1,
+ 1, &cl_q, 1, &events[0], &events[1]);
+
+ clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+ 1, &cl_q, 0, NULL, &events[0]);
+ clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+ 1, &cl_q, 1, &events[0], &events[2]);
+ clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+ 1, &events[2], NULL);
+ clWaitForEvents(1, &events[1]);
+ // BARRIER
+
+ if (err < TOL)
+ break;
+
+ rho_prev = rho;
+ }
+ if (i == MAXITER)
+ return -1;
+
+ ctx->iter_total += i + 1;
+
+ return i;
+}
+
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+ LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+ mat, N, ipiv, rhs, N);
+ LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+
+ return 0;
+}
+
+static int calc_geometry(MaximalSlicingContext *ms)
+{
+ int ret;
+
+ ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params,
+ ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL,
+ (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices,
+ ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values);
+ if (ret < 0)
+ CCTK_WARN(0, "Error interpolating");
+
+#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x)
+ for (int i = 0; i < ms->nb_colloc_points; i++) {
+ CCTK_REAL Am[3][3], gtu[3][3];
+ CCTK_REAL a2;
+
+ CCTK_REAL gtxx = ms->interp_values[I_GTXX][i];
+ CCTK_REAL gtyy = ms->interp_values[I_GTYY][i];
+ CCTK_REAL gtzz = ms->interp_values[I_GTZZ][i];
+ CCTK_REAL gtxy = ms->interp_values[I_GTXY][i];
+ CCTK_REAL gtxz = ms->interp_values[I_GTXZ][i];
+ CCTK_REAL gtyz = ms->interp_values[I_GTYZ][i];
+
+ CCTK_REAL Atxx = ms->interp_values[I_ATXX][i];
+ CCTK_REAL Atyy = ms->interp_values[I_ATYY][i];
+ CCTK_REAL Atzz = ms->interp_values[I_ATZZ][i];
+ CCTK_REAL Atxy = ms->interp_values[I_ATXY][i];
+ CCTK_REAL Atxz = ms->interp_values[I_ATXZ][i];
+ CCTK_REAL Atyz = ms->interp_values[I_ATYZ][i];
+
+ CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz },
+ { Atxy, Atyy, Atyz },
+ { Atxz, Atyz, Atzz }};
+
+ CCTK_REAL trK = ms->interp_values[I_K][i];
+
+ CCTK_REAL Xtx = ms->interp_values[I_XTX][i];
+ CCTK_REAL Xtz = ms->interp_values[I_XTZ][i];
+
+ CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+ // \tilde{γ}^{ij}
+ gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det;
+ gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det;
+ gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det;
+ gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+ gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det;
+ gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+ gtu[1][0] = gtu[0][1];
+ gtu[2][0] = gtu[0][2];
+ gtu[2][1] = gtu[1][2];
+
+ // \tilde{A}_{i}^j
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++) {
+ double val = 0.0;
+ for (int l = 0; l < 3; l++)
+ val += gtu[j][l] * At[l][k];
+ Am[j][k] = val;
+ }
+
+ // K_{ij} K^{ij}
+ a2 = 0.0;
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 3; k++)
+ a2 += Am[j][k] * Am[k][j];
+
+ ms->metric_u[0][i] = gtu[0][0];
+ ms->metric_u[1][i] = gtu[1][1];
+ ms->metric_u[2][i] = gtu[2][2];
+ ms->metric_u[3][i] = gtu[0][1];
+ ms->metric_u[4][i] = gtu[0][2];
+ ms->metric_u[5][i] = gtu[1][2];
+
+ ms->kij_kij[i] = a2 + SQR(trK) / 3.;
+ }
+
+ return 0;
+}
+
+/*
+ * Solve the equation
+ * D²α - KᵢⱼKⁱʲα = -K
+ * for the coefficients of spectral approximation of α:
+ * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z)
+ * where i = { 0, ... , ms->nb_coeffs_x };
+ * j = { 0, ... , ms->nb_coeffs_z };
+ * Tᵢ(x) are defined by ms->basis.
+ */
+static int maximal_solve(MaximalSlicingContext *ms)
+{
+ const int N = ms->nb_coeffs;
+ double rhs_max;
+
+ int ret = 0;
+
+ /* interpolate the metric values and construct the quantities we'll need */
+ CCTK_TimerStart("MaximalSlicingAxi_calc_geometry");
+ ret = calc_geometry(ms);
+ CCTK_TimerStop("MaximalSlicingAxi_calc_geometry");
+ if (ret < 0)
+ return ret;
+
+ /* fill the matrix */
+ CCTK_TimerStart("MaximalSlicingAxi_construct_matrix");
+ ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max);
+ CCTK_TimerStop("MaximalSlicingAxi_construct_matrix");
+ if (ret < 0)
+ return ret;
+
+ if (rhs_max < EPS) {
+ memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+ if (ms->cl_queue) {
+ clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
+ ms->coeffs, 0, NULL, NULL);
+ }
+ return 0;
+ }
+
+ /* solve for the coeffs */
+ if (ms->steps_since_inverse < 128) {
+ BiCGSTABContext *b = &ms->bicgstab;
+ int64_t start = gettime();
+
+ CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB");
+ if (ms->cl_queue) {
+ ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs);
+ clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N,
+ ms->coeffs, 0, NULL, NULL);
+ } else
+ ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs);
+ CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB");
+
+ if (ret >= 0) {
+ b->time_total += gettime() - start;
+ b->solve_total++;
+ ms->steps_since_inverse++;
+
+ if (!(b->solve_total & 127)) {
+ fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n",
+ b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total);
+ fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total);
+ }
+#if 0
+ {
+ double min, max;
+ gsl_vector_memcpy(b->y, ms->rhs);
+ cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0,
+ ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1);
+ gsl_vector_minmax(b->y, &min, &max);
+ if (fabs(min) > 1e-11 || fabs(max) > 1e-11)
+ abort();
+ }
+#endif
+ }
+ } else
+ ret = -1;
+
+ if (ret < 0) {
+ double *tmpv;
+ double *tmpm;
+ int64_t start;
+
+ CCTK_TimerStart("MaximalSlicingAxi_solve_LU");
+ start = gettime();
+
+ lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
+ ms->lu_solves_time += gettime() - start;
+ ms->lu_solves_total++;
+ CCTK_TimerStop("MaximalSlicingAxi_solve_LU");
+
+ tmpv = ms->coeffs;
+ ms->coeffs = ms->rhs;
+ ms->rhs = tmpv;
+
+ tmpm = ms->mat;
+ ms->mat = ms->bicgstab.k;
+ ms->bicgstab.k = tmpm;
+
+ if (ms->cl_queue) {
+ cl_event events[2];
+ clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double),
+ ms->bicgstab.k, 0, NULL, &events[0]);
+ clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double),
+ ms->coeffs, 0, NULL, &events[1]);
+ clWaitForEvents(2, events);
+ }
+
+ ms->steps_since_inverse = 0;
+ }
+
+
+ return ret;
+}
+
+static void init_opencl(MaximalSlicingContext *ms)
+{
+ int err, count;
+ cl_platform_id platform;
+ cl_device_id device;
+ cl_context_properties props[3];
+
+ err = clGetPlatformIDs(1, &platform, &count);
+ if (err != CL_SUCCESS || count < 1) {
+ fprintf(stderr, "Could not get an OpenCL platform ID\n");
+ return;
+ }
+
+ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &count);
+ if (err != CL_SUCCESS || count < 1) {
+ fprintf(stderr, "Could not get an OpenCL device ID\n");
+ return;
+ }
+
+ props[0] = CL_CONTEXT_PLATFORM;
+ props[1] = (cl_context_properties)platform;
+ props[2] = 0;
+
+ ms->cl_ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+ if (err != CL_SUCCESS || !ms->cl_ctx) {
+ fprintf(stderr, "Could not create an OpenCL context\n");
+ return;
+ }
+
+ ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, device, 0, &err);
+ if (err != CL_SUCCESS || !ms->cl_queue) {
+ fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
+ goto fail;
+ }
+
+ err = clblasSetup();
+ if (err != CL_SUCCESS) {
+ fprintf(stderr, "Error setting up clBLAS\n");
+ goto fail;
+ }
+
+ ms->ocl_coeffs = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+
+ ms->bicgstab.cl_p = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_v = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_y = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_z = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_t = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_res = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_tmp = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err);
+
+ ms->bicgstab.cl_k = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_mat = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
+
+ ms->bicgstab.cl_rho = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+ ms->bicgstab.cl_alpha = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+ ms->bicgstab.cl_beta = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+ ms->bicgstab.cl_omega = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err);
+ ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+
+ return;
+fail:
+ if (ms->cl_queue)
+ clReleaseCommandQueue(ms->cl_queue);
+ ms->cl_queue = 0;
+
+ if (ms->cl_ctx)
+ clReleaseContext(ms->cl_ctx);
+ ms->cl_ctx = 0;
+}
+
+static MaximalSlicingContext *init_ms(cGH *cctkGH,
+ int basis_order_r, int basis_order_z,
+ double sf,
+ CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
+ const int grid_size[3])
+{
+ MaximalSlicingContext *ms;
+ int ret;
+
+ //const double h = (x[1] - x[0]) / 8;
+ //fprintf(stderr, "h %g\n", h);
+
+ //if (basis_order_r != basis_order_z)
+ // CCTK_WARN(0, "Different r and z basis orders are not supported.");
+
+ ms = calloc(1, sizeof(*ms));
+
+ ms->gh = cctkGH;
+
+ ms->basis = &sb_even_basis;
+ //ms->basis = &full_basis;
+ //ms->basis = &cheb_basis;
+
+ ms->nb_coeffs_x = basis_order_r;
+ ms->nb_coeffs_z = basis_order_z;
+
+ ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z;
+
+ ms->nb_colloc_points_x = basis_order_r;
+ ms->nb_colloc_points_z = basis_order_z;
+
+ ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z;
+
+ if (ms->nb_colloc_points != ms->nb_coeffs)
+ CCTK_WARN(0, "Non-square collocation matrix");
+
+ ms->colloc_grid_order_x = ms->nb_colloc_points_x;
+ ms->colloc_grid_order_z = ms->nb_colloc_points_z;
+
+ ms->mat = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
+ ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->rhs = malloc(sizeof(double) * ms->nb_colloc_points);
+
+ ms->mat_f = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
+ ms->ipiv = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs);
+
+#if 1
+ scale_factor = 1.0;
+
+ scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 3) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1);
+ //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5;
+ fprintf(stderr, "scale factor %g\n", scale_factor);
+
+#else
+ scale_factor = sf;
+#endif
+
+ ms->grid_x = malloc(ms->nb_colloc_points_x * sizeof(*ms->grid_x));
+
+ for (int i = 0; i < ms->nb_colloc_points_x; i++) {
+#if 0
+ double target_val = ms->basis->colloc_point(ms->colloc_grid_order_x, i);
+ double best_diff = DBL_MAX, best_val = DBL_MAX;
+
+ for (int j = 0; j < grid_size[0]; j++) {
+ int idx = CCTK_GFINDEX3D(cctkGH, j, 0, 0);
+ double val = x[idx];
+ double diff = fabs(target_val - val);
+
+ if (val > 0.0 && diff < best_diff) {
+ int k;
+ for (k = 0; k < i; k++)
+ if (ms->grid_x[k] == val)
+ break;
+ if (k == i) {
+ best_diff = diff;
+ best_val = val;
+ }
+ }
+ }
+ if (best_val == DBL_MAX)
+ abort();
+ fprintf(stderr, "%d %g -> %g (%g)\n", i, target_val, best_val, target_val - best_val);
+ ms->grid_x[i] = best_val;
+#elif 0
+ double max = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)];
+ ms->grid_x[i] = max * SQR(SQR((double)i / ms->nb_colloc_points_x)) + 0.001;
+#else
+ ms->grid_x[i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i);
+#endif
+ fprintf(stderr, "%d %g\n", i, ms->grid_x[i]);
+ }
+
+ ms->grid_z = malloc(ms->nb_colloc_points_z * sizeof(*ms->grid_z));
+
+ for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+ ms->grid_z[i] = ms->basis->colloc_point(ms->colloc_grid_order_z, i);
+ }
+
+ /* precompute the basis values we will need */
+ ms->basis_x_val = malloc(sizeof(*ms->basis_x_val) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
+ ms->basis_x_dval = malloc(sizeof(*ms->basis_x_dval) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
+ ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
+ for (int i = 0; i < ms->nb_colloc_points_x; i++) {
+ CCTK_REAL coord = ms->grid_x[i];
+ for (int j = 0; j < ms->nb_coeffs_x; j++) {
+ ms->basis_x_val [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j);
+ ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j);
+ ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j);
+ }
+ }
+
+ ms->basis_z_val = malloc(sizeof(*ms->basis_z_val) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
+ ms->basis_z_dval = malloc(sizeof(*ms->basis_z_dval) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
+ ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
+ for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+ CCTK_REAL coord = ms->grid_z[i];
+ for (int j = 0; j < ms->nb_coeffs_z; j++) {
+ ms->basis_z_val [i * ms->nb_coeffs_z + j] = ms->basis->eval(coord, j);
+ ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis->eval_diff1(coord, j);
+ ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis->eval_diff2(coord, j);
+ }
+ }
+
+ ms->basis_val_00 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
+ ms->basis_val_11 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
+ ms->basis_val_10 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
+ ms->basis_val_01 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
+ ms->basis_val_02 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
+ ms->basis_val_20 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
+ for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+ const double *basis_val_z = ms->basis_z_val + i * ms->nb_coeffs_z;
+ const double *dbasis_val_z = ms->basis_z_dval + i * ms->nb_coeffs_z;
+ const double *d2basis_val_z = ms->basis_z_d2val + i * ms->nb_coeffs_z;
+
+ for (int j = 0; j < ms->nb_colloc_points_x; j++) {
+ const double *basis_val_x = ms->basis_x_val + j * ms->nb_coeffs_x;
+ const double *dbasis_val_x = ms->basis_x_dval + j * ms->nb_coeffs_x;
+ const double *d2basis_val_x = ms->basis_x_d2val + j * ms->nb_coeffs_x;
+ const int idx_grid = i * ms->nb_colloc_points_x + j;
+
+ for (int k = 0; k < ms->nb_coeffs_z; k++)
+ for (int l = 0; l < ms->nb_coeffs_x; l++) {
+ const int idx_coeff = k * ms->nb_coeffs_x + l;
+ const int idx = idx_grid + ms->nb_colloc_points * idx_coeff;
+ ms->basis_val_00[idx] = basis_val_x[l] * basis_val_z[k];
+ ms->basis_val_11[idx] = dbasis_val_x[l] * dbasis_val_z[k];
+ ms->basis_val_10[idx] = dbasis_val_x[l] * basis_val_z[k];
+ ms->basis_val_01[idx] = basis_val_x[l] * dbasis_val_z[k];
+ ms->basis_val_02[idx] = basis_val_x[l] * d2basis_val_z[k];
+ ms->basis_val_20[idx] = d2basis_val_x[l] * basis_val_z[k];
+ }
+ }
+ }
+
+ ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0]));
+ ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1]));
+ ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2]));
+ for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+ CCTK_REAL z = ms->grid_z[i];
+ for (int j = 0; j < ms->nb_colloc_points_x; j++) {
+ CCTK_REAL x = ms->grid_x[j];
+
+ ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x;
+ ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0;
+ ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z;
+ }
+ }
+
+ for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++)
+ ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i]));
+
+ ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij));
+
+ for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) {
+ ms->interp_values[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i]));
+ ms->interp_value_codes[i] = CCTK_VARIABLE_REAL;
+ }
+
+ for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++)
+ ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
+
+ ms->coord_system = CCTK_CoordSystemHandle("cart3d");
+ if (ms->coord_system < 0)
+ CCTK_WARN(0, "Error getting the coordinate system");
+
+ ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation");
+ if (ms->interp_operator < 0)
+ CCTK_WARN(0, "Error getting the interpolation operator");
+
+ ms->interp_params = Util_TableCreateFromString("order=2 want_global_mode=1");
+ if (ms->interp_params < 0)
+ CCTK_WARN(0, "Error creating interpolation parameters table");
+
+ ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
+ interp_operation_codes, "operation_codes");
+ if (ret < 0)
+ CCTK_WARN(0, "Error setting operation codes");
+
+ ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
+ interp_operation_indices, "operand_indices");
+ if (ret < 0)
+ CCTK_WARN(0, "Error setting operand indices");
+
+ ms->bicgstab.p = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.v = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.y = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.z = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.t = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.res = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs);
+ ms->bicgstab.k = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
+
+ ms->steps_since_inverse = INT_MAX;
+
+ init_opencl(ms);
+
+ CCTK_TimerCreate("MaximalSlicingAxi_Solve");
+ CCTK_TimerCreate("MaximalSlicingAxi_Expand");
+ CCTK_TimerCreate("MaximalSlicingAxi_calc_geometry");
+ CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix");
+ CCTK_TimerCreate("MaximalSlicingAxi_solve_LU");
+ CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB");
+
+ return ms;
+}
+
+static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
+ CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
+ CCTK_REAL *alp)
+{
+ cGH *cctkGH = ms->gh;
+
+ CoordPatch *cp;
+ int64_t grid_size;
+
+ for (int i = 0; i < ms->nb_patches; i++) {
+ cp = &ms->patches[i];
+
+ if (cp->origin[0] == ms->gh->cctk_origin_space[0] &&
+ cp->origin[1] == ms->gh->cctk_origin_space[1] &&
+ cp->origin[2] == ms->gh->cctk_origin_space[2] &&
+ cp->size[0] == ms->gh->cctk_lsh[0] &&
+ cp->size[1] == ms->gh->cctk_lsh[1] &&
+ cp->size[2] == ms->gh->cctk_lsh[2] &&
+ cp->delta[0] == ms->gh->cctk_delta_space[0] &&
+ cp->delta[1] == ms->gh->cctk_delta_space[1] &&
+ cp->delta[2] == ms->gh->cctk_delta_space[2])
+ return cp;
+ }
+
+ grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2];
+
+ /* create a new patch */
+ ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1));
+ cp = &ms->patches[ms->nb_patches];
+
+ memcpy(cp->origin, ms->gh->cctk_origin_space, sizeof(cp->origin));
+ memcpy(cp->size, ms->gh->cctk_lsh, sizeof(cp->size));
+ memcpy(cp->delta, ms->gh->cctk_delta_space, sizeof(cp->delta));
+
+ posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]);
+ for (int j = 0; j < ms->gh->cctk_lsh[1]; j++)
+ for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
+ CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
+ CCTK_REAL yy = y[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
+ CCTK_REAL r = sqrt(SQR(xx) + SQR(yy));
+
+ for (int k = 0; k < ms->nb_coeffs_x; k++)
+ //cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) * ms->nb_coeffs_x + k] = ms->basis->eval(r, k);
+ cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) + ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0] * k] = ms->basis->eval(r, k);
+ }
+
+ posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]);
+
+ for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
+ CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
+ for (int j = 0; j < ms->nb_coeffs_z; j++)
+ cp->basis_val_z [i * ms->nb_coeffs_z + j] = ms->basis->eval(zz, j);
+ //cp->basis_val_z [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j);
+ }
+
+ posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x);
+ posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size);
+ for (int i = 0; i < grid_size; i++)
+ cp->one[i] = 1.0;
+
+ ms->nb_patches++;
+ return cp;
+}
+
+void maximal_slicing_axi(CCTK_ARGUMENTS)
+{
+ static MaximalSlicingContext *ms;
+
+ CoordPatch *cp;
+
+ DECLARE_CCTK_ARGUMENTS;
+ DECLARE_CCTK_PARAMETERS;
+
+ static double total;
+ static int64_t count;
+ int64_t timer_start;
+
+ const int64_t grid_size = cctk_lsh[2] * cctk_lsh[1] * cctk_lsh[0];
+
+ /* on the first run, init the solver */
+ if (!ms) {
+ ms = init_ms(cctkGH, basis_order_r, basis_order_z,
+ scale_factor, x, y, z, cctk_lsh);
+ }
+
+ cp = get_coord_patch(ms, x, y, z, alp);
+
+ CCTK_TimerStart("MaximalSlicingAxi_Solve");
+ maximal_solve(ms);
+ CCTK_TimerStop("MaximalSlicingAxi_Solve");
+
+ if (export_coeffs)
+ memcpy(alpha_coeffs, ms->coeffs, sizeof(*alpha_coeffs) * ms->nb_coeffs);
+
+ CCTK_TimerStart("MaximalSlicingAxi_Expand");
+ timer_start = gettime();
+
+ memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+ ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0,
+ ms->coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z,
+ 0.0, cp->transform_z, ms->nb_coeffs_x);
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+ cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0,
+ cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x,
+ 1.0, alp, cctk_lsh[0] * cctk_lsh[1]);
+
+ total += gettime() - timer_start;
+
+ CCTK_TimerStop("MaximalSlicingAxi_Expand");
+
+ count++;
+ if (!(count & 15))
+ fprintf(stderr, "avg %g total %g\n", total / count, total);
+}
+
+int maximal_slicing_axi_register(void)
+{
+ Einstein_RegisterSlicing("maximal_axi");
+ return 0;
+}
+
+void maximal_slicing_axi_register_mol(CCTK_ARGUMENTS)
+{
+ MoLRegisterConstrained(CCTK_VarIndex("ADMBase::alp"));
+ MoLRegisterConstrained(CCTK_VarIndex("MaximalSlicingAxi::alpha_coeffs"));
+
+ MoLRegisterSaveAndRestoreGroup(CCTK_GroupIndex("ADMBase::metric"));
+ MoLRegisterSaveAndRestoreGroup(CCTK_GroupIndex("ADMBase::curv"));
+}