From a4915453f05ebdfeccd8f954026096e8f145fd06 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sun, 28 Feb 2016 10:19:06 +0100
Subject: Initial commit.

---
 configuration.ccl  |   2 +
 interface.ccl      |  16 +
 param.ccl          |  31 ++
 schedule.ccl       |  28 ++
 src/basis.c        | 375 +++++++++++++++++++++
 src/make.code.defn |   7 +
 src/qms.c          | 965 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/qms.h          | 301 +++++++++++++++++
 src/register.c     |   9 +
 src/solve.c        | 636 +++++++++++++++++++++++++++++++++++
 src/solve.cl       |  28 ++
 src/solve_cl.c     |  28 ++
 12 files changed, 2426 insertions(+)
 create mode 100644 configuration.ccl
 create mode 100644 interface.ccl
 create mode 100644 param.ccl
 create mode 100644 schedule.ccl
 create mode 100644 src/basis.c
 create mode 100644 src/make.code.defn
 create mode 100644 src/qms.c
 create mode 100644 src/qms.h
 create mode 100644 src/register.c
 create mode 100644 src/solve.c
 create mode 100644 src/solve.cl
 create mode 100644 src/solve_cl.c

diff --git a/configuration.ccl b/configuration.ccl
new file mode 100644
index 0000000..b4e7a1d
--- /dev/null
+++ b/configuration.ccl
@@ -0,0 +1,2 @@
+# Configuration definition for thorn QuasiMaximalSlicing
+
diff --git a/interface.ccl b/interface.ccl
new file mode 100644
index 0000000..324fc5f
--- /dev/null
+++ b/interface.ccl
@@ -0,0 +1,16 @@
+# Interface definition for thorn QuasiMaximalSlicing
+implements: QuasiMaximalSlicing
+
+INHERITS: ADMBase grid CoordBase MethodOfLines
+
+CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx)
+CCTK_INT FUNCTION MoLRegisterSaveAndRestoreGroup(CCTK_INT IN idx)
+
+REQUIRES FUNCTION MoLRegisterConstrained
+REQUIRES FUNCTION MoLRegisterSaveAndRestore
+REQUIRES FUNCTION MoLRegisterSaveAndRestoreGroup
+
+public:
+CCTK_REAL W TYPE=GF
+CCTK_REAL w_coeffs TYPE=array DIM=2 SIZE=basis_order_z,basis_order_r DISTRIB=constant
diff --git a/param.ccl b/param.ccl
new file mode 100644
index 0000000..e80c42b
--- /dev/null
+++ b/param.ccl
@@ -0,0 +1,31 @@
+# Parameter definitions for thorn QuasiMaximalSlicing
+
+RESTRICTED:
+CCTK_INT basis_order_r "Number of the basis functions in the radial direction" STEERABLE=recover
+{
+    1: :: ""
+} 40
+
+CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERABLE=recover
+{
+    1: :: ""
+} 40
+
+CCTK_REAL scale_factor "Scaling factor L for the SB basis" STEERABLE=recover
+{
+    0: :: ""
+} 3.0
+
+CCTK_REAL filter_power "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
+CCTK_REAL input_filter_power "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
+BOOLEAN export_coeffs "Export the coefficients of the spectral expansion in alpha_coeffs" STEERABLE=recover
+{
+} "no"
diff --git a/schedule.ccl b/schedule.ccl
new file mode 100644
index 0000000..07120fc
--- /dev/null
+++ b/schedule.ccl
@@ -0,0 +1,28 @@
+# Schedule definitions for thorn MaximalSlicingAxi
+#
+SCHEDULE quasimaximal_slicing_axi_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_lapse_evol {
+    LANG: C
+} "Quasimaximal slicing eval W"
+
+#SCHEDULE quasimaximal_slicing_axi_solve IN ML_BSSN_evolCalcGroup BEFORE quasimaximal_slicing_axi_eval {
+SCHEDULE quasimaximal_slicing_axi_solve IN MoL_PreStep {
+    LANG: C
+} "Quasimaximal slicing solve W"
+
+#SCHEDULE quasimaximal_slicing_axi IN MoL_PseudoEvolution {
+#    LANG: C
+#} "Quasimaximal slicing"
+
+SCHEDULE qms_register_mol IN MoL_Register {
+    LANG: C
+} ""
+
+SCHEDULE qms_init IN ADMBase_InitialData {
+    LANG: C
+} ""
+
+STORAGE: W
+
+if (export_coeffs) {
+    STORAGE: w_coeffs
+}
diff --git a/src/basis.c b/src/basis.c
new file mode 100644
index 0000000..1ca61d8
--- /dev/null
+++ b/src/basis.c
@@ -0,0 +1,375 @@
+
+#include <math.h>
+
+#include "qms.h"
+
+/*
+ * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
+ * SB(x, n) = sin((n + 1) arccot(|x| / L))
+ * They are symmetric wrt origin and decay as 1/x in infinity.
+ */
+static double sb_even_eval(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+
+    idx *= 2;   // even only
+
+    return sin((idx + 1) * val);
+}
+
+static double sb_even_eval_diff1(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+
+    idx *= 2;   // even only
+
+    return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static double sb_even_eval_diff2(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+
+    idx *= 2;   // even only
+
+    return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabs(coord) * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static double sb_even_colloc_point(int order, int idx)
+{
+    double t;
+
+    idx = order - idx - 1;
+    //order *= 2;
+
+    //t = (idx + 2) * M_PI / (order + 4);
+#if POLAR
+    t = (idx + 2) * M_PI / (2 * order + 3);
+#else
+    t = (idx + 2) * M_PI / (2 * order + 2);
+#endif
+    return SCALE_FACTOR / tan(t);
+}
+
+const BasisSet qms_sb_even_basis = {
+    .eval         = sb_even_eval,
+    .eval_diff1   = sb_even_eval_diff1,
+    .eval_diff2   = sb_even_eval_diff2,
+    .colloc_point = sb_even_colloc_point,
+};
+
+static double tb_even_eval(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+
+    idx++;
+    idx *= 2;   // even only
+
+    return cos(idx * val) - 1.0;
+}
+
+static double tb_even_eval_diff1(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+
+    idx++;
+    idx *= 2;   // even only
+
+    return SCALE_FACTOR * idx * SGN(coord) * sin(idx * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static double tb_even_eval_diff2(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+
+    idx++;
+    idx *= 2;   // even only
+
+    return -SCALE_FACTOR * idx * SGN(coord) * (2 * fabs(coord) * sin(idx * val) + SCALE_FACTOR * idx * cos(idx * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+}
+
+static double tb_even_colloc_point(int order, int idx)
+{
+    double t;
+
+    idx = order - idx - 1;
+    //order *= 2;
+
+    //t = (idx + 2) * M_PI / (order + 4);
+    t = (idx + 2) * M_PI / (2 * order + 4);
+    return SCALE_FACTOR / tan(t);
+}
+
+const BasisSet qms_tb_even_basis = {
+    .eval         = tb_even_eval,
+    .eval_diff1   = tb_even_eval_diff1,
+    .eval_diff2   = tb_even_eval_diff2,
+    .colloc_point = tb_even_colloc_point,
+};
+
+static double tl_eval(double coord, int idx)
+{
+    double t = 2 * atan2(sqrt(SCALE_FACTOR), sqrt(fabs(coord)));
+
+    //idx++;
+
+    return cos(idx * t);// - 1.0;
+}
+
+static double tl_eval_diff1(double coord, int idx)
+{
+    double y = fabs(coord);
+    double y12 = sqrt(y);
+    double t = 2 * atan2(sqrt(SCALE_FACTOR), y12);
+
+    //idx++;
+
+    if (y < 1e-10)
+        return -2.0 * SQR(idx) * pow(-1.0, idx) / SCALE_FACTOR;
+
+    return idx * sqrt(SCALE_FACTOR) * sin(idx * t) / (y12 * (y + SCALE_FACTOR));
+}
+
+static double tl_eval_diff2(double coord, int idx)
+{
+    double y = fabs(coord);
+    double y12 = sqrt(y);
+    double t = 2 * atan2(sqrt(SCALE_FACTOR), y12);
+
+    //idx++;
+
+    if (y < 1e-10)
+        return 4.0 * SQR(idx) * (SQR(idx) + 2) * pow(-1.0, idx) / (3.0 * SQR(SCALE_FACTOR));
+
+    return -(SCALE_FACTOR * SQR(idx) * cos(idx * t) / (y * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (y12 * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (2 * y * y12 * (y + SCALE_FACTOR)));
+}
+
+static double tl_colloc_point(int order, int idx)
+{
+    double t;
+
+    if (!idx)
+        return 0.0;
+
+    idx = order - idx - 1;
+
+    order++;
+    idx++;
+
+    t = (2 * idx + 1) * M_PI / (2 * order);
+    return SCALE_FACTOR / SQR(tan(0.5 * t));
+}
+
+const BasisSet qms_tl_basis = {
+    .eval         = tl_eval,
+    .eval_diff1   = tl_eval_diff1,
+    .eval_diff2   = tl_eval_diff2,
+    .colloc_point = tl_colloc_point,
+};
+
+static double full_eval(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+    int flag = idx & 1;
+
+    idx /= 2;
+
+    if (flag) return sin((2 * idx + 1) * 4 * val);
+    else      return cos((2 * idx + 2) * 4 * val) - 1;
+}
+
+static double full_eval_diff1(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+    int flag = idx & 1;
+
+    idx /= 2;
+
+    if (flag) {
+        idx = 2 * idx + 1;
+        return -4 * idx * SCALE_FACTOR * cos(idx * 4 * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+    } else {
+        idx = 2 * (idx + 1);
+        return 4 * idx * SCALE_FACTOR * sin(idx * 4 * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+    }
+}
+
+static double full_eval_diff2(double coord, int idx)
+{
+    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+    int flag = idx & 1;
+
+    idx /= 2;
+
+    if (flag) {
+        idx = 2 * idx + 1;
+        return (16 * SQR(idx * SCALE_FACTOR) * cos(idx * 4 * val) - 4 * idx * SCALE_FACTOR * sin(idx * 4 * val) * 2 * coord) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+    } else {
+        idx = 2 * (idx + 1);
+        return (-16 * SQR(idx * SCALE_FACTOR) * sin(idx * 4 * val) - 4 * idx * SCALE_FACTOR * cos(idx * 4 * val) * 2 * coord) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+    }
+}
+
+static double full_colloc_point(int order, int idx)
+{
+    double t;
+
+    idx = order - idx - 1;
+
+    t = (idx + 0.5) * M_PI / (2 * order);
+
+    return SCALE_FACTOR / tan(t);
+
+}
+
+const BasisSet qms_full_basis = {
+    .eval         = full_eval,
+    .eval_diff1   = full_eval_diff1,
+    .eval_diff2   = full_eval_diff2,
+    .colloc_point = full_colloc_point,
+};
+
+static double cheb_eval(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0));
+
+    //idx += 1;
+    return cos(idx * acos(x));
+}
+
+static double cheb_eval_diff1(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0));
+    double t = acos(x);
+
+    //idx += 1;
+
+    if (fabs(x - 1.0) < 1e-8)
+        return (2.0 / SCALE_FACTOR) * SQR(idx);
+    if (fabs(x + 1.0) < 1e-8)
+        return -(2.0 / SCALE_FACTOR) * SQR(idx) * pow(-1.0, idx);
+    return (2.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t);
+}
+
+static double cheb_eval_diff2(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0));
+    double t = acos(x);
+    double st = sin(t);
+
+    //idx += 1;
+
+    if (fabs(x - 1.0) < 1e-8)
+        return SQR(2.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.;
+    if (fabs(x + 1.0) < 1e-8)
+        return SQR(2.0 / SCALE_FACTOR) * pow(-1.0, idx) * SQR(idx) * (SQR(idx) - 1.0) / 3.;
+
+    return SQR(2.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st)));
+}
+
+static double cheb_colloc_point(int order, int idx)
+{
+    double y;
+
+    idx = order - idx - 1;
+
+    //order += 1;
+    //idx += 1;
+    y = cos(idx * M_PI / (order - 1));
+    //y = cos((2 * idx + 1) * M_PI / (2 * order));
+
+    return 0.5 * SCALE_FACTOR * (y + 1.0);
+}
+
+const BasisSet qms_cheb_basis = {
+    .eval       = cheb_eval,
+    .eval_diff1 = cheb_eval_diff1,
+    .eval_diff2 = cheb_eval_diff2,
+    .colloc_point = cheb_colloc_point,
+};
+
+static double cheb_even_eval(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0));
+
+    idx *= 2;
+    //idx += 1;
+    return cos(idx * acos(x));
+}
+
+static double cheb_even_eval_diff1(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0));
+    double t = acos(x);
+
+    idx *= 2;
+    //idx += 1;
+
+    if (fabs(fabs(x) - 1.0) < 1e-8)
+        return (1.0 / SCALE_FACTOR) * SQR(idx);
+    return (1.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t);
+}
+
+static double cheb_even_eval_diff2(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0));
+    double t = acos(x);
+    double st = sin(t);
+
+    idx *= 2;
+    //idx += 1;
+
+    if (fabs(fabs(x) - 1) < 1e-8)
+        return SQR(1.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.;
+
+    return SQR(1.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st)));
+}
+
+static double cheb_even_colloc_point(int order, int idx)
+{
+    double y;
+
+    idx = order - idx - 1;
+
+    //order += 1;
+    //idx += 1;
+    order *= 2;
+    y = cos(idx * M_PI / (order - 1));
+    y = cos((2 * idx + 2) * M_PI / (2 * order));
+
+    return SCALE_FACTOR * y;
+}
+
+const BasisSet qms_cheb_even_basis = {
+    .eval         = cheb_even_eval,
+    .eval_diff1   = cheb_even_eval_diff1,
+    .eval_diff2   = cheb_even_eval_diff2,
+    .colloc_point = cheb_even_colloc_point,
+};
+
+static double cos_even_eval(double coord, int idx)
+{
+    return cos(2 * idx * coord);
+}
+
+static double cos_even_eval_diff1(double coord, int idx)
+{
+    return -2 * idx * sin(2 * idx * coord);
+}
+
+static double cos_even_eval_diff2(double coord, int idx)
+{
+    return -4 * SQR(idx) * cos(2 * idx * coord);
+}
+
+static double cos_even_colloc_point(int order, int idx)
+{
+    return M_PI * idx / (2 * order - 0);
+}
+
+const BasisSet qms_cos_even_basis = {
+    .eval         = cos_even_eval,
+    .eval_diff1   = cos_even_eval_diff1,
+    .eval_diff2   = cos_even_eval_diff2,
+    .colloc_point = cos_even_colloc_point,
+};
diff --git a/src/make.code.defn b/src/make.code.defn
new file mode 100644
index 0000000..f31ed36
--- /dev/null
+++ b/src/make.code.defn
@@ -0,0 +1,7 @@
+# Main make.code.defn file for thorn MaximalSlicingAxi
+
+# Source files in this directory
+SRCS = basis.c qms.c register.c solve.c
+
+# Subdirectories containing source files
+SUBDIRS = 
diff --git a/src/qms.c b/src/qms.c
new file mode 100644
index 0000000..7332017
--- /dev/null
+++ b/src/qms.c
@@ -0,0 +1,965 @@
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include <cl.h>
+#include <clBLAS.h>
+
+#include "cctk.h"
+#include "cctk_Arguments.h"
+#include "cctk_Parameters.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "qms.h"
+
+#define ACC_TEST 0
+
+double scale_factor;
+
+/* mapping between our indices and thorn names */
+static const char *metric_vars[] = {
+#if CCZ4
+    [GTXX]  = "ML_CCZ4::gt11",
+    [GTYY]  = "ML_CCZ4::gt22",
+    [GTZZ]  = "ML_CCZ4::gt33",
+    [GTXY]  = "ML_CCZ4::gt12",
+    [GTXZ]  = "ML_CCZ4::gt13",
+    [GTYZ]  = "ML_CCZ4::gt23",
+    [ATXX]  = "ML_CCZ4::At11",
+    [ATYY]  = "ML_CCZ4::At22",
+    [ATZZ]  = "ML_CCZ4::At33",
+    [ATXY]  = "ML_CCZ4::At12",
+    [ATXZ]  = "ML_CCZ4::At13",
+    [ATYZ]  = "ML_CCZ4::At23",
+    [PHI]   = "ML_CCZ4::phi",
+    [K]     = "ML_CCZ4::trK",
+    [XTX]   = "ML_CCZ4::Xt1",
+    [XTY]   = "ML_CCZ4::Xt2",
+    [XTZ]   = "ML_CCZ4::Xt3",
+    [BETAX] = "ML_CCZ4::beta1",
+    [BETAY] = "ML_CCZ4::beta2",
+    [BETAZ] = "ML_CCZ4::beta3",
+    [ALPHA] = "ML_CCZ4::alpha",
+    [KDOT_XX] = "ML_CCZ4::Kdot11",
+    [KDOT_YY] = "ML_CCZ4::Kdot22",
+    [KDOT_ZZ] = "ML_CCZ4::Kdot33",
+    [KDOT_XY] = "ML_CCZ4::Kdot12",
+    [KDOT_XZ] = "ML_CCZ4::Kdot13",
+    [KDOT_YZ] = "ML_CCZ4::Kdot23",
+    [XTDOT_X] = "ML_CCZ4::Xtdot1",
+    [XTDOT_Y] = "ML_CCZ4::Xtdot2",
+    [XTDOT_Z] = "ML_CCZ4::Xtdot3",
+    [PHIDOT]  = "ML_CCZ4::phidot",
+#else
+    [GTXX]  = "ML_BSSN::gt11",
+    [GTYY]  = "ML_BSSN::gt22",
+    [GTZZ]  = "ML_BSSN::gt33",
+    [GTXY]  = "ML_BSSN::gt12",
+    [GTXZ]  = "ML_BSSN::gt13",
+    [GTYZ]  = "ML_BSSN::gt23",
+    [ATXX]  = "ML_BSSN::At11",
+    [ATYY]  = "ML_BSSN::At22",
+    [ATZZ]  = "ML_BSSN::At33",
+    [ATXY]  = "ML_BSSN::At12",
+    [ATXZ]  = "ML_BSSN::At13",
+    [ATYZ]  = "ML_BSSN::At23",
+    [PHI]   = "ML_BSSN::phi",
+    [K]     = "ML_BSSN::trK",
+    [XTX]   = "ML_BSSN::Xt1",
+    [XTY]   = "ML_BSSN::Xt2",
+    [XTZ]   = "ML_BSSN::Xt3",
+    [BETAX] = "ML_BSSN::beta1",
+    [BETAY] = "ML_BSSN::beta2",
+    [BETAZ] = "ML_BSSN::beta3",
+    [ALPHA] = "ML_BSSN::alpha",
+    //[ALPHA] = "ADMBase::alp",
+    [KDOT_XX] = "ML_BSSN::Kdot11",
+    [KDOT_YY] = "ML_BSSN::Kdot22",
+    [KDOT_ZZ] = "ML_BSSN::Kdot33",
+    [KDOT_XY] = "ML_BSSN::Kdot12",
+    [KDOT_XZ] = "ML_BSSN::Kdot13",
+    [KDOT_YZ] = "ML_BSSN::Kdot23",
+    [XTDOT_X] = "ML_BSSN::Xtdot1",
+    [XTDOT_Y] = "ML_BSSN::Xtdot2",
+    [XTDOT_Z] = "ML_BSSN::Xtdot3",
+    [PHIDOT]  = "ML_BSSN::phidot",
+#endif
+};
+
+/* mapping between the cactus grid values and interpolated values */
+static const CCTK_INT interp_operation_indices[] = {
+    [I_GTXX]     = GTXX,
+    [I_GTYY]     = GTYY,
+    [I_GTZZ]     = GTZZ,
+    [I_GTXY]     = GTXY,
+    [I_GTXZ]     = GTXZ,
+    [I_GTYZ]     = GTYZ,
+    [I_PHI]      = PHI,
+    [I_PHI_DX]   = PHI,
+    [I_PHI_DY]   = PHI,
+    [I_PHI_DZ]   = PHI,
+    [I_ATXX]     = ATXX,
+    [I_ATYY]     = ATYY,
+    [I_ATZZ]     = ATZZ,
+    [I_ATXY]     = ATXY,
+    [I_ATXZ]     = ATXZ,
+    [I_ATYZ]     = ATYZ,
+    [I_K]        = K,
+    [I_K_DX]     = K,
+    [I_K_DY]     = K,
+    [I_K_DZ]     = K,
+    [I_XTX]      = XTX,
+    [I_XTY]      = XTY,
+    [I_XTZ]      = XTZ,
+    [I_BETAX]    = BETAX,
+    [I_BETAY]    = BETAY,
+    [I_BETAZ]    = BETAZ,
+    [I_ALPHA]    = ALPHA,
+    [I_ALPHA_DX] = ALPHA,
+    [I_ALPHA_DY] = ALPHA,
+    [I_ALPHA_DZ] = ALPHA,
+    [I_ALPHA_DXX] = ALPHA,
+    [I_ALPHA_DYY] = ALPHA,
+    [I_ALPHA_DZZ] = ALPHA,
+    [I_ALPHA_DXY] = ALPHA,
+    [I_ALPHA_DXZ] = ALPHA,
+    [I_ALPHA_DYZ] = ALPHA,
+    [I_KDOT_XX]   = KDOT_XX,
+    [I_KDOT_YY]   = KDOT_YY,
+    [I_KDOT_ZZ]   = KDOT_ZZ,
+    [I_KDOT_XY]   = KDOT_XY,
+    [I_KDOT_XZ]   = KDOT_XZ,
+    [I_KDOT_YZ]   = KDOT_YZ,
+    [I_XTDOT_X]   = XTDOT_X,
+    [I_XTDOT_Y]   = XTDOT_Y,
+    [I_XTDOT_Z]   = XTDOT_Z,
+    [I_PHIDOT]    = PHIDOT,
+    [I_PHIDOT_DX] = PHIDOT,
+    [I_PHIDOT_DY] = PHIDOT,
+    [I_PHIDOT_DZ] = PHIDOT,
+};
+
+/* the operation (plain value or x/y/z-derivative) to apply during interpolation */
+static const CCTK_INT interp_operation_codes[] = {
+    [I_GTXX]     = 0,
+    [I_GTYY]     = 0,
+    [I_GTZZ]     = 0,
+    [I_GTXY]     = 0,
+    [I_GTXZ]     = 0,
+    [I_GTYZ]     = 0,
+    [I_PHI]      = 0,
+    [I_PHI_DX]   = 1,
+    [I_PHI_DY]   = 2,
+    [I_PHI_DZ]   = 3,
+    [I_ATXX]     = 0,
+    [I_ATYY]     = 0,
+    [I_ATZZ]     = 0,
+    [I_ATXY]     = 0,
+    [I_ATXZ]     = 0,
+    [I_ATYZ]     = 0,
+    [I_K]        = 0,
+    [I_K_DX]     = 1,
+    [I_K_DY]     = 2,
+    [I_K_DZ]     = 3,
+    [I_XTX]      = 0,
+    [I_XTY]      = 0,
+    [I_XTZ]      = 0,
+    [I_BETAX]    = 0,
+    [I_BETAY]    = 0,
+    [I_BETAZ]    = 0,
+    [I_ALPHA]    = 0,
+    [I_ALPHA_DX] = 1,
+    [I_ALPHA_DY] = 2,
+    [I_ALPHA_DZ] = 3,
+    [I_ALPHA_DXX] = 11,
+    [I_ALPHA_DYY] = 22,
+    [I_ALPHA_DZZ] = 33,
+    [I_ALPHA_DXY] = 12,
+    [I_ALPHA_DXZ] = 13,
+    [I_ALPHA_DYZ] = 23,
+    [I_KDOT_XX]   = 0,
+    [I_KDOT_YY]   = 0,
+    [I_KDOT_ZZ]   = 0,
+    [I_KDOT_XY]   = 0,
+    [I_KDOT_XZ]   = 0,
+    [I_KDOT_YZ]   = 0,
+    [I_XTDOT_X]   = 0,
+    [I_XTDOT_Y]   = 0,
+    [I_XTDOT_Z]   = 0,
+    [I_PHIDOT]    = 0,
+    [I_PHIDOT_DX] = 1,
+    [I_PHIDOT_DY] = 2,
+    [I_PHIDOT_DZ] = 3,
+};
+
+static void init_opencl(MaximalSlicingContext *ms)
+{
+    int err, count;
+    cl_platform_id platform;
+    cl_context_properties props[3];
+
+    err = clGetPlatformIDs(1, &platform, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL platform ID\n");
+        return;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ms->ocl_device, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL device ID\n");
+        return;
+    }
+
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties)platform;
+    props[2] = 0;
+
+    ms->cl_ctx = clCreateContext(props, 1, &ms->ocl_device, NULL, NULL, &err);
+    if (err != CL_SUCCESS || !ms->cl_ctx) {
+        fprintf(stderr, "Could not create an OpenCL context\n");
+        return;
+    }
+
+    ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, ms->ocl_device, 0, &err);
+    if (err != CL_SUCCESS || !ms->cl_queue) {
+        fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
+        goto fail;
+    }
+
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error setting up clBLAS\n");
+        goto fail;
+    }
+
+    ms->ocl_coeffs       = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+
+    ms->bicgstab.cl_p    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_v    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_y    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_z    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_t    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_res  = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_tmp  = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err);
+
+    ms->bicgstab.cl_k    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_mat  = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
+
+    ms->bicgstab.cl_rho    = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+    ms->bicgstab.cl_alpha  = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+    ms->bicgstab.cl_beta   = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+    ms->bicgstab.cl_omega  = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err);
+    ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
+
+    return;
+fail:
+    if (ms->cl_queue)
+        clReleaseCommandQueue(ms->cl_queue);
+    ms->cl_queue = 0;
+
+    if (ms->cl_ctx)
+        clReleaseContext(ms->cl_ctx);
+    ms->cl_ctx = 0;
+}
+
+static void construct_filter_matrix(MaximalSlicingContext *ms, double filter_power)
+{
+    char  equed = 'N';
+    double cond, ferr, berr, rpivot;
+
+    double *m, *minv, *scale, *tmp;
+    int *ipiv;
+    int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
+    int N = ms->nb_coeffs;
+
+    ms->input_filter = malloc(sizeof(*m) * N * N);
+
+    m     = malloc(sizeof(*m) * N * N);
+    minv  = malloc(sizeof(*m) * N * N);
+    scale = malloc(sizeof(*m) * N * N);
+    tmp   = malloc(sizeof(*m) * N * N);
+    ipiv  = malloc(sizeof(*ipiv) * N);
+
+#define   BASIS_X (ms->basis_x_val  [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define   BASIS_Z (ms->basis_z_val  [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
+        for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
+            int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+            for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+                for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                    const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                    minv[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;
+                    scale[idx_grid + ms->nb_colloc_points * idx_coeff] = (idx_grid == idx_coeff) ? 
+                                                                         exp(-36.0 * pow((double)idx_grid_x / ms->nb_coeffs_x, filter_power)) *
+                                                                         exp(-36.0 * pow((double)idx_grid_z / ms->nb_coeffs_z, filter_power)) : 0.0;
+
+                    scale[idx_grid + ms->nb_colloc_points * idx_coeff] = (idx_grid == idx_coeff) ?  1.0 : 0.0;
+                    //if (idx_coeff_z == idx_grid_z && idx_coeff_z == 0 && idx_grid_x == idx_coeff_x)
+                    //    fprintf(stderr, "%d %g\n", idx_coeff_x, scale[idx_grid + ms->nb_colloc_points * idx_coeff]);
+                }
+        }
+    }
+
+    memcpy(m, minv, sizeof(*m) * N * N);
+    LAPACKE_dgetrf(LAPACK_COL_MAJOR, N, N, m, N, ipiv);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, m, N, ipiv);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                N, N, N, 1.0, scale, N, m, N, 0.0, tmp, N);
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                N, N, N, 1.0, minv, N, tmp, N, 0.0, ms->input_filter, N);
+
+    free(m);
+    free(minv);
+    free(scale);
+    free(tmp);
+    free(ipiv);
+}
+
+static MaximalSlicingContext *init_ms(cGH *cctkGH,
+                                      int basis_order_r, int basis_order_z,
+                                      double sf, double filter_power, double input_filter_power,
+                                      CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
+                                      const int grid_size[3])
+{
+    MaximalSlicingContext *ms;
+    int ret;
+
+    ms = calloc(1, sizeof(*ms));
+
+    ms->gh = cctkGH;
+
+    ms->basis      = &qms_sb_even_basis;
+    //ms->basis      = &qms_cheb_basis;
+    //ms->basis      = &qms_cheb_even_basis;
+    //ms->basis      = &qms_tl_basis;
+#if POLAR
+    ms->basis1       = &qms_cos_even_basis;
+#else
+    ms->basis1       = &qms_sb_even_basis;
+#endif
+
+    ms->nb_coeffs_x = basis_order_r;
+    ms->nb_coeffs_z = basis_order_z;
+
+    ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z;
+
+    ms->nb_colloc_points_x = basis_order_r;
+    ms->nb_colloc_points_z = basis_order_z;
+
+    ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z;
+
+    if (ms->nb_colloc_points != ms->nb_coeffs)
+        CCTK_WARN(0, "Non-square collocation matrix");
+
+    ms->colloc_grid_order_x = ms->nb_colloc_points_x;
+    ms->colloc_grid_order_z = ms->nb_colloc_points_z;
+
+    ms->mat    = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
+    ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->rhs    = malloc(sizeof(double) * ms->nb_colloc_points);
+
+    ms->coeffs_eval = malloc(sizeof(double) * ms->nb_coeffs);
+    for (int i = 0; i < ARRAY_ELEMS(ms->solution_cache); i++) {
+        ms->solution_cache[i].coeffs = malloc(sizeof(double) * ms->nb_coeffs);
+        if (!ms->solution_cache[i].coeffs)
+            CCTK_WARN(0, "malloc failure");
+    }
+
+    ms->mat_f  = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
+    ms->ipiv   = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs);
+
+#if 1
+    scale_factor = 1.0;
+
+    //scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] * 0.75) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1);
+    scale_factor = (64.0 / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1));
+    //scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)]);
+    //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5;
+    fprintf(stderr, "scale factor %16.16g\n", scale_factor);
+
+#else
+    scale_factor = sf;
+#endif
+
+    /* initialize the collocation grid */
+    posix_memalign((void**)&ms->colloc_grid[0], 32, ms->nb_colloc_points_x * sizeof(*ms->colloc_grid[0]));
+    posix_memalign((void**)&ms->colloc_grid[1], 32, ms->nb_colloc_points_z * sizeof(*ms->colloc_grid[1]));
+
+    for (int i = 0; i < ms->nb_colloc_points_x; i++) {
+        ms->colloc_grid[0][i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i);
+        fprintf(stderr, "%d %g\n", i, ms->colloc_grid[0][i]);
+    }
+    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+        ms->colloc_grid[1][i] = ms->basis1->colloc_point(ms->colloc_grid_order_z, i);
+        fprintf(stderr, "%d %g\n", i, ms->colloc_grid[1][i] / (POLAR ? M_PI : 1.0));
+    }
+
+    /* precompute the basis values we will need */
+    ms->basis_x_val   = malloc(sizeof(*ms->basis_x_val)   * ms->nb_colloc_points_x * ms->nb_coeffs_x);
+    ms->basis_x_dval  = malloc(sizeof(*ms->basis_x_dval)  * ms->nb_colloc_points_x * ms->nb_coeffs_x);
+    ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
+    for (int i = 0; i < ms->nb_colloc_points_x; i++) {
+        CCTK_REAL coord = ms->colloc_grid[0][i];
+        for (int j = 0; j < ms->nb_coeffs_x; j++) {
+            ms->basis_x_val  [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j);
+            ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j);
+            ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j);
+        }
+    }
+
+    ms->basis_z_val   = malloc(sizeof(*ms->basis_z_val)   * ms->nb_colloc_points_z * ms->nb_coeffs_z);
+    ms->basis_z_dval  = malloc(sizeof(*ms->basis_z_dval)  * ms->nb_colloc_points_z * ms->nb_coeffs_z);
+    ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
+    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+        CCTK_REAL coord = ms->colloc_grid[1][i];
+        for (int j = 0; j < ms->nb_coeffs_z; j++) {
+            ms->basis_z_val  [i * ms->nb_coeffs_z + j] = ms->basis1->eval(coord, j);
+            ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis1->eval_diff1(coord, j);
+            ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis1->eval_diff2(coord, j);
+        }
+    }
+
+    posix_memalign((void**)&ms->basis_val_00, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
+    posix_memalign((void**)&ms->basis_val_11, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
+    posix_memalign((void**)&ms->basis_val_10, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
+    posix_memalign((void**)&ms->basis_val_01, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
+    posix_memalign((void**)&ms->basis_val_02, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
+    posix_memalign((void**)&ms->basis_val_20, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
+    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+        const double *basis_z   = ms->basis_z_val + i * ms->nb_coeffs_z;
+        const double *dbasis_z  = ms->basis_z_dval + i * ms->nb_coeffs_z;
+        const double *d2basis_z = ms->basis_z_d2val + i * ms->nb_coeffs_z;
+
+        for (int j = 0; j < ms->nb_colloc_points_x; j++) {
+            const double *basis_x   = ms->basis_x_val   + j * ms->nb_coeffs_x;
+            const double *dbasis_x  = ms->basis_x_dval  + j * ms->nb_coeffs_x;
+            const double *d2basis_x = ms->basis_x_d2val + j * ms->nb_coeffs_x;
+            const int idx_grid = i * ms->nb_colloc_points_x + j;
+
+#if POLAR
+            double r     = ms->colloc_grid[0][j];
+            double theta = ms->colloc_grid[1][i];
+
+            double x = r * cos(theta);
+            double z = r * sin(theta);
+#else
+            double x = ms->colloc_grid[0][j];
+            double z = ms->colloc_grid[1][i];
+#endif
+
+            for (int k = 0; k < ms->nb_coeffs_z; k++)
+                for (int l = 0; l < ms->nb_coeffs_x; l++) {
+                    const int idx_coeff = k * ms->nb_coeffs_x + l;
+                    const int idx = idx_grid + ms->nb_colloc_points * idx_coeff;
+                    ms->basis_val_00[idx] =   basis_x[l] *   basis_z[k];
+#if POLAR
+                    ms->basis_val_10[idx] =  ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0);
+                    ms->basis_val_01[idx] =  ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0);
+                    ms->basis_val_20[idx] =  ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                           + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k]
+                                                           + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
+                                                           - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+                    ms->basis_val_02[idx] =  ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                           + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k]
+                                                           - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
+                                                           + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+                    ms->basis_val_11[idx] =  ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                           - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k]
+                                                           - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k]
+                                                           + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+#else
+                    ms->basis_val_10[idx] =  dbasis_x[l] *   basis_z[k];
+                    ms->basis_val_01[idx] =   basis_x[l] *  dbasis_z[k];
+                    ms->basis_val_20[idx] = d2basis_x[l] *   basis_z[k];
+                    ms->basis_val_02[idx] =   basis_x[l] * d2basis_z[k];
+                    ms->basis_val_11[idx] =  dbasis_x[l] *  dbasis_z[k];
+#endif
+                }
+        }
+    }
+
+    posix_memalign((void**)&ms->eq_coeff_00, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
+    posix_memalign((void**)&ms->eq_coeff_11, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
+    posix_memalign((void**)&ms->eq_coeff_10, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
+    posix_memalign((void**)&ms->eq_coeff_01, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
+    posix_memalign((void**)&ms->eq_coeff_02, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
+    posix_memalign((void**)&ms->eq_coeff_20, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
+
+    ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0]));
+    ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1]));
+    ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2]));
+    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
+        for (int j = 0; j < ms->nb_colloc_points_x; j++) {
+#if POLAR
+            double phi = ms->colloc_grid[1][i];
+            double r = ms->colloc_grid[0][j];
+
+            double x = r * cos(phi);
+            double z = r * sin(phi);
+#else
+            double x = ms->colloc_grid[0][j];
+            double z = ms->colloc_grid[1][i];
+#endif
+
+            ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x;
+            ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0;
+            ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z;
+        }
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++)
+        ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i]));
+
+    ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij));
+
+    ms->coeff_scale = malloc(ms->nb_coeffs * sizeof(double));
+    for (int j = 0; j < ms->nb_coeffs_z; j++)
+        for (int i = 0; i < ms->nb_coeffs_x; i++) {
+            //ms->coeff_scale[j * ms->nb_coeffs_x + i] = 1.0;
+            ms->coeff_scale[j * ms->nb_coeffs_x + i] = exp(-36.0 * pow((double)i / ms->nb_coeffs_x, filter_power)) *
+                                                       exp(-36.0 * pow((double)j / ms->nb_coeffs_z, filter_power));
+            //ms->coeff_scale[j * ms->nb_coeffs_x + i] = ((i < (2.0 / 3.0) * ms->nb_coeffs_x) ? 1.0 : SQR(cos((((double)i / ms->nb_coeffs_x) - (2.0 / 3.0)) * 3.0 * M_PI / 2.0))) *
+            //                                           ((j < (2.0 / 3.0) * ms->nb_coeffs_z) ? 1.0 : SQR(cos((((double)j / ms->nb_coeffs_z) - (2.0 / 3.0)) * 3.0 * M_PI / 2.0)));
+        }
+
+    for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) {
+        ms->interp_values[i]           = malloc(sizeof(*ms->interp_values[i]) * ms->nb_colloc_points);
+        ms->interp_values_prefilter[i]  = malloc(sizeof(*ms->interp_values[i]) * ms->nb_colloc_points);
+        if (!ms->interp_values[i] || !ms->interp_values_prefilter[i])
+            CCTK_WARN(0, "Malloc failure");
+        ms->interp_value_codes[i]      = CCTK_VARIABLE_REAL;
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) {
+        ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
+        if (ms->interp_vars_indices[i] < 0)
+            CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]);
+    }
+
+    ms->coord_system = CCTK_CoordSystemHandle("cart3d");
+    if (ms->coord_system < 0)
+        CCTK_WARN(0, "Error getting the coordinate system");
+
+    ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)");
+    if (ms->interp_operator < 0)
+        CCTK_WARN(0, "Error getting the interpolation operator");
+
+    ms->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1");
+    if (ms->interp_params < 0)
+        CCTK_WARN(0, "Error creating interpolation parameters table");
+
+    ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
+                                interp_operation_codes, "operation_codes");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operation codes");
+
+    ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
+                                interp_operation_indices, "operand_indices");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operand indices");
+
+    ms->bicgstab.p    = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.v    = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.y    = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.z    = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.t    = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.res  = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs);
+    ms->bicgstab.k    = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
+
+    ms->steps_since_inverse = INT_MAX;
+
+    init_opencl(ms);
+
+    construct_filter_matrix(ms, input_filter_power);
+
+    CCTK_TimerCreate("MaximalSlicingAxi_Solve");
+    CCTK_TimerCreate("MaximalSlicingAxi_Expand");
+    CCTK_TimerCreate("MaximalSlicingAxi_interp_geometry");
+    CCTK_TimerCreate("MaximalSlicingAxi_calc_eq_coeffs");
+    CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix");
+    CCTK_TimerCreate("MaximalSlicingAxi_filter_input");
+    CCTK_TimerCreate("MaximalSlicingAxi_solve_LU");
+    CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB");
+    CCTK_TimerCreate("MaximalSlicingAxi_Polish");
+
+    return ms;
+}
+
+static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
+                                   CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z)
+{
+    cGH *cctkGH = ms->gh;
+
+    CoordPatch *cp;
+    int64_t grid_size;
+    int i;
+
+    for (int i = 0; i < ms->nb_patches; i++) {
+        cp = &ms->patches[i];
+
+        if (cp->origin[0] == ms->gh->cctk_origin_space[0] &&
+            cp->origin[1] == ms->gh->cctk_origin_space[1] &&
+            cp->origin[2] == ms->gh->cctk_origin_space[2] &&
+            cp->size[0]   == ms->gh->cctk_lsh[0]          &&
+            cp->size[1]   == ms->gh->cctk_lsh[1]          &&
+            cp->size[2]   == ms->gh->cctk_lsh[2]          &&
+            cp->delta[0]  == ms->gh->cctk_levfac[0]  &&
+            cp->delta[1]  == ms->gh->cctk_levfac[1]  &&
+            cp->delta[2]  == ms->gh->cctk_levfac[2])
+            return cp;
+    }
+
+    grid_size = cctkGH->cctk_lsh[0] * cctkGH->cctk_lsh[1] * cctkGH->cctk_lsh[2];
+
+    /* create a new patch */
+    ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1));
+    cp = &ms->patches[ms->nb_patches];
+
+    memset(cp, 0, sizeof(*cp));
+
+    memcpy(cp->origin, ms->gh->cctk_origin_space, sizeof(cp->origin));
+    memcpy(cp->size,   ms->gh->cctk_lsh,          sizeof(cp->size));
+    memcpy(cp->delta,  ms->gh->cctk_levfac,  sizeof(cp->delta));
+
+    for (i = 0; i < cp->size[1]; i++)
+        if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) {
+            cp->y_idx = i;
+            break;
+        }
+    if (i == cp->size[1])
+        CCTK_WARN(0, "The grid does not include y==0");
+
+#if 0
+    posix_memalign((void**)&cp->basis_val_r,   32, sizeof(*cp->basis_val_r)   * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]);
+    for (int j = 0; j < ms->gh->cctk_lsh[1]; j++)
+        for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
+            CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
+            CCTK_REAL yy = y[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
+            CCTK_REAL r = sqrt(SQR(xx) + SQR(yy));
+
+            for (int k = 0; k < ms->nb_coeffs_x; k++)
+                //cp->basis_val_r  [(j * ms->gh->cctk_lsh[0] + i) * ms->nb_coeffs_x + k] = ms->basis->eval(r, k);
+                cp->basis_val_r  [(j * ms->gh->cctk_lsh[0] + i) + ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0] * k] = ms->basis->eval(r, k);
+        }
+
+    posix_memalign((void**)&cp->basis_val_z,   32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]);
+    for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
+        CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
+        for (int j = 0; j < ms->nb_coeffs_z; j++)
+            cp->basis_val_z  [i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j);
+            //cp->basis_val_z  [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j);
+    }
+    posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x);
+    posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size);
+    for (int i = 0; i < grid_size; i++)
+        cp->one[i] = 1.0;
+#else
+    posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[0]);
+    for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
+        CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)];
+
+        for (int k = 0; k < ms->nb_coeffs_x; k++)
+            cp->basis_val_r[i * ms->nb_coeffs_x + k] = ms->basis->eval(fabs(xx), k);
+    }
+
+    posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]);
+    for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
+        CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
+        for (int j = 0; j < ms->nb_coeffs_z; j++)
+            cp->basis_val_z[i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j);
+    }
+
+    posix_memalign((void**)&cp->transform_matrix,  32, sizeof(*cp->transform_matrix)  * ms->nb_coeffs_x * cp->size[0] * cp->size[2]);
+    posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * ms->nb_coeffs_z * cp->size[0] * cp->size[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cp->size[2]; j++) {
+        CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, j)];
+
+        for (int i = 0; i < cp->size[0]; i++) {
+            const int idx_grid = j * cp->size[0] + i;
+
+            CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)];
+
+#if POLAR
+            double coord0 = sqrt(SQR(xx) + SQR(zz));
+            double coord1 = atan2(zz, xx);
+#else
+            double coord0 = xx;
+            double coord1 = zz;
+#endif
+
+            //for (int k = 0; k < ms->nb_coeffs_z; k++)
+            //    for (int l = 0; l < ms->nb_coeffs_x; l++) {
+            //        const int idx_coeff = k * ms->nb_coeffs_x + l;
+            //        cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = ms->basis->eval(r, l) * ms->basis1->eval(phi, k);
+            //    }
+            for (int k = 0; k < ms->nb_coeffs_x; k++)
+                cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = ms->basis->eval(coord0, k);
+            for (int k = 0; k < ms->nb_coeffs_z; k++)
+                cp->transform_matrix1[idx_grid * ms->nb_coeffs_z + k] = ms->basis1->eval(coord1, k);
+        }
+    }
+    posix_memalign((void**)&cp->transform_tmp,  32, sizeof(*cp->transform_tmp)  * cp->size[0] * cp->size[2] * ms->nb_coeffs_z);
+#endif
+
+    ms->nb_patches++;
+    return cp;
+}
+
+static MaximalSlicingContext *ms_context;
+
+void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS)
+{
+    MaximalSlicingContext *ms;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    double time;
+
+    if (!ms_context) {
+        ms_context = init_ms(cctkGH, basis_order_r, basis_order_z,
+                             scale_factor, filter_power, input_filter_power, x, y, z, cctk_lsh);
+    }
+    ms = ms_context;
+
+    time = cctkGH->cctk_time / ms->gh->cctk_delta_time;
+
+    if (ms->gh->cctk_levfac[0] != 1 ||
+        fabs(time - ceilf(time)) > 1e-8)
+        return;
+
+    fprintf(stderr, "qms solve: time %g %g\n", ms->gh->cctk_time, time);
+
+    CCTK_TimerStart("MaximalSlicingAxi_Solve");
+    qms_maximal_solve(ms);
+    CCTK_TimerStop("MaximalSlicingAxi_Solve");
+
+    if (export_coeffs)
+        memcpy(w_coeffs, ms->coeffs, sizeof(*w_coeffs) * ms->nb_coeffs);
+
+    if (1) {
+        double *tmp;
+        if (ms->nb_solutions == ARRAY_ELEMS(ms->solution_cache)) {
+            tmp = ms->solution_cache[0].coeffs;
+            memmove(ms->solution_cache, ms->solution_cache + 1, sizeof(ms->solution_cache[0]) * (ARRAY_ELEMS(ms->solution_cache) - 1));
+        } else {
+            ms->nb_solutions++;
+            tmp = ms->solution_cache[ms->nb_solutions - 1].coeffs;
+        }
+        ms->solution_cache[ms->nb_solutions - 1].coeffs = ms->coeffs;
+        ms->solution_cache[ms->nb_solutions - 1].time   = ms->gh->cctk_time;
+
+        ms->coeffs = tmp;
+    }
+}
+
+void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS)
+{
+    MaximalSlicingContext *ms;
+
+    CoordPatch *cp;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    int64_t expand_start, totaltime_start;
+
+    double *coeffs = NULL;
+    int i, ret;
+
+    totaltime_start = gettime();
+
+    /* on the first run, init the solver */
+    if (!ms_context) {
+        ms_context = init_ms(cctkGH, basis_order_r, basis_order_z,
+                             scale_factor, filter_power, input_filter_power, x, y, z, cctk_lsh);
+    }
+    ms = ms_context;
+
+    cp = get_coord_patch(ms, x, y, z);
+
+#if 0
+    coeffs = ms->coeffs;
+#else
+    coeffs = ms->coeffs_eval;
+
+    if (ms->nb_solutions < 2) {
+        memset(coeffs, 0, sizeof(*coeffs) * ms->nb_coeffs);
+        //fprintf(stderr, "qms eval: time %g zero\n", ms->gh->cctk_time);
+    } else {
+        double *coeffs0 = ms->solution_cache[ms->nb_solutions - 2].coeffs;
+        double *coeffs1 = ms->solution_cache[ms->nb_solutions - 1].coeffs;
+        double time0 = ms->solution_cache[ms->nb_solutions - 2].time;
+        double time1 = ms->solution_cache[ms->nb_solutions - 1].time;
+        double time  = ms->gh->cctk_time;
+
+        //fprintf(stderr, "qms eval: time %g interp from %g %g\n", ms->gh->cctk_time, time0, time1);
+
+        for (int i = 0; i < ms->nb_coeffs; i++)
+            coeffs[i] = coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1);
+
+    }
+#endif
+
+
+    CCTK_TimerStart("MaximalSlicingAxi_Expand");
+    expand_start = gettime();
+#if 0
+#pragma omp parallel for
+    for (int k = 0; k < cctk_lsh[2]; k++) {
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k);
+            double xx = x[idx];
+            double zz = z[idx];
+            double r = sqrt(SQR(xx) + SQR(zz));
+            double phi = atan2(zz, xx);
+
+            double val = 1.0;
+
+            for (int l = 0; l < ms->nb_coeffs_z; l++) {
+                double tmp = 0.0;
+                for (int m = 0; m < ms->nb_coeffs_x; m++) {
+                    const int idx_coeff = l * ms->nb_coeffs_x + m;
+                    tmp += ms->coeffs[idx_coeff] * ms->basis->eval(r, m);
+                }
+                val += tmp * ms->basis1->eval(phi, l);
+            }
+
+            alp[idx] = val;
+        }
+    }
+#else
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                cctk_lsh[0] * cctk_lsh[2], ms->nb_coeffs_z, ms->nb_coeffs_x,
+                1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2],
+                coeffs, ms->nb_coeffs_x, 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cctk_lsh[2]; j++)
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            const int idx_grid = j * cctk_lsh[0] + i;
+            const double val = cblas_ddot(ms->nb_coeffs_z, cp->transform_matrix1 + idx_grid * ms->nb_coeffs_z, 1,
+                                          cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
+            W[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val;
+        }
+#endif
+    //memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
+    //memset(alp, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
+    //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    //            ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0,
+    //            coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z,
+    //            0.0, cp->transform_z, ms->nb_coeffs_x);
+    //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    //            cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0,
+    //            cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x,
+    //            1.0, alp, cctk_lsh[0] * cctk_lsh[1]);
+
+    ms->grid_expand_time += gettime() - expand_start;
+    ms->grid_expand_count++;
+
+    CCTK_TimerStop("MaximalSlicingAxi_Expand");
+
+    ms->solve_time += gettime() - totaltime_start;
+    ms->solve_count++;
+
+    /* print stats */
+    if (!(ms->solve_count & 255)) {
+        fprintf(stderr,
+                "maximal slicing solves: %lu, "
+                "total time %g s, avg time per call %g ms\n",
+                ms->solve_count, (double)ms->solve_time / 1e6,
+                (double)ms->solve_time / ms->solve_count / 1e3);
+        fprintf(stderr,
+                "%g%% interpolate geometry: %lu, "
+                "total time %g s, avg time per call %g ms\n",
+                (double)ms->interp_geometry_time * 100 / ms->solve_time,
+                ms->interp_geometry_count, (double)ms->interp_geometry_time / 1e6,
+                (double)ms->interp_geometry_time / ms->interp_geometry_count / 1e3);
+        fprintf(stderr,
+                "%g%% calc equation coefficients: %lu, "
+                "total time %g s, avg time per call %g ms\n",
+                (double)ms->calc_eq_coeffs_time * 100 / ms->solve_time,
+                ms->calc_eq_coeffs_count, (double)ms->calc_eq_coeffs_time / 1e6,
+                (double)ms->calc_eq_coeffs_time / ms->calc_eq_coeffs_count / 1e3);
+        fprintf(stderr,
+                "%g%% pseudospectral matrix construction: %lu, "
+                "total time %g s, avg time per call %g ms\n",
+                (double)ms->construct_matrix_time * 100 / ms->solve_time,
+                ms->construct_matrix_count, (double)ms->construct_matrix_time / 1e6,
+                (double)ms->construct_matrix_time / ms->construct_matrix_count / 1e3);
+        fprintf(stderr,
+                "%g%% BiCGSTAB %lu solves, "
+                "%lu iterations, total time %g s, "
+                "avg iterations per solve %g, avg time per solve %g ms, "
+                "avg time per iteration %g ms\n",
+                (double)ms->cg_time_total * 100 / ms->solve_time,
+                ms->cg_solve_count, ms->cg_iter_count, (double)ms->cg_time_total / 1e6,
+                (double)ms->cg_iter_count / ms->cg_solve_count,
+                (double)ms->cg_time_total / ms->cg_solve_count / 1e3,
+                (double)ms->cg_time_total / ms->cg_iter_count / 1e3);
+        fprintf(stderr,
+                "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n",
+                (double)ms->lu_solves_time * 100 / ms->solve_time,
+                ms->lu_solves_count, (double)ms->lu_solves_time / 1e6,
+                (double)ms->lu_solves_time / ms->lu_solves_count / 1e3);
+        fprintf(stderr,
+                "%g%% grid expansion: %lu, total time %g s, avg time per call %g ms\n",
+                (double)ms->grid_expand_time * 100 / ms->solve_time,
+                ms->grid_expand_count, (double)ms->grid_expand_time / 1e6,
+                (double)ms->grid_expand_time / ms->grid_expand_count / 1e3);
+    }
+}
+
+void qms_init(CCTK_ARGUMENTS)
+{
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    double *Kdot11 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot11");
+    double *Kdot22 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot22");
+    double *Kdot33 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot33");
+    double *Kdot12 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot12");
+    double *Kdot13 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot13");
+    double *Kdot23 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot23");
+
+    double *Xtdot1 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Xtdot1");
+    double *Xtdot2 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Xtdot2");
+    double *Xtdot3 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Xtdot3");
+
+    double *phidot = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::phidot");
+
+    for (int k = 0; k < cctk_lsh[2]; k++)
+        for (int j = 0; j < cctk_lsh[1]; j++)
+            for (int i = 0; i < cctk_lsh[0]; i++) {
+                int idx = CCTK_GFINDEX3D(cctkGH, i, j, k);
+
+                Kdot11[idx] = 0.0;
+                Kdot22[idx] = 0.0;
+                Kdot33[idx] = 0.0;
+                Kdot12[idx] = 0.0;
+                Kdot13[idx] = 0.0;
+                Kdot23[idx] = 0.0;
+
+                Xtdot1[idx] = 0.0;
+                Xtdot2[idx] = 0.0;
+                Xtdot3[idx] = 0.0;
+
+                phidot[idx] = 0.0;
+            }
+}
diff --git a/src/qms.h b/src/qms.h
new file mode 100644
index 0000000..f88c0a8
--- /dev/null
+++ b/src/qms.h
@@ -0,0 +1,301 @@
+
+#include <inttypes.h>
+
+#include <cl.h>
+
+#include "cctk.h"
+
+#define POLAR (1)
+
+#define CCZ4 (0)
+
+#define SQR(x) ((x) * (x))
+#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) > (y) ? (y) : (x))
+#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr))
+
+/*
+ * small number to avoid r=0 singularities
+ */
+#define EPS 1E-08
+
+#define SCALE_FACTOR scale_factor
+
+/* indices (in our code, not cactus structs) of the grid functions which we'll need to
+ * interpolate on the pseudospectral grid */
+enum MetricVars {
+    GTXX = 0,
+    GTYY,
+    GTZZ,
+    GTXY,
+    GTXZ,
+    GTYZ,
+    PHI,
+    ATXX,
+    ATYY,
+    ATZZ,
+    ATXY,
+    ATXZ,
+    ATYZ,
+    K,
+    XTX,
+    XTY,
+    XTZ,
+    BETAX,
+    BETAY,
+    BETAZ,
+    ALPHA,
+    KDOT_XX,
+    KDOT_YY,
+    KDOT_ZZ,
+    KDOT_XY,
+    KDOT_XZ,
+    KDOT_YZ,
+    XTDOT_X,
+    XTDOT_Y,
+    XTDOT_Z,
+    PHIDOT,
+    NB_METRIC_VARS,
+};
+
+/* indices of the interpolated values of the above grid functions and their derivatives */
+enum InterpMetricVars {
+    I_GTXX = 0,
+    I_GTYY,
+    I_GTZZ,
+    I_GTXY,
+    I_GTXZ,
+    I_GTYZ,
+    I_PHI,
+    I_PHI_DX,
+    I_PHI_DY,
+    I_PHI_DZ,
+    I_ATXX,
+    I_ATYY,
+    I_ATZZ,
+    I_ATXY,
+    I_ATXZ,
+    I_ATYZ,
+    I_K,
+    I_K_DX,
+    I_K_DY,
+    I_K_DZ,
+    I_XTX,
+    I_XTY,
+    I_XTZ,
+    I_BETAX,
+    I_BETAY,
+    I_BETAZ,
+    I_ALPHA,
+    I_ALPHA_DX,
+    I_ALPHA_DY,
+    I_ALPHA_DZ,
+    I_ALPHA_DXX,
+    I_ALPHA_DYY,
+    I_ALPHA_DZZ,
+    I_ALPHA_DXY,
+    I_ALPHA_DXZ,
+    I_ALPHA_DYZ,
+    I_KDOT_XX,
+    I_KDOT_YY,
+    I_KDOT_ZZ,
+    I_KDOT_XY,
+    I_KDOT_XZ,
+    I_KDOT_YZ,
+    I_XTDOT_X,
+    I_XTDOT_Y,
+    I_XTDOT_Z,
+    I_PHIDOT,
+    I_PHIDOT_DX,
+    I_PHIDOT_DY,
+    I_PHIDOT_DZ,
+    NB_INTERP_VARS,
+};
+
+/* a set of basis functions */
+typedef struct BasisSet {
+    /* evaluate the idx-th basis function at the specified point*/
+    double (*eval)      (double coord, int idx);
+    /* evaluate the first derivative of the idx-th basis function at the specified point*/
+    double (*eval_diff1)(double coord, int idx);
+    /* evaluate the second derivative of the idx-th basis function at the specified point*/
+    double (*eval_diff2)(double coord, int idx);
+    /**
+     * Get the idx-th collocation point for the specified order.
+     * idx runs from 0 to order - 1 (inclusive)
+     */
+    double (*colloc_point)(int order, int idx);
+} BasisSet;
+
+extern const BasisSet qms_cheb_basis;
+extern const BasisSet qms_cheb_even_basis;
+extern const BasisSet qms_full_basis;
+extern const BasisSet qms_tb_even_basis;
+extern const BasisSet qms_sb_even_basis;
+extern const BasisSet qms_tl_basis;
+extern const BasisSet qms_cos_even_basis;
+
+extern double scale_factor;
+
+typedef struct MGContext MGContext;
+typedef struct SORContext SORContext;
+
+/* precomputed values for a given refined grid */
+typedef struct CoordPatch {
+    CCTK_REAL origin[3];
+    CCTK_INT delta[3];
+    CCTK_INT size[3];
+
+    // basis values on the grid
+    double *basis_val_r;
+    double *basis_val_z;
+
+    double *transform_z;
+    double *transform_matrix;
+    double *transform_matrix1;
+    double *transform_tmp;
+    double *one;
+
+    int y_idx;
+
+    MGContext *mg;
+    SORContext *sor;
+} CoordPatch;
+
+/* state and scratch storage for the BiCGSTAB solver */
+typedef struct BiCGSTABContext {
+    double *p, *v, *y, *z, *t;
+    double *res, *res0;
+    double *k;
+
+    cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+    cl_mem cl_res, cl_res0;
+    cl_mem cl_k, cl_mat;
+    cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
+    cl_mem cl_tmp, cl_tmp1;
+
+} BiCGSTABContext;
+
+typedef struct MaximalSlicingContext {
+    cGH *gh;
+    const BasisSet *basis;
+    const BasisSet *basis1;
+
+    struct {
+        double time;
+        double *coeffs;
+    } solution_cache[8];
+    int nb_solutions;
+
+    double *coeffs_eval;
+
+    BiCGSTABContext bicgstab;
+    int steps_since_inverse;
+
+    uint64_t solve_count;
+    uint64_t solve_time;
+
+    uint64_t lu_solves_count;
+    uint64_t lu_solves_time;
+
+    uint64_t cg_solve_count;
+    uint64_t cg_iter_count;
+    uint64_t cg_time_total;
+
+    uint64_t interp_geometry_count;
+    uint64_t interp_geometry_time;
+
+    uint64_t calc_eq_coeffs_count;
+    uint64_t calc_eq_coeffs_time;
+
+    uint64_t construct_matrix_count;
+    uint64_t construct_matrix_time;
+
+    uint64_t grid_expand_count;
+    uint64_t grid_expand_time;
+
+    // the grid of collocation points
+    double *colloc_grid[2];
+
+    // interpolation parameters
+    int coord_system;
+    int interp_operator;
+    int interp_params;
+
+    CCTK_REAL *interp_coords[3];
+
+    int interp_vars_indices[NB_METRIC_VARS];
+
+    CCTK_REAL *interp_values[NB_INTERP_VARS];
+    CCTK_REAL *interp_values_prefilter[NB_INTERP_VARS];
+    CCTK_INT   interp_value_codes[NB_INTERP_VARS];
+
+    CCTK_REAL *metric_u[6];
+
+    CCTK_REAL *kij_kij;
+    CCTK_REAL *trk;
+
+    int nb_coeffs_x;
+    int nb_coeffs_z;
+    int nb_coeffs;
+
+    int nb_colloc_points_x;
+    int nb_colloc_points_z;
+    int nb_colloc_points;
+
+    int colloc_grid_order_x;
+    int colloc_grid_order_z;
+
+    double *mat;
+    double *mat_f;
+    double *rhs;
+    double *coeffs;
+    int *ipiv;
+    double *basis_x_val;
+    double *basis_x_dval;
+    double *basis_x_d2val;
+
+    double *basis_z_val;
+    double *basis_z_dval;
+    double *basis_z_d2val;
+
+    double *basis_val_00;
+    double *basis_val_20;
+    double *basis_val_02;
+    double *basis_val_11;
+    double *basis_val_10;
+    double *basis_val_01;
+
+    double *eq_coeff_00;
+    double *eq_coeff_20;
+    double *eq_coeff_02;
+    double *eq_coeff_11;
+    double *eq_coeff_10;
+    double *eq_coeff_01;
+
+    double *coeff_scale;
+
+    double *input_filter;
+
+    CoordPatch *patches;
+    int nb_patches;
+
+    // OpenCL / CLBLAS stuff
+    cl_context cl_ctx;
+    cl_command_queue cl_queue;
+    cl_device_id ocl_device;
+
+    cl_mem ocl_coeffs;
+} MaximalSlicingContext;
+
+int qms_maximal_solve(MaximalSlicingContext *ms);
+
+#include <sys/time.h>
+static inline int64_t gettime(void)
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
diff --git a/src/register.c b/src/register.c
new file mode 100644
index 0000000..65c2d21
--- /dev/null
+++ b/src/register.c
@@ -0,0 +1,9 @@
+
+#include "cctk.h"
+#include "cctk_Arguments.h"
+#include "Slicing.h"
+
+void qms_register_mol(CCTK_ARGUMENTS)
+{
+    MoLRegisterConstrained(CCTK_VarIndex("QuasiMaximalSlicing::W"));
+}
diff --git a/src/solve.c b/src/solve.c
new file mode 100644
index 0000000..0bd56e2
--- /dev/null
+++ b/src/solve.c
@@ -0,0 +1,636 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include <cl.h>
+#include <clBLAS.h>
+
+#include "qms.h"
+
+static int construct_matrix(MaximalSlicingContext *ms)
+{
+    double *mat = ms->mat;
+    int idx_coeff, idx_grid;
+
+#pragma omp parallel for
+    for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++)
+        for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) {
+            const int idx      = idx_grid + ms->nb_colloc_points * idx_coeff;
+
+            mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] +
+                       ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] +
+                       ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] +
+                       ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] +
+                       ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] +
+                       ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx];
+        }
+
+    return 0;
+}
+
+/* interpolate the cactus gridfunctions onto the pseudospectral grid */
+static int interp_geometry(MaximalSlicingContext *ms)
+{
+    int ret;
+
+    ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params,
+                                ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL,
+                                (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices,
+                                ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values_prefilter);
+    if (ret < 0)
+        CCTK_WARN(0, "Error interpolating");
+
+    CCTK_TimerStart("MaximalSlicingAxi_filter_input");
+    for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) {
+        cblas_dgemv(CblasColMajor, CblasNoTrans, ms->nb_colloc_points, ms->nb_colloc_points, 1.0,
+                    ms->input_filter, ms->nb_colloc_points, ms->interp_values_prefilter[i], 1, 0.0,
+                    ms->interp_values[i], 1);
+    }
+    CCTK_TimerStop("MaximalSlicingAxi_filter_input");
+
+    return 0;
+}
+
+static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max)
+{
+    double rhs_max = 0.0;
+//#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max)
+    for (int i = 0; i < ms->nb_colloc_points; i++) {
+        const double x = ms->interp_coords[0][i];
+        const double z = ms->interp_coords[2][i];
+        const int zaxis = x <= EPS;
+
+        double Am[3][3], K[3][3], Km[3][3], Ku[3][3], gtu[3][3];
+        double k2, kij_dij_alpha, k_kdot, k3;
+
+        const double gtxx = ms->interp_values[I_GTXX][i];
+        const double gtyy = ms->interp_values[I_GTYY][i];
+        const double gtzz = ms->interp_values[I_GTZZ][i];
+        const double gtxy = ms->interp_values[I_GTXY][i];
+        const double gtxz = ms->interp_values[I_GTXZ][i];
+        const double gtyz = ms->interp_values[I_GTYZ][i];
+
+        const double gt[3][3] = {{ gtxx, gtxy, gtxz },
+                                 { gtxy, gtyy, gtyz },
+                                 { gtxz, gtyz, gtzz }};
+
+        const double Atxx = ms->interp_values[I_ATXX][i];
+        const double Atyy = ms->interp_values[I_ATYY][i];
+        const double Atzz = ms->interp_values[I_ATZZ][i];
+        const double Atxy = ms->interp_values[I_ATXY][i];
+        const double Atxz = ms->interp_values[I_ATXZ][i];
+        const double Atyz = ms->interp_values[I_ATYZ][i];
+
+        const double phi  = ms->interp_values[I_PHI][i];
+
+        const double phidot = ms->interp_values[I_PHIDOT][i];
+        const double phidot_dx = ms->interp_values[I_PHIDOT_DX][i];
+        const double phidot_dz = ms->interp_values[I_PHIDOT_DZ][i];
+
+        const double At[3][3] = {{ Atxx, Atxy, Atxz },
+                                 { Atxy, Atyy, Atyz },
+                                 { Atxz, Atyz, Atzz }};
+
+        const double trK  = ms->interp_values[I_K][i];
+        const double kdot_xx = ms->interp_values[I_KDOT_XX][i];
+        const double kdot_yy = ms->interp_values[I_KDOT_YY][i];
+        const double kdot_zz = ms->interp_values[I_KDOT_ZZ][i];
+        const double kdot_xy = ms->interp_values[I_KDOT_XY][i];
+        const double kdot_xz = ms->interp_values[I_KDOT_XZ][i];
+        const double kdot_yz = ms->interp_values[I_KDOT_YZ][i];
+
+        const double kdot[3][3] = {{ kdot_xx, kdot_xy, kdot_xz },
+                                   { kdot_xy, kdot_yy, kdot_yz },
+                                   { kdot_xz, kdot_yz, kdot_zz }};
+
+        const double alpha     = ms->interp_values[I_ALPHA][i];
+        const double dx_alpha  = ms->interp_values[I_ALPHA_DX][i];
+        const double dz_alpha  = ms->interp_values[I_ALPHA_DZ][i];
+        const double dxx_alpha = ms->interp_values[I_ALPHA_DXX][i];
+        const double dzz_alpha = ms->interp_values[I_ALPHA_DZZ][i];
+        const double dxz_alpha = ms->interp_values[I_ALPHA_DXZ][i];
+
+        const double dij_alpha[3][3] = {{ dxx_alpha,                                0, dxz_alpha },
+                                        {         0, zaxis ? dxx_alpha : dx_alpha / x,         0 },
+                                        { dxz_alpha,                                0, dzz_alpha }};
+
+        const double Xtx  = ms->interp_values[I_XTX][i];
+        const double Xtz  = ms->interp_values[I_XTZ][i];
+
+        const double Xtdot_x  = ms->interp_values[I_XTDOT_X][i];
+        const double Xtdot_z  = ms->interp_values[I_XTDOT_Z][i];
+
+        const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+        // \tilde{γ}^{ij}
+        gtu[0][0] =  (gtyy * gtzz - SQR(gtyz)) / det;
+        gtu[1][1] =  (gtxx * gtzz - SQR(gtxz)) / det;
+        gtu[2][2] =  (gtxx * gtyy - SQR(gtxy)) / det;
+        gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+        gtu[0][2] =  (gtxy * gtyz - gtyy * gtxz) / det;
+        gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+        gtu[1][0] = gtu[0][1];
+        gtu[2][0] = gtu[0][2];
+        gtu[2][1] = gtu[1][2];
+
+        // K_{ij}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                K[j][k] = At[j][k] / SQR(phi) + gt[j][k] * trK;
+
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += SQR(phi) * gtu[j][l] * K[l][k];
+                Km[j][k] = val;
+            }
+
+        // K^{ij}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += SQR(phi) * gtu[j][l] * Km[k][l];
+                Ku[j][k] = val;
+            }
+
+        // \tilde{A}_{i}^j
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += gtu[j][l] * At[l][k];
+                Am[j][k] = val;
+            }
+
+        kij_dij_alpha = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                kij_dij_alpha += Ku[j][k] * dij_alpha[j][k];
+
+        k_kdot = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                k_kdot += kdot[j][k] * Ku[j][k];
+
+        k3 = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += Km[k][l] * Ku[l][j];
+                k3 += val * K[j][k];
+            }
+
+        // K_{ij} K^{ij}
+        k2 = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                k2 += Km[j][k] * Km[k][j];
+
+        {
+            const double gtuxx         = gtu[0][0];
+            const double gtuyy         = gtu[1][1];
+            const double gtuzz         = gtu[2][2];
+            const double gtuxz         = gtu[0][2];
+
+            const double phi_dx        = ms->interp_values[I_PHI_DX][i];
+            const double phi_dz        = ms->interp_values[I_PHI_DZ][i];
+
+            const double Xtx           = ms->interp_values[I_XTX][i];
+            const double Xtz           = ms->interp_values[I_XTZ][i];
+
+            //const double k2            = a2 + SQR(trK) / 3.;
+
+            const double trk_dx        = ms->interp_values[I_K_DX][i];
+            const double trk_dz        = ms->interp_values[I_K_DZ][i];
+
+            const double betax         = ms->interp_values[I_BETAX][i];
+            const double betaz         = ms->interp_values[I_BETAZ][i];
+
+            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+            const double Xdot_x = 2 * phi * phidot * Xtx + SQR(phi) * Xtdot_x + phi * (phidot_dx * gtuxx + phidot_dz * gtuxz) -
+                                  phidot * (phi_dx * gtuxx + phi_dz * gtuxz) + 2 * alpha * (phi_dx * Ku[0][0] + phi_dz * Ku[0][2]) / phi;
+            const double Xdot_z = 2 * phi * phidot * Xtz + SQR(phi) * Xtdot_z + phi * (phidot_dz * gtuzz + phidot_dx * gtuxz) -
+                                  phidot * (phi_dz * gtuzz + phi_dx * gtuxz) + 2 * alpha * (phi_dz * Ku[2][2] + phi_dx * Ku[0][2]) / phi;
+
+            ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0));
+            ms->eq_coeff_02[i] = SQR(phi) * gtuzz;
+            ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2;
+            ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0);
+            ms->eq_coeff_01[i] = -Xz;
+            ms->eq_coeff_00[i] = -k2;
+
+            ms->rhs[i] = -2 * alpha * kij_dij_alpha + Xdot_x * dx_alpha + Xdot_z * dz_alpha +
+                         2 * (k_kdot + 2 * alpha * k3) * alpha;
+
+            rhs_max = MAX(rhs_max, fabs(ms->rhs[i]));
+        }
+    }
+
+    *prhs_max = rhs_max;
+
+    return 0;
+}
+
+// based on the wikipedia article
+// and http://www.netlib.org/templates/matlab/bicgstab.m
+static int solve_bicgstab(BiCGSTABContext *ctx, const int N,
+                          double *mat, double *rhs, double *x)
+{
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega = 1.0;
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    double *k = ctx->k;
+    double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+    double *res = ctx->res, *res0 = ctx->res0;
+
+    // initialize the residual
+    memcpy(res, rhs, N * sizeof(*res));
+    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                mat, N, x, 1, 1.0, res, 1);
+
+    memcpy(res0, res, N * sizeof(*res0));
+    memcpy(p,    res, N * sizeof(*p));
+
+#define MAXITER 16
+#define TOL (1e-15)
+    for (i = 0; i < MAXITER; i++) {
+        rho = cblas_ddot(N, res, 1, res0, 1);
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega);
+
+            cblas_daxpy(N, -omega, v, 1, p, 1);
+            cblas_dscal(N, beta, p, 1);
+            cblas_daxpy(N, 1, res, 1, p, 1);
+        }
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, p, 1, 0.0, y, 1);
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, y, 1, 0.0, v, 1);
+
+        alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+        cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, res, 1, 0.0, z, 1);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, z, 1, 0.0, t, 1);
+
+        omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+        cblas_daxpy(N, alpha, y, 1, x, 1);
+        cblas_daxpy(N, omega, z, 1, x, 1);
+
+        cblas_daxpy(N, -omega, t, 1, res, 1);
+
+        err = cblas_dnrm2(N, res, 1) / rhs_norm;
+        if (err < TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == MAXITER)
+        return -1;
+
+    return i;
+}
+
+static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
+                             const int N, double *mat, double *rhs, cl_mem ocl_x)
+{
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega[2] = { 1.0 };
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    cl_event events[8];
+
+    // the matrix, rhs, k and x are assumed to be already uploaded
+    clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs,     0, NULL, &events[0]);
+    clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
+
+    // initialize the residual
+    clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+                1, &cl_q, 2, events, &events[2]);
+    clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[3]);
+    clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[4]);
+
+    clWaitForEvents(5, events);
+    // BARRIER
+
+#define MAXITER 16
+#define TOL (1e-15)
+    for (i = 0; i < MAXITER; i++) {
+        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+                   ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]);
+        clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+                        1, &cl_q, 0, NULL, &events[0]);
+            clblasDscal(N, beta, ctx->cl_p, 0, 1,
+                        1, &cl_q, 1, &events[0], &events[1]);
+            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+                        1, &cl_q, 1, &events[1], &events[0]);
+            clWaitForEvents(1, &events[0]);
+            // BARRIER
+        }
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+                    1, &cl_q, 1, &events[0], &events[1]);
+
+        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+                   ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]);
+        clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        alpha = rho / alpha;
+
+        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+                    1, &cl_q, 1, &events[0], &events[1]);
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+                    1, &cl_q, 1, &events[1], &events[0]);
+
+        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                   ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]);
+        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+                   ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]);
+
+        clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+                            2, &events[1], NULL);
+        // BARRIER
+
+        omega[0] /= omega[1];
+
+        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1,
+                    1, &cl_q, 1, &events[0], &events[1]);
+
+        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                    1, &cl_q, 0, NULL, &events[0]);
+        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+                    1, &cl_q, 1, &events[0], &events[2]);
+        clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+                            1, &events[2], NULL);
+        clWaitForEvents(1, &events[1]);
+        // BARRIER
+
+        if (err < TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == MAXITER)
+        return -1;
+
+    return i;
+}
+
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+    char  equed = 'N';
+    double cond, ferr, berr, rpivot;
+
+    double *mat_f, *x;
+    int ret = 0;
+#if 1
+    LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+                  mat, N, ipiv, rhs, N);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+#else
+    mat_f = malloc(SQR(N) * sizeof(*mat_f));
+    x     = malloc(N * sizeof(*x));
+
+    //{
+    //    int i, j;
+    //    for (i = 0; i < N; i++) {
+    //        for (j = 0; j < N; j++)
+    //            fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
+    //        fprintf(stderr, "\n");
+    //    }
+    //}
+    //{
+    //    double *mat_copy = malloc(SQR(N) * sizeof(double));
+    //    double *svd = malloc(N * sizeof(double));
+    //    double *rhs_copy = malloc(N * sizeof(double));
+    //    int rank;
+
+    //    memcpy(mat_copy, mat, SQR(N) * sizeof(double));
+    //    memcpy(rhs_copy, rhs, N * sizeof(double));
+
+    //    LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
+    //                   svd, 1e-13, &rank);
+
+    //    free(mat_copy);
+    //    for (int i = 0; i < N; i++) {
+    //        if (i > 5 && i < N - 5)
+    //            continue;
+
+    //        fprintf(stderr, "%g\t", svd[i]);
+    //    }
+    //    fprintf(stderr, "\n rank %d\n", rank);
+    //    free(svd);
+    //    free(rhs_copy);
+
+    //    if (rank < N)
+    //        ret = 1;
+    //}
+
+    //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+    //              mat, N, ipiv, rhs, N);
+    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+                   mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+                   rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+    memcpy(rhs, x, N * sizeof(double));
+    memcpy(mat, mat_f, SQR(N) * sizeof(double));
+
+    fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: "
+            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+            N, N, cond, ferr, berr);
+
+    free(mat_f);
+    free(x);
+#endif
+
+    return ret;
+}
+
+/*
+ * Solve the equation
+ * D²α - KᵢⱼKⁱʲα = -K
+ * for the coefficients of spectral approximation of α:
+ * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z)
+ * where i =  { 0, ... , ms->nb_coeffs_x };
+ *       j =  { 0, ... , ms->nb_coeffs_z };
+ * Tᵢ(x) are defined by ms->basis.
+ */
+int qms_maximal_solve(MaximalSlicingContext *ms)
+{
+    const int N = ms->nb_coeffs;
+    double rhs_max;
+    int64_t start;
+
+    int ret = 0;
+
+    /* interpolate the metric values and construct the quantities we'll need */
+    CCTK_TimerStart("MaximalSlicingAxi_interp_geometry");
+    start = gettime();
+    ret = interp_geometry(ms);
+    ms->interp_geometry_time += gettime() - start;
+    ms->interp_geometry_count++;
+    CCTK_TimerStop("MaximalSlicingAxi_interp_geometry");
+    if (ret < 0)
+        return ret;
+
+    CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs");
+    start = gettime();
+    ret = calc_eq_coeffs(ms, &rhs_max);
+    ms->calc_eq_coeffs_time += gettime() - start;
+    ms->calc_eq_coeffs_count++;
+    CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs");
+    if (ret < 0)
+        return ret;
+
+    /* fill the matrix */
+    CCTK_TimerStart("MaximalSlicingAxi_construct_matrix");
+    start = gettime();
+    ret = construct_matrix(ms);
+    ms->construct_matrix_time += gettime() - start;
+    ms->construct_matrix_count++;
+    CCTK_TimerStop("MaximalSlicingAxi_construct_matrix");
+    if (ret < 0)
+        return ret;
+
+#if 1
+    if (rhs_max < EPS) {
+        fprintf(stderr, "zero rhs\n");
+        memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+        if (ms->cl_queue) {
+            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
+                                 ms->coeffs, 0, NULL, NULL);
+        }
+        return 0;
+    }
+#endif
+
+    /* solve for the coeffs */
+    if (ms->steps_since_inverse < 1024) {
+        BiCGSTABContext *b = &ms->bicgstab;
+        int64_t start = gettime();
+
+        CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB");
+        if (ms->cl_queue) {
+            ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs);
+            clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N,
+                                ms->coeffs, 0, NULL, NULL);
+        } else
+            ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs);
+        CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB");
+
+        if (ret >= 0) {
+            ms->cg_time_total += gettime() - start;
+            ms->cg_solve_count++;
+            ms->cg_iter_count += ret + 1;
+            ms->steps_since_inverse++;
+
+#if 0
+            {
+                double min, max;
+                gsl_vector_memcpy(b->y, ms->rhs);
+                cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0,
+                            ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1);
+                gsl_vector_minmax(b->y, &min, &max);
+                if (fabs(min) > 1e-11 || fabs(max) > 1e-11)
+                    abort();
+            }
+#endif
+        }
+    } else
+        ret = -1;
+
+    if (ret < 0) {
+        double *tmpv;
+        double *tmpm;
+        int64_t start;
+
+        CCTK_TimerStart("MaximalSlicingAxi_solve_LU");
+        start = gettime();
+
+        ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
+        ms->lu_solves_time += gettime() - start;
+        ms->lu_solves_count++;
+        CCTK_TimerStop("MaximalSlicingAxi_solve_LU");
+
+        tmpv = ms->coeffs;
+        ms->coeffs = ms->rhs;
+        ms->rhs = tmpv;
+
+        tmpm = ms->mat;
+        ms->mat = ms->bicgstab.k;
+        ms->bicgstab.k = tmpm;
+
+        if (ret == 1) {
+            memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+            ms->coeffs[0] = 1.0;
+        }
+
+        if (ms->cl_queue) {
+            cl_event events[2];
+            clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double),
+                                 ms->bicgstab.k, 0, NULL, &events[0]);
+            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double),
+                                 ms->coeffs, 0, NULL, &events[1]);
+            clWaitForEvents(2, events);
+        }
+
+        ms->steps_since_inverse = 0;
+    }
+
+    for (int i = 0; i < N; i++)
+        ms->coeffs[i] *= ms->coeff_scale[i];
+
+    return ret;
+}
diff --git a/src/solve.cl b/src/solve.cl
new file mode 100644
index 0000000..125133a
--- /dev/null
+++ b/src/solve.cl
@@ -0,0 +1,28 @@
+#pragma OPENCL EXTENSION all : enable
+
+__kernel void construct_matrix_polar(__global double *mat,
+                                     __global double *coeff_00,
+                                     __global double *coeff_10,
+                                     __global double *coeff_01,
+                                     __global double *coeff_11,
+                                     __global double *coeff_20,
+                                     __global double *coeff_02,
+                                     __global double *basis_00,
+                                     __global double *basis_10,
+                                     __global double *basis_01,
+                                     __global double *basis_11,
+                                     __global double *basis_20,
+                                     __global double *basis_02,
+                                     int nb_coeffs, int nb_colloc_points)
+{
+    unsigned int idx_coeff = get_global_id(0);
+    unsigned int idx_grid  = get_global_id(1);
+    unsigned int idx       = idx_grid + nb_colloc_points * idx_coeff;
+
+    mat[idx] = coeff_00[idx_grid] * basis_00[idx] +
+               coeff_10[idx_grid] * basis_10[idx] +
+               coeff_01[idx_grid] * basis_01[idx] +
+               coeff_11[idx_grid] * basis_11[idx] +
+               coeff_20[idx_grid] * basis_20[idx] +
+               coeff_02[idx_grid] * basis_02[idx];
+}
diff --git a/src/solve_cl.c b/src/solve_cl.c
new file mode 100644
index 0000000..f138631
--- /dev/null
+++ b/src/solve_cl.c
@@ -0,0 +1,28 @@
+"#pragma OPENCL EXTENSION all : enable\n"
+"\n"
+"__kernel void construct_matrix_polar(__global double *mat,\n"
+"                                     __global double *coeff_00,\n"
+"                                     __global double *coeff_10,\n"
+"                                     __global double *coeff_01,\n"
+"                                     __global double *coeff_11,\n"
+"                                     __global double *coeff_20,\n"
+"                                     __global double *coeff_02,\n"
+"                                     __global double *basis_00,\n"
+"                                     __global double *basis_10,\n"
+"                                     __global double *basis_01,\n"
+"                                     __global double *basis_11,\n"
+"                                     __global double *basis_20,\n"
+"                                     __global double *basis_02,\n"
+"                                     int nb_coeffs, int nb_colloc_points)\n"
+"{\n"
+"    unsigned int idx_coeff = get_global_id(0);\n"
+"    unsigned int idx_grid  = get_global_id(1);\n"
+"    unsigned int idx       = idx_grid + nb_colloc_points * idx_coeff;\n"
+"\n"
+"    mat[idx] = coeff_00[idx_grid] * basis_00[idx] +\n"
+"               coeff_10[idx_grid] * basis_10[idx] +\n"
+"               coeff_01[idx_grid] * basis_01[idx] +\n"
+"               coeff_11[idx_grid] * basis_11[idx] +\n"
+"               coeff_20[idx_grid] * basis_20[idx] +\n"
+"               coeff_02[idx_grid] * basis_02[idx];\n"
+"}\n"
-- 
cgit v1.2.3