From efce90efb4d4e38fdd44a96ccbf1424c8b5ab94e Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sat, 7 Apr 2018 09:54:54 +0200
Subject: Large rewrite.

Split into separate subsystems.
---
 interface.ccl             |   2 +-
 param.ccl                 |  10 +
 schedule.ccl              |  10 +-
 src/basis.c               | 236 ++++++++++++++--
 src/basis.h               |  49 ++++
 src/bicgstab.c            | 420 ++++++++++++++++++++++++++++
 src/bicgstab.h            |  60 ++++
 src/common.h              |  30 ++
 src/make.code.defn        |   2 +-
 src/maximal_slicing_axi.c | 631 ++++++++++++-----------------------------
 src/maximal_slicing_axi.h | 209 +-------------
 src/ms_solve.c            | 697 ++++++++++++++++++++++++++++++++++++++++++++++
 src/ms_solve.h            |  52 ++++
 src/pssolve.c             | 389 ++++++++++++++++++++++++++
 src/pssolve.h             | 116 ++++++++
 src/solve.c               | 463 ++++++++++++++++++++++++++----
 16 files changed, 2645 insertions(+), 731 deletions(-)
 create mode 100644 src/basis.h
 create mode 100644 src/bicgstab.c
 create mode 100644 src/bicgstab.h
 create mode 100644 src/common.h
 create mode 100644 src/ms_solve.c
 create mode 100644 src/ms_solve.h
 create mode 100644 src/pssolve.c
 create mode 100644 src/pssolve.h

diff --git a/interface.ccl b/interface.ccl
index cec39a3..1a2c245 100644
--- a/interface.ccl
+++ b/interface.ccl
@@ -1,7 +1,7 @@
 # Interface definition for thorn MaximalSlicingAxi
 implements: MaximalSlicingAxi
 
-INHERITS: ADMBase grid CoordBase MethodOfLines
+INHERITS: ADMBase grid CoordBase MethodOfLines ML_BSSN
 
 CCTK_INT FUNCTION MoLRegisterConstrained(CCTK_INT IN idx)
 CCTK_INT FUNCTION MoLRegisterSaveAndRestore(CCTK_INT IN idx)
diff --git a/param.ccl b/param.ccl
index 2ca5e50..5309214 100644
--- a/param.ccl
+++ b/param.ccl
@@ -23,6 +23,16 @@ CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERA
     1: :: ""
 } 40
 
+CCTK_REAL filter_power "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
+CCTK_REAL input_filter_power "" STEERABLE=recover
+{
+    0: :: ""
+} 64.0
+
 CCTK_REAL scale_factor "Scaling factor L for the SB basis" STEERABLE=recover
 {
     0: :: ""
diff --git a/schedule.ccl b/schedule.ccl
index 0d9c425..eafabd1 100644
--- a/schedule.ccl
+++ b/schedule.ccl
@@ -2,10 +2,18 @@
 #
 if (CCTK_Equals(lapse_evolution_method, "maximal_axi")) {
 
-    SCHEDULE maximal_slicing_axi IN MoL_CalcRHS BEFORE ML_BSSN_evolCalcGroup {
+    SCHEDULE maximal_slicing_axi IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_RHS {
         LANG: C
     } "Maximal slicing in axisymmetry"
 
+    #SCHEDULE maximal_slicing_axi IN MoL_PostStepModify {
+    #    LANG: C
+    #} "Maximal slicing in axisymmetry"
+
+    #SCHEDULE maximal_slicing_axi IN MoL_PseudoEvolution {
+    #    LANG: C
+    #} "Maximal slicing in axisymmetry"
+
     SCHEDULE maximal_slicing_axi_register AT Startup {
         LANG: C
     } ""
diff --git a/src/basis.c b/src/basis.c
index 5ea043e..20e9f12 100644
--- a/src/basis.c
+++ b/src/basis.c
@@ -1,11 +1,34 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
 
 #include <math.h>
 
-#include "maximal_slicing_axi.h"
+#include "basis.h"
+#include "common.h"
 
+/*
+ * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
+ * SB(x, n) = sin((n + 1) arccot(|x| / L))
+ * They are symmetric wrt origin and decay as 1/x in infinity.
+ */
 static double sb_even_eval(double coord, int idx)
 {
-    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+    double val = atan2(SCALE_FACTOR, coord);
 
     idx *= 2;   // even only
 
@@ -14,20 +37,20 @@ static double sb_even_eval(double coord, int idx)
 
 static double sb_even_eval_diff1(double coord, int idx)
 {
-    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+    double val = atan2(SCALE_FACTOR, coord);
 
     idx *= 2;   // even only
 
-    return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+    return - SCALE_FACTOR * (idx + 1) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
 }
 
 static double sb_even_eval_diff2(double coord, int idx)
 {
-    double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+    double val = atan2(SCALE_FACTOR, coord);
 
     idx *= 2;   // even only
 
-    return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabs(coord) * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+    return SCALE_FACTOR * (idx + 1) * (2 * coord * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
 }
 
 static double sb_even_colloc_point(int order, int idx)
@@ -38,11 +61,15 @@ static double sb_even_colloc_point(int order, int idx)
     //order *= 2;
 
     //t = (idx + 2) * M_PI / (order + 4);
+#if MS_POLAR
+    t = (idx + 2) * M_PI / (2 * order + 3);
+#else
     t = (idx + 2) * M_PI / (2 * order + 2);
+#endif
     return SCALE_FACTOR / tan(t);
 }
 
-const BasisSet msa_sb_even_basis = {
+const BasisSet ms_sb_even_basis = {
     .eval         = sb_even_eval,
     .eval_diff1   = sb_even_eval_diff1,
     .eval_diff2   = sb_even_eval_diff2,
@@ -87,17 +114,77 @@ static double tb_even_colloc_point(int order, int idx)
     //order *= 2;
 
     //t = (idx + 2) * M_PI / (order + 4);
-    t = (idx + 3) * M_PI / (2 * (order + 2));
+    t = (idx + 2) * M_PI / (2 * order + 4);
     return SCALE_FACTOR / tan(t);
 }
 
-const BasisSet msa_tb_even_basis = {
+const BasisSet ms_tb_even_basis = {
     .eval         = tb_even_eval,
     .eval_diff1   = tb_even_eval_diff1,
     .eval_diff2   = tb_even_eval_diff2,
     .colloc_point = tb_even_colloc_point,
 };
 
+static double tl_eval(double coord, int idx)
+{
+    double t = 2 * atan2(sqrt(SCALE_FACTOR), sqrt(fabs(coord)));
+
+    //idx++;
+
+    return cos(idx * t);// - 1.0;
+}
+
+static double tl_eval_diff1(double coord, int idx)
+{
+    double y = fabs(coord);
+    double y12 = sqrt(y);
+    double t = 2 * atan2(sqrt(SCALE_FACTOR), y12);
+
+    //idx++;
+
+    if (y < 1e-10)
+        return -2.0 * SQR(idx) * pow(-1.0, idx) / SCALE_FACTOR;
+
+    return idx * sqrt(SCALE_FACTOR) * sin(idx * t) / (y12 * (y + SCALE_FACTOR));
+}
+
+static double tl_eval_diff2(double coord, int idx)
+{
+    double y = fabs(coord);
+    double y12 = sqrt(y);
+    double t = 2 * atan2(sqrt(SCALE_FACTOR), y12);
+
+    //idx++;
+
+    if (y < 1e-10)
+        return 4.0 * SQR(idx) * (SQR(idx) + 2) * pow(-1.0, idx) / (3.0 * SQR(SCALE_FACTOR));
+
+    return -(SCALE_FACTOR * SQR(idx) * cos(idx * t) / (y * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (y12 * SQR(y + SCALE_FACTOR)) + sqrt(SCALE_FACTOR) * idx * sin(idx * t) / (2 * y * y12 * (y + SCALE_FACTOR)));
+}
+
+static double tl_colloc_point(int order, int idx)
+{
+    double t;
+
+    if (!idx)
+        return 0.0;
+
+    idx = order - idx - 1;
+
+    order++;
+    idx++;
+
+    t = (2 * idx + 1) * M_PI / (2 * order);
+    return SCALE_FACTOR / SQR(tan(0.5 * t));
+}
+
+const BasisSet ms_tl_basis = {
+    .eval         = tl_eval,
+    .eval_diff1   = tl_eval_diff1,
+    .eval_diff2   = tl_eval_diff2,
+    .colloc_point = tl_colloc_point,
+};
+
 static double full_eval(double coord, int idx)
 {
     double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
@@ -153,7 +240,7 @@ static double full_colloc_point(int order, int idx)
 
 }
 
-const BasisSet msa_full_basis = {
+const BasisSet ms_full_basis = {
     .eval         = full_eval,
     .eval_diff1   = full_eval_diff1,
     .eval_diff2   = full_eval_diff2,
@@ -162,28 +249,145 @@ const BasisSet msa_full_basis = {
 
 static double cheb_eval(double coord, int idx)
 {
-    return cos(2 * idx * acos(coord / SCALE_FACTOR));
+    double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0));
+
+    //idx += 1;
+    return cos(idx * acos(x));
 }
 
 static double cheb_eval_diff1(double coord, int idx)
 {
-    return 2 * idx * sin(2 * idx * acos(coord / SCALE_FACTOR)) / sqrt(SQR(SCALE_FACTOR) - SQR(coord));
+    double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0));
+    double t = acos(x);
+
+    //idx += 1;
+
+    if (fabs(x - 1.0) < 1e-8)
+        return (2.0 / SCALE_FACTOR) * SQR(idx);
+    if (fabs(x + 1.0) < 1e-8)
+        return -(2.0 / SCALE_FACTOR) * SQR(idx) * pow(-1.0, idx);
+    return (2.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t);
 }
 
 static double cheb_eval_diff2(double coord, int idx)
 {
-    double t = acos(coord / SCALE_FACTOR);
-    return 2 * idx * (cos(2 * idx * t) * 2 * idx / (SQR(SCALE_FACTOR) - SQR(coord)) + sin(2 * idx * t) * coord / pow(SQR(SCALE_FACTOR) - SQR(coord), 1.5));
+    double x = MIN(1.0, MAX(2.0 * coord / SCALE_FACTOR - 1.0, -1.0));
+    double t = acos(x);
+    double st = sin(t);
+
+    //idx += 1;
+
+    if (fabs(x - 1.0) < 1e-8)
+        return SQR(2.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.;
+    if (fabs(x + 1.0) < 1e-8)
+        return SQR(2.0 / SCALE_FACTOR) * pow(-1.0, idx) * SQR(idx) * (SQR(idx) - 1.0) / 3.;
+
+    return SQR(2.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st)));
 }
 
 static double cheb_colloc_point(int order, int idx)
 {
-    return SCALE_FACTOR * cos((idx + 0.01) * M_PI / (4 * order + 4));
+    double y;
+
+    idx = order - idx - 1;
+
+    //order += 1;
+    //idx += 1;
+    y = cos(idx * M_PI / (order - 1));
+    //y = cos((2 * idx + 1) * M_PI / (2 * order));
+
+    return 0.5 * SCALE_FACTOR * (y + 1.0);
 }
 
-const BasisSet msa_cheb_basis = {
+const BasisSet ms_cheb_basis = {
     .eval       = cheb_eval,
     .eval_diff1 = cheb_eval_diff1,
     .eval_diff2 = cheb_eval_diff2,
     .colloc_point = cheb_colloc_point,
 };
+
+static double cheb_even_eval(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0));
+
+    idx *= 2;
+    //idx += 1;
+    return cos(idx * acos(x));
+}
+
+static double cheb_even_eval_diff1(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0));
+    double t = acos(x);
+
+    idx *= 2;
+    //idx += 1;
+
+    if (fabs(fabs(x) - 1.0) < 1e-8)
+        return (1.0 / SCALE_FACTOR) * SQR(idx);
+    return (1.0 / SCALE_FACTOR) * idx * sin(idx * t) / sin(t);
+}
+
+static double cheb_even_eval_diff2(double coord, int idx)
+{
+    double x = MIN(1.0, MAX(coord / SCALE_FACTOR, -1.0));
+    double t = acos(x);
+    double st = sin(t);
+
+    idx *= 2;
+    //idx += 1;
+
+    if (fabs(fabs(x) - 1) < 1e-8)
+        return SQR(1.0 / SCALE_FACTOR) * SQR(idx) * (SQR(idx) - 1.0) / 3.;
+
+    return SQR(1.0 / SCALE_FACTOR) * (-SQR(idx) * cos(idx * t) / SQR(st) + idx * cos(t) * sin(idx * t) / (st * SQR(st)));
+}
+
+static double cheb_even_colloc_point(int order, int idx)
+{
+    double y;
+
+    idx = order - idx - 1;
+
+    //order += 1;
+    //idx += 1;
+    order *= 2;
+    y = cos(idx * M_PI / (order - 1));
+    y = cos((2 * idx + 2) * M_PI / (2 * order));
+
+    return SCALE_FACTOR * y;
+}
+
+const BasisSet ms_cheb_even_basis = {
+    .eval         = cheb_even_eval,
+    .eval_diff1   = cheb_even_eval_diff1,
+    .eval_diff2   = cheb_even_eval_diff2,
+    .colloc_point = cheb_even_colloc_point,
+};
+
+static double cos_even_eval(double coord, int idx)
+{
+    return cos(2 * idx * coord);
+}
+
+static double cos_even_eval_diff1(double coord, int idx)
+{
+    return -2 * idx * sin(2 * idx * coord);
+}
+
+static double cos_even_eval_diff2(double coord, int idx)
+{
+    return -4 * SQR(idx) * cos(2 * idx * coord);
+}
+
+static double cos_even_colloc_point(int order, int idx)
+{
+    return M_PI * idx / (2 * order - 0);
+}
+
+const BasisSet ms_cos_even_basis = {
+    .eval         = cos_even_eval,
+    .eval_diff1   = cos_even_eval_diff1,
+    .eval_diff2   = cos_even_eval_diff2,
+    .colloc_point = cos_even_colloc_point,
+};
diff --git a/src/basis.h b/src/basis.h
new file mode 100644
index 0000000..c275d6f
--- /dev/null
+++ b/src/basis.h
@@ -0,0 +1,49 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MS_BASIS_H
+#define MS_BASIS_H
+
+/* a set of basis functions */
+typedef struct BasisSet {
+    /* evaluate the idx-th basis function at the specified point*/
+    double (*eval)      (double coord, int idx);
+    /* evaluate the first derivative of the idx-th basis function at the specified point*/
+    double (*eval_diff1)(double coord, int idx);
+    /* evaluate the second derivative of the idx-th basis function at the specified point*/
+    double (*eval_diff2)(double coord, int idx);
+    /**
+     * Get the idx-th collocation point for the specified order.
+     * idx runs from 0 to order - 1 (inclusive)
+     */
+    double (*colloc_point)(int order, int idx);
+} BasisSet;
+
+extern const BasisSet ms_cheb_basis;
+extern const BasisSet ms_cheb_even_basis;
+extern const BasisSet ms_full_basis;
+extern const BasisSet ms_tb_even_basis;
+extern const BasisSet ms_sb_even_basis;
+extern const BasisSet ms_sb_odd_basis;
+extern const BasisSet ms_tl_basis;
+extern const BasisSet ms_cos_even_basis;
+
+#define SCALE_FACTOR scale_factor
+extern double scale_factor;
+
+#endif /* MS_BASIS_H */
diff --git a/src/bicgstab.c b/src/bicgstab.c
new file mode 100644
index 0000000..e5d782f
--- /dev/null
+++ b/src/bicgstab.c
@@ -0,0 +1,420 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include <cblas.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bicgstab.h"
+
+#define BICGSTAB_MAXITER 16
+#define BICGSTAB_TOL (1e-15)
+
+struct BiCGStabContext {
+    int N;
+
+    double *x;
+    double *p, *v, *y, *z, *t;
+    double *res, *res0;
+    double *k;
+
+#if HAVE_OPENCL
+    cl_context       ocl_ctx;
+    cl_command_queue ocl_queue;
+
+    cl_mem cl_x;
+    cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+    cl_mem cl_res, cl_res0;
+    cl_mem cl_k, cl_mat;
+    cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
+    cl_mem cl_tmp, cl_tmp1;
+#endif
+};
+
+#if HAVE_OPENCL
+static int solve_cl(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    cl_command_queue ocl_q = ctx->ocl_queue;
+    const int            N = ctx->N;
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega[2] = { 1.0 };
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    cl_event events[8];
+
+    if (rhs_norm < BICGSTAB_TOL) {
+        memset(x, 0, N * sizeof(*x));
+        return 0;
+    }
+
+    // upload the matrix and RHS
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double),     rhs, 0, NULL, &events[0]);
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
+
+    // initialize the residual
+    clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+                1, &ocl_q, 2, events, &events[2]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[3]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[4]);
+
+    clWaitForEvents(5, events);
+    // BARRIER
+
+    for (i = 0; i < MAXITER; i++) {
+        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 0, NULL, &events[0]);
+            clblasDscal(N, beta, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[0], &events[1]);
+            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[1], &events[0]);
+            clWaitForEvents(1, &events[0]);
+            // BARRIER
+        }
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        alpha = rho / alpha;
+
+        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+                    1, &ocl_q, 1, &events[1], &events[0]);
+
+        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+                   ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]);
+
+        clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+                            2, &events[1], NULL);
+        // BARRIER
+
+        omega[0] /= omega[1];
+
+        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+                    1, &ocl_q, 1, &events[0], &events[2]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+                            1, &events[2], NULL);
+        clWaitForEvents(1, &events[1]);
+        // BARRIER
+
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N,
+                        x, 0, NULL, NULL);
+    return i;
+}
+#endif
+
+// based on the wikipedia article
+// and http://www.netlib.org/templates/matlab/bicgstab.m
+static int solve_sw(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    const int N = ctx->N;
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega = 1.0;
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    double *k = ctx->k;
+    double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+    double *res = ctx->res, *res0 = ctx->res0;
+
+    if (rhs_norm < BICGSTAB_TOL) {
+        memset(x, 0, N * sizeof(*x));
+        return 0;
+    }
+
+    // initialize the residual
+    memcpy(res, rhs, N * sizeof(*res));
+    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                mat, N, ctx->x, 1, 1.0, res, 1);
+
+    memcpy(res0, res, N * sizeof(*res0));
+    memcpy(p,    res, N * sizeof(*p));
+
+    for (i = 0; i < BICGSTAB_MAXITER; i++) {
+        rho = cblas_ddot(N, res, 1, res0, 1);
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega);
+
+            cblas_daxpy(N, -omega, v, 1, p, 1);
+            cblas_dscal(N, beta, p, 1);
+            cblas_daxpy(N, 1, res, 1, p, 1);
+        }
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, p, 1, 0.0, y, 1);
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, y, 1, 0.0, v, 1);
+
+        alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+        cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    k, N, res, 1, 0.0, z, 1);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    mat, N, z, 1, 0.0, t, 1);
+
+        omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+        cblas_daxpy(N, alpha, y, 1, ctx->x, 1);
+        cblas_daxpy(N, omega, z, 1, ctx->x, 1);
+
+        cblas_daxpy(N, -omega, t, 1, res, 1);
+
+        err = cblas_dnrm2(N, res, 1) / rhs_norm;
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    memcpy(x, ctx->x, sizeof(*x) * ctx->N);
+
+    return i;
+}
+
+int ms_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x)
+{
+    int ret;
+
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx)
+        ret = solve_cl(ctx, mat, rhs, x);
+    else
+#endif
+        ret = solve_sw(ctx, mat, rhs, x);
+    if (ret < 0)
+        return ret;
+
+#if MS_VERIFY
+    {
+        int i;
+        double *y;
+
+        y = malloc(sizeof(*y) * ctx->N);
+        memcpy(y, rhs, sizeof(*y) * ctx->N);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0,
+                    mat, ctx->N, x, 1, 1.0, y, 1);
+        i = cblas_idamax(ctx->N, y, 1);
+        if (fabs(y[i]) > 1e-11)
+            abort();
+    }
+#endif
+
+    return ret;
+}
+
+int ms_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0)
+{
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx) {
+        cl_event events[2];
+        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double),
+                             k, 0, NULL, &events[0]);
+        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double),
+                             x0, 0, NULL, &events[1]);
+        clWaitForEvents(2, events);
+    } else
+#endif
+    {
+        memcpy(ctx->x, x0, ctx->N * sizeof(*x0));
+        memcpy(ctx->k, k,  ctx->N * ctx->N * sizeof(*k));
+    }
+
+    return 0;
+}
+
+int ms_bicgstab_context_alloc(BiCGStabContext **pctx, int N,
+                               cl_context ocl_ctx, cl_command_queue ocl_q)
+{
+    BiCGStabContext *ctx;
+    int ret = 0;
+
+    ctx = calloc(1, sizeof(*ctx));
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->N = N;
+
+#if HAVE_OPENCL
+    if (ocl_ctx) {
+        ctx->ocl_ctx   = ocl_ctx;
+        ctx->ocl_queue = ocl_q;
+
+#define ALLOC(dst, size)                                        \
+do {                                                            \
+    ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret);    \
+    if (ret != CL_SUCCESS)                                      \
+        goto fail;                                              \
+} while (0)
+
+        ALLOC(cl_x,    N * sizeof(double));
+        ALLOC(cl_p,    N * sizeof(double));
+        ALLOC(cl_v,    N * sizeof(double));
+        ALLOC(cl_y,    N * sizeof(double));
+        ALLOC(cl_z,    N * sizeof(double));
+        ALLOC(cl_t,    N * sizeof(double));
+        ALLOC(cl_res,  N * sizeof(double));
+        ALLOC(cl_res0, N * sizeof(double));
+        ALLOC(cl_tmp,  N * sizeof(double));
+        ALLOC(cl_tmp1, N * 2 * sizeof(double));
+
+        ALLOC(cl_k,   N * N * sizeof(double));
+        ALLOC(cl_mat, N * N * sizeof(double));
+
+        ALLOC(cl_rho,    sizeof(double));
+        ALLOC(cl_alpha,  sizeof(double));
+        ALLOC(cl_beta,   sizeof(double));
+        ALLOC(cl_omega,  2 * sizeof(double));
+        ALLOC(cl_omega1, sizeof(double));
+    } else
+#endif
+    {
+        ret |= posix_memalign((void**)&ctx->x,    32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->p,    32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->v,    32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->y,    32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->z,    32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->t,    32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->res,  32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N);
+        ret |= posix_memalign((void**)&ctx->k,    32, sizeof(double) * N * N);
+    }
+
+fail:
+    if (ret) {
+        ms_bicgstab_context_free(&ctx);
+        return -ENOMEM;
+    }
+
+    *pctx = ctx;
+    return 0;
+}
+
+void ms_bicgstab_context_free(BiCGStabContext **pctx)
+{
+    BiCGStabContext *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    free(ctx->x);
+    free(ctx->p);
+    free(ctx->v);
+    free(ctx->y);
+    free(ctx->z);
+    free(ctx->t);
+    free(ctx->res);
+    free(ctx->res0);
+    free(ctx->k);
+
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx) {
+        clReleaseMemObject(ctx->cl_x);
+        clReleaseMemObject(ctx->cl_p);
+        clReleaseMemObject(ctx->cl_v);
+        clReleaseMemObject(ctx->cl_y);
+        clReleaseMemObject(ctx->cl_z);
+        clReleaseMemObject(ctx->cl_t);
+        clReleaseMemObject(ctx->cl_res);
+        clReleaseMemObject(ctx->cl_res0);
+        clReleaseMemObject(ctx->cl_tmp);
+        clReleaseMemObject(ctx->cl_tmp1);
+
+        clReleaseMemObject(ctx->cl_k);
+        clReleaseMemObject(ctx->cl_mat);
+
+        clReleaseMemObject(ctx->cl_rho);
+        clReleaseMemObject(ctx->cl_alpha);
+        clReleaseMemObject(ctx->cl_beta);
+        clReleaseMemObject(ctx->cl_omega);
+        clReleaseMemObject(ctx->cl_omega1);
+    }
+#endif
+
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/src/bicgstab.h b/src/bicgstab.h
new file mode 100644
index 0000000..150c05c
--- /dev/null
+++ b/src/bicgstab.h
@@ -0,0 +1,60 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MS_BICGSTAB_H
+#define MS_BICGSTAB_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+typedef struct BiCGStabContext BiCGStabContext;
+
+/**
+ * Allocate and initialize the solver for the NxN system.
+ *
+ * If the OpenCL context and command queue are provided (non-NULL), the solver
+ * will run using clBLAS.
+ */
+int ms_bicgstab_context_alloc(BiCGStabContext **ctx, int N,
+                               cl_context ocl_ctx, cl_command_queue ocl_q);
+
+/**
+ * Free the solver and all its internal state.
+ */
+void ms_bicgstab_context_free(BiCGStabContext **ctx);
+
+/**
+ * Initialise the solver with the given preconditioner matrix. This function
+ * may be any number of times on a given solver context.
+ */
+int ms_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0);
+
+/**
+ * Solve the linear system
+ * mat · x = rhs
+ * The result is written into x.
+ */
+int ms_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x);
+
+#endif /* MS_BICGSTAB_H */
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..8f8f3fb
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,30 @@
+#ifndef MS_COMMON_H
+#define MS_COMMON_H
+
+#define HAVE_OPENCL 0
+#define MS_VERIFY  0
+#define MS_POLAR   0
+#define MS_CCZ4    0
+
+#define SQR(x) ((x) * (x))
+#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) > (y) ? (y) : (x))
+#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr))
+
+/*
+ * small number to avoid r=0 singularities
+ */
+#define EPS 1E-08
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/time.h>
+static inline int64_t gettime(void)
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+#endif /* MS_COMMON_H */
diff --git a/src/make.code.defn b/src/make.code.defn
index d11b536..d74c8bb 100644
--- a/src/make.code.defn
+++ b/src/make.code.defn
@@ -1,7 +1,7 @@
 # Main make.code.defn file for thorn MaximalSlicingAxi
 
 # Source files in this directory
-SRCS = basis.c maximal_slicing_axi.c register.c solve.c
+SRCS = basis.c bicgstab.c maximal_slicing_axi.c ms_solve.c pssolve.c register.c
 
 # Subdirectories containing source files
 SUBDIRS = 
diff --git a/src/maximal_slicing_axi.c b/src/maximal_slicing_axi.c
index 48bb4c9..e5b0d85 100644
--- a/src/maximal_slicing_axi.c
+++ b/src/maximal_slicing_axi.c
@@ -21,446 +21,26 @@
 #include "util_Table.h"
 
 #include "maximal_slicing_axi.h"
-
-#define ACC_TEST 0
-
-/*
- * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
- * SB(x, n) = sin((n + 1) arccot(|x| / L))
- * They are symmetric wrt origin and decay as 1/x in infinity.
- */
+#include "ms_solve.h"
 
 double scale_factor;
 
-
-/* mapping between our indices and thorn names */
-static const char *metric_vars[] = {
-    [GTXX]  = "ML_BSSN::gt11",
-    [GTYY]  = "ML_BSSN::gt22",
-    [GTZZ]  = "ML_BSSN::gt33",
-    [GTXY]  = "ML_BSSN::gt12",
-    [GTXZ]  = "ML_BSSN::gt13",
-    [GTYZ]  = "ML_BSSN::gt23",
-    [ATXX]  = "ML_BSSN::At11",
-    [ATYY]  = "ML_BSSN::At22",
-    [ATZZ]  = "ML_BSSN::At33",
-    [ATXY]  = "ML_BSSN::At12",
-    [ATXZ]  = "ML_BSSN::At13",
-    [ATYZ]  = "ML_BSSN::At23",
-    [PHI]   = "ML_BSSN::phi",
-    [K]     = "ML_BSSN::trK",
-    [XTX]   = "ML_BSSN::Xt1",
-    [XTY]   = "ML_BSSN::Xt2",
-    [XTZ]   = "ML_BSSN::Xt3",
-    [BETAX] = "ML_BSSN::beta1",
-    [BETAY] = "ML_BSSN::beta2",
-    [BETAZ] = "ML_BSSN::beta3",
-};
-
-/* mapping between the cactus grid values and interpolated values */
-static const CCTK_INT interp_operation_indices[] = {
-    [I_GTXX]     = GTXX,
-    [I_GTXX_DX]  = GTXX,
-    [I_GTXX_DY]  = GTXX,
-    [I_GTXX_DZ]  = GTXX,
-    [I_GTYY]     = GTYY,
-    [I_GTYY_DX]  = GTYY,
-    [I_GTYY_DY]  = GTYY,
-    [I_GTYY_DZ]  = GTYY,
-    [I_GTZZ]     = GTZZ,
-    [I_GTZZ_DX]  = GTZZ,
-    [I_GTZZ_DY]  = GTZZ,
-    [I_GTZZ_DZ]  = GTZZ,
-    [I_GTXY]     = GTXY,
-    [I_GTXY_DX]  = GTXY,
-    [I_GTXY_DY]  = GTXY,
-    [I_GTXY_DZ]  = GTXY,
-    [I_GTXZ]     = GTXZ,
-    [I_GTXZ_DX]  = GTXZ,
-    [I_GTXZ_DY]  = GTXZ,
-    [I_GTXZ_DZ]  = GTXZ,
-    [I_GTYZ]     = GTYZ,
-    [I_GTYZ_DX]  = GTYZ,
-    [I_GTYZ_DY]  = GTYZ,
-    [I_GTYZ_DZ]  = GTYZ,
-    [I_PHI]      = PHI,
-    [I_PHI_DX]   = PHI,
-    [I_PHI_DY]   = PHI,
-    [I_PHI_DZ]   = PHI,
-    [I_ATXX]     = ATXX,
-    [I_ATYY]     = ATYY,
-    [I_ATZZ]     = ATZZ,
-    [I_ATXY]     = ATXY,
-    [I_ATXZ]     = ATXZ,
-    [I_ATYZ]     = ATYZ,
-    [I_K]        = K,
-    [I_K_DX]     = K,
-    [I_K_DY]     = K,
-    [I_K_DZ]     = K,
-    [I_XTX]      = XTX,
-    [I_XTY]      = XTY,
-    [I_XTZ]      = XTZ,
-    [I_BETAX]    = BETAX,
-    [I_BETAY]    = BETAY,
-    [I_BETAZ]    = BETAZ,
-};
-
-/* the operation (plain value or x/y/z-derivative) to apply during interpolation */
-static const CCTK_INT interp_operation_codes[] = {
-    [I_GTXX]     = 0,
-    [I_GTXX_DX]  = 1,
-    [I_GTXX_DY]  = 2,
-    [I_GTXX_DZ]  = 3,
-    [I_GTYY]     = 0,
-    [I_GTYY_DX]  = 1,
-    [I_GTYY_DY]  = 2,
-    [I_GTYY_DZ]  = 3,
-    [I_GTZZ]     = 0,
-    [I_GTZZ_DX]  = 1,
-    [I_GTZZ_DY]  = 2,
-    [I_GTZZ_DZ]  = 3,
-    [I_GTXY]     = 0,
-    [I_GTXY_DX]  = 1,
-    [I_GTXY_DY]  = 2,
-    [I_GTXY_DZ]  = 3,
-    [I_GTXZ]     = 0,
-    [I_GTXZ_DX]  = 1,
-    [I_GTXZ_DY]  = 2,
-    [I_GTXZ_DZ]  = 3,
-    [I_GTYZ]     = 0,
-    [I_GTYZ_DX]  = 1,
-    [I_GTYZ_DY]  = 2,
-    [I_GTYZ_DZ]  = 3,
-    [I_PHI]      = 0,
-    [I_PHI_DX]   = 1,
-    [I_PHI_DY]   = 2,
-    [I_PHI_DZ]   = 3,
-    [I_ATXX]     = 0,
-    [I_ATYY]     = 0,
-    [I_ATZZ]     = 0,
-    [I_ATXY]     = 0,
-    [I_ATXZ]     = 0,
-    [I_ATYZ]     = 0,
-    [I_K]        = 0,
-    [I_K_DX]     = 1,
-    [I_K_DY]     = 2,
-    [I_K_DZ]     = 3,
-    [I_XTX]      = 0,
-    [I_XTY]      = 0,
-    [I_XTZ]      = 0,
-    [I_BETAX]    = 0,
-    [I_BETAY]    = 0,
-    [I_BETAZ]    = 0,
-};
-
-static void init_opencl(MaximalSlicingContext *ms)
+/* get an approximate "main" frequency component in a basis function */
+static double calc_basis_freq(const BasisSet *b, int order)
 {
-    int err, count;
-    cl_platform_id platform;
-    cl_device_id   device;
-    cl_context_properties props[3];
-
-    err = clGetPlatformIDs(1, &platform, &count);
-    if (err != CL_SUCCESS || count < 1) {
-        fprintf(stderr, "Could not get an OpenCL platform ID\n");
-        return;
-    }
-
-    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &count);
-    if (err != CL_SUCCESS || count < 1) {
-        fprintf(stderr, "Could not get an OpenCL device ID\n");
-        return;
-    }
-
-    props[0] = CL_CONTEXT_PLATFORM;
-    props[1] = (cl_context_properties)platform;
-    props[2] = 0;
-
-    ms->cl_ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
-    if (err != CL_SUCCESS || !ms->cl_ctx) {
-        fprintf(stderr, "Could not create an OpenCL context\n");
-        return;
-    }
-
-    ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, device, 0, &err);
-    if (err != CL_SUCCESS || !ms->cl_queue) {
-        fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
-        goto fail;
-    }
-
-    err = clblasSetup();
-    if (err != CL_SUCCESS) {
-        fprintf(stderr, "Error setting up clBLAS\n");
-        goto fail;
-    }
-
-    ms->ocl_coeffs       = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-
-    ms->bicgstab.cl_p    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_v    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_y    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_z    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_t    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_res  = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_tmp  = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err);
-
-    ms->bicgstab.cl_k    = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_mat  = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
-
-    ms->bicgstab.cl_rho    = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
-    ms->bicgstab.cl_alpha  = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
-    ms->bicgstab.cl_beta   = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
-    ms->bicgstab.cl_omega  = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err);
-    ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
-
-    return;
-fail:
-    if (ms->cl_queue)
-        clReleaseCommandQueue(ms->cl_queue);
-    ms->cl_queue = 0;
-
-    if (ms->cl_ctx)
-        clReleaseContext(ms->cl_ctx);
-    ms->cl_ctx = 0;
-}
-
-static MaximalSlicingContext *init_ms(cGH *cctkGH,
-                                      int basis_order_r, int basis_order_z,
-                                      double sf,
-                                      CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
-                                      const int grid_size[3])
-{
-    MaximalSlicingContext *ms;
-    int ret;
-
-    //const double h = (x[1] - x[0]) / 8;
-    //fprintf(stderr, "h %g\n", h);
-
-    //if (basis_order_r != basis_order_z)
-    //    CCTK_WARN(0, "Different r and z basis orders are not supported.");
-
-    ms = calloc(1, sizeof(*ms));
-
-    ms->gh = cctkGH;
-
-    ms->basis      = &msa_sb_even_basis;
-    //ms->basis      = &full_basis;
-    //ms->basis      = &cheb_basis;
-
-    ms->nb_coeffs_x = basis_order_r;
-    ms->nb_coeffs_z = basis_order_z;
-
-    ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z;
-
-    ms->nb_colloc_points_x = basis_order_r;
-    ms->nb_colloc_points_z = basis_order_z;
-
-    ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z;
-
-    if (ms->nb_colloc_points != ms->nb_coeffs)
-        CCTK_WARN(0, "Non-square collocation matrix");
-
-    ms->colloc_grid_order_x = ms->nb_colloc_points_x;
-    ms->colloc_grid_order_z = ms->nb_colloc_points_z;
-
-    ms->mat    = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
-    ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->rhs    = malloc(sizeof(double) * ms->nb_colloc_points);
-
-    ms->mat_f  = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
-    ms->ipiv   = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs);
-
-#if 1
-    scale_factor = 1.0;
-
-    scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 3) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1);
-    //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5;
-    fprintf(stderr, "scale factor %g\n", scale_factor);
-
-#else
-    scale_factor = sf;
-#endif
-
-    ms->grid_x = malloc(ms->nb_colloc_points_x * sizeof(*ms->grid_x));
-
-    for (int i = 0; i < ms->nb_colloc_points_x; i++) {
-#if 0
-        double target_val = ms->basis->colloc_point(ms->colloc_grid_order_x, i);
-        double best_diff = DBL_MAX, best_val = DBL_MAX;
-
-        for (int j = 0; j < grid_size[0]; j++) {
-            int idx = CCTK_GFINDEX3D(cctkGH, j, 0, 0);
-            double val = x[idx];
-            double diff = fabs(target_val - val);
-
-            if (val > 0.0 && diff < best_diff) {
-                int k;
-                for (k = 0; k < i; k++)
-                    if (ms->grid_x[k] == val)
-                        break;
-                if (k == i) {
-                    best_diff = diff;
-                    best_val = val;
-                }
-            }
-        }
-        if (best_val == DBL_MAX)
-            abort();
-        fprintf(stderr, "%d %g -> %g (%g)\n", i, target_val, best_val, target_val - best_val);
-        ms->grid_x[i] = best_val;
-#elif 0
-        double max = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)];
-        ms->grid_x[i] = max * SQR(SQR((double)i / ms->nb_colloc_points_x)) + 0.001;
-#else
-        ms->grid_x[i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i);
-#endif
-        fprintf(stderr, "%d %g\n", i, ms->grid_x[i]);
-    }
-
-    ms->grid_z = malloc(ms->nb_colloc_points_z * sizeof(*ms->grid_z));
-
-    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
-        ms->grid_z[i] = ms->basis->colloc_point(ms->colloc_grid_order_z, i);
-    }
-
-    /* precompute the basis values we will need */
-    ms->basis_x_val   = malloc(sizeof(*ms->basis_x_val)   * ms->nb_colloc_points_x * ms->nb_coeffs_x);
-    ms->basis_x_dval  = malloc(sizeof(*ms->basis_x_dval)  * ms->nb_colloc_points_x * ms->nb_coeffs_x);
-    ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
-    for (int i = 0; i < ms->nb_colloc_points_x; i++) {
-        CCTK_REAL coord = ms->grid_x[i];
-        for (int j = 0; j < ms->nb_coeffs_x; j++) {
-            ms->basis_x_val  [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j);
-            ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j);
-            ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j);
-        }
-    }
-
-    ms->basis_z_val   = malloc(sizeof(*ms->basis_z_val)   * ms->nb_colloc_points_z * ms->nb_coeffs_z);
-    ms->basis_z_dval  = malloc(sizeof(*ms->basis_z_dval)  * ms->nb_colloc_points_z * ms->nb_coeffs_z);
-    ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
-    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
-        CCTK_REAL coord = ms->grid_z[i];
-        for (int j = 0; j < ms->nb_coeffs_z; j++) {
-            ms->basis_z_val  [i * ms->nb_coeffs_z + j] = ms->basis->eval(coord, j);
-            ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis->eval_diff1(coord, j);
-            ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis->eval_diff2(coord, j);
-        }
-    }
-
-    ms->basis_val_00 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
-    ms->basis_val_11 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
-    ms->basis_val_10 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
-    ms->basis_val_01 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
-    ms->basis_val_02 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
-    ms->basis_val_20 = calloc(ms->nb_colloc_points * ms->nb_coeffs, sizeof(*ms->basis_val_00));
-    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
-        const double *basis_val_z   = ms->basis_z_val + i * ms->nb_coeffs_z;
-        const double *dbasis_val_z  = ms->basis_z_dval + i * ms->nb_coeffs_z;
-        const double *d2basis_val_z = ms->basis_z_d2val + i * ms->nb_coeffs_z;
-
-        for (int j = 0; j < ms->nb_colloc_points_x; j++) {
-            const double *basis_val_x   = ms->basis_x_val   + j * ms->nb_coeffs_x;
-            const double *dbasis_val_x  = ms->basis_x_dval  + j * ms->nb_coeffs_x;
-            const double *d2basis_val_x = ms->basis_x_d2val + j * ms->nb_coeffs_x;
-            const int idx_grid = i * ms->nb_colloc_points_x + j;
-
-            for (int k = 0; k < ms->nb_coeffs_z; k++)
-                for (int l = 0; l < ms->nb_coeffs_x; l++) {
-                    const int idx_coeff = k * ms->nb_coeffs_x + l;
-                    const int idx = idx_grid + ms->nb_colloc_points * idx_coeff;
-                    ms->basis_val_00[idx] =   basis_val_x[l] *   basis_val_z[k];
-                    ms->basis_val_11[idx] =  dbasis_val_x[l] *  dbasis_val_z[k];
-                    ms->basis_val_10[idx] =  dbasis_val_x[l] *   basis_val_z[k];
-                    ms->basis_val_01[idx] =   basis_val_x[l] *  dbasis_val_z[k];
-                    ms->basis_val_02[idx] =   basis_val_x[l] * d2basis_val_z[k];
-                    ms->basis_val_20[idx] = d2basis_val_x[l] *   basis_val_z[k];
-                }
-        }
-    }
-
-    ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0]));
-    ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1]));
-    ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2]));
-    for (int i = 0; i < ms->nb_colloc_points_z; i++) {
-        CCTK_REAL z = ms->grid_z[i];
-        for (int j = 0; j < ms->nb_colloc_points_x; j++) {
-            CCTK_REAL x = ms->grid_x[j];
-
-            ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x;
-            ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0;
-            ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z;
-        }
-    }
-
-    for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++)
-        ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i]));
-
-    ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij));
-
-    for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) {
-        ms->interp_values[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i]));
-        ms->interp_value_codes[i] = CCTK_VARIABLE_REAL;
-    }
-
-    for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++)
-        ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
-
-    ms->coord_system = CCTK_CoordSystemHandle("cart3d");
-    if (ms->coord_system < 0)
-        CCTK_WARN(0, "Error getting the coordinate system");
-
-    ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation");
-    if (ms->interp_operator < 0)
-        CCTK_WARN(0, "Error getting the interpolation operator");
-
-    ms->interp_params = Util_TableCreateFromString("order=2 want_global_mode=1");
-    if (ms->interp_params < 0)
-        CCTK_WARN(0, "Error creating interpolation parameters table");
-
-    ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
-                                interp_operation_codes, "operation_codes");
-    if (ret < 0)
-        CCTK_WARN(0, "Error setting operation codes");
-
-    ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
-                                interp_operation_indices, "operand_indices");
-    if (ret < 0)
-        CCTK_WARN(0, "Error setting operand indices");
-
-    ms->bicgstab.p    = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.v    = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.y    = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.z    = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.t    = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.res  = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs);
-    ms->bicgstab.k    = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
-
-    ms->steps_since_inverse = INT_MAX;
-
-    init_opencl(ms);
-
-    CCTK_TimerCreate("MaximalSlicingAxi_Solve");
-    CCTK_TimerCreate("MaximalSlicingAxi_Expand");
-    CCTK_TimerCreate("MaximalSlicingAxi_calc_geometry");
-    CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix");
-    CCTK_TimerCreate("MaximalSlicingAxi_solve_LU");
-    CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB");
-
-    return ms;
+    return b->colloc_point(order, 1);
 }
 
 static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
-                                   CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
-                                   CCTK_REAL *alp)
+                                   CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z)
 {
     cGH *cctkGH = ms->gh;
+    int nb_coeffs_x = ms->solver->nb_coeffs[0];
+    int nb_coeffs_z = ms->solver->nb_coeffs[1];
 
     CoordPatch *cp;
     int64_t grid_size;
+    int i;
 
     for (int i = 0; i < ms->nb_patches; i++) {
         cp = &ms->patches[i];
@@ -471,9 +51,9 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
             cp->size[0]   == ms->gh->cctk_lsh[0]          &&
             cp->size[1]   == ms->gh->cctk_lsh[1]          &&
             cp->size[2]   == ms->gh->cctk_lsh[2]          &&
-            cp->delta[0]  == ms->gh->cctk_delta_space[0]  &&
-            cp->delta[1]  == ms->gh->cctk_delta_space[1]  &&
-            cp->delta[2]  == ms->gh->cctk_delta_space[2])
+            cp->delta[0]  == ms->gh->cctk_levfac[0]  &&
+            cp->delta[1]  == ms->gh->cctk_levfac[1]  &&
+            cp->delta[2]  == ms->gh->cctk_levfac[2])
             return cp;
     }
 
@@ -483,10 +63,21 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
     ms->patches = realloc(ms->patches, sizeof(*ms->patches) * (ms->nb_patches + 1));
     cp = &ms->patches[ms->nb_patches];
 
+    memset(cp, 0, sizeof(*cp));
+
     memcpy(cp->origin, ms->gh->cctk_origin_space, sizeof(cp->origin));
     memcpy(cp->size,   ms->gh->cctk_lsh,          sizeof(cp->size));
-    memcpy(cp->delta,  ms->gh->cctk_delta_space,  sizeof(cp->delta));
+    memcpy(cp->delta,  ms->gh->cctk_levfac,  sizeof(cp->delta));
+
+    for (i = 0; i < cp->size[1]; i++)
+        if (fabs(y[CCTK_GFINDEX3D(cctkGH, 0, i, 0)]) < 1e-8) {
+            cp->y_idx = i;
+            break;
+        }
+    if (i == cp->size[1])
+        CCTK_WARN(0, "The grid does not include y==0");
 
+#if 0
     posix_memalign((void**)&cp->basis_val_r,   32, sizeof(*cp->basis_val_r)   * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]);
     for (int j = 0; j < ms->gh->cctk_lsh[1]; j++)
         for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
@@ -500,23 +91,91 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
         }
 
     posix_memalign((void**)&cp->basis_val_z,   32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]);
-
     for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
         CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
         for (int j = 0; j < ms->nb_coeffs_z; j++)
-            cp->basis_val_z  [i * ms->nb_coeffs_z + j] = ms->basis->eval(zz, j);
+            cp->basis_val_z  [i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j);
             //cp->basis_val_z  [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j);
     }
-
     posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x);
     posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size);
     for (int i = 0; i < grid_size; i++)
         cp->one[i] = 1.0;
+#else
+    posix_memalign((void**)&cp->transform_matrix,  32, sizeof(*cp->transform_matrix)  * nb_coeffs_x * cp->size[0] * cp->size[2]);
+    posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * nb_coeffs_z * cp->size[0] * cp->size[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cp->size[2]; j++) {
+        CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, j)];
+
+        for (int i = 0; i < cp->size[0]; i++) {
+            const int idx_grid = j * cp->size[0] + i;
+
+            double xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)];
+            double rr = sqrt(SQR(xx) + SQR(zz));
+
+#if MS_POLAR
+            double coord0 = rr;
+            double coord1 = atan2(zz, xx);
+#else
+            double coord0 = xx;
+            double coord1 = zz;
+#endif
+
+            //for (int k = 0; k < ms->nb_coeffs_z; k++)
+            //    for (int l = 0; l < ms->nb_coeffs_x; l++) {
+            //        const int idx_coeff = k * ms->nb_coeffs_x + l;
+            //        cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = ms->basis->eval(r, l) * ms->basis1->eval(phi, k);
+            //    }
+            for (int k = 0; k < nb_coeffs_x; k++) {
+                double dx = calc_basis_freq(ms->solver->basis[0], k);
+                double r0 = dx * 64;
+                double fact =  exp(-36.0 * pow(rr / r0, 10));
+                fact = 1.0;
+
+                cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = ms->solver->basis[0]->eval(coord0, k) * fact;
+            }
+            for (int k = 0; k < nb_coeffs_z; k++) {
+                double dx = calc_basis_freq(ms->solver->basis[1], k);
+                double r0 = dx * 64;
+                double fact =  exp(-36.0 * pow(rr / r0, 10));
+                fact = 1.0;
+
+                cp->transform_matrix1[idx_grid * nb_coeffs_z + k]              = ms->solver->basis[1]->eval(coord1, k) * fact;
+            }
+        }
+    }
+    posix_memalign((void**)&cp->transform_tmp,  32, sizeof(*cp->transform_tmp)  * cp->size[0] * cp->size[2] * nb_coeffs_z);
+#endif
 
     ms->nb_patches++;
     return cp;
 }
 
+static int context_init(cGH *cctkGH, MaximalSlicingContext **ctx)
+{
+    MaximalSlicingContext *ms;
+    int ret;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    ms = calloc(1, sizeof(*ms));
+    if (!ms)
+        return -ENOMEM;
+
+    ms->gh = cctkGH;
+
+    ret = ms_solver_init(&ms->solver, cctkGH, basis_order_r, basis_order_z,
+                         scale_factor, filter_power, 0.0);
+    if (ret < 0)
+        return ret;
+
+    *ctx = ms;
+
+    return 0;
+}
+
 void maximal_slicing_axi(CCTK_ARGUMENTS)
 {
     static MaximalSlicingContext *ms;
@@ -526,45 +185,97 @@ void maximal_slicing_axi(CCTK_ARGUMENTS)
     DECLARE_CCTK_ARGUMENTS;
     DECLARE_CCTK_PARAMETERS;
 
-    static double total;
-    static int64_t count;
-    int64_t timer_start;
+    int64_t expand_start, totaltime_start;
+
+    double *coeffs = NULL;
+    int i, ret;
 
-    const int64_t grid_size = cctk_lsh[2] * cctk_lsh[1] * cctk_lsh[0];
+    totaltime_start = gettime();
 
     /* on the first run, init the solver */
-    if (!ms) {
-        ms = init_ms(cctkGH, basis_order_r, basis_order_z,
-                     scale_factor, x, y, z, cctk_lsh);
-    }
+    if (!ms)
+        context_init(cctkGH, &ms);
 
-    cp = get_coord_patch(ms, x, y, z, alp);
+    cp = get_coord_patch(ms, x, y, z);
 
     CCTK_TimerStart("MaximalSlicingAxi_Solve");
-    msa_maximal_solve(ms);
+    ms_solver_solve(ms->solver);
     CCTK_TimerStop("MaximalSlicingAxi_Solve");
 
+    coeffs = ms->solver->coeffs;
+
     if (export_coeffs)
-        memcpy(alpha_coeffs, ms->coeffs, sizeof(*alpha_coeffs) * ms->nb_coeffs);
+        memcpy(alpha_coeffs, coeffs, sizeof(*alpha_coeffs) * ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]);
 
     CCTK_TimerStart("MaximalSlicingAxi_Expand");
-    timer_start = gettime();
+    expand_start = gettime();
+#if 0
+#pragma omp parallel for
+    for (int k = 0; k < cctk_lsh[2]; k++) {
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            int idx = CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, k);
+            double xx = x[idx];
+            double zz = z[idx];
+            double r = sqrt(SQR(xx) + SQR(zz));
+            double phi = atan2(zz, xx);
+
+            double val = 1.0;
+
+            for (int l = 0; l < ms->nb_coeffs_z; l++) {
+                double tmp = 0.0;
+                for (int m = 0; m < ms->nb_coeffs_x; m++) {
+                    const int idx_coeff = l * ms->nb_coeffs_x + m;
+                    tmp += ms->coeffs[idx_coeff] * ms->basis->eval(r, m);
+                }
+                val += tmp * ms->basis1->eval(phi, l);
+            }
 
-    memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
-    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0,
-                ms->coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z,
-                0.0, cp->transform_z, ms->nb_coeffs_x);
+            alp[idx] = val;
+        }
+    }
+#else
     cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0,
-                cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x,
-                1.0, alp, cctk_lsh[0] * cctk_lsh[1]);
-
-    total += gettime() - timer_start;
+                cctk_lsh[0] * cctk_lsh[2], ms->solver->nb_coeffs[1], ms->solver->nb_coeffs[0],
+                1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2],
+                coeffs, ms->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+#pragma omp parallel for
+    for (int j = 0; j < cctk_lsh[2]; j++)
+        for (int i = 0; i < cctk_lsh[0]; i++) {
+            const int idx_grid = j * cctk_lsh[0] + i;
+            const double val = cblas_ddot(ms->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * ms->solver->nb_coeffs[1], 1,
+                                          cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
+            alpha[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = 1.0 + val;
+        }
+#endif
+    //memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
+    //memset(alp, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
+    //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    //            ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0,
+    //            coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z,
+    //            0.0, cp->transform_z, ms->nb_coeffs_x);
+    //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    //            cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0,
+    //            cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x,
+    //            1.0, alp, cctk_lsh[0] * cctk_lsh[1]);
+
+    ms->grid_expand_time += gettime() - expand_start;
+    ms->grid_expand_count++;
 
     CCTK_TimerStop("MaximalSlicingAxi_Expand");
 
-    count++;
-    if (!(count & 15))
-        fprintf(stderr, "avg %g total %g\n", total / count, total);
+    //CCTK_TimerStart("MaximalSlicingAxi_Polish");
+    //msa_sor(ms, cp);
+    //CCTK_TimerStop("MaximalSlicingAxi_Polish");
+
+    /* print stats */
+    if (!(ms->grid_expand_count & 255)) {
+        fprintf(stderr, "Maximal slicing stats:\n");
+
+        ms_solver_print_stats(ms->solver);
+        fprintf(stderr,
+                "%lu evals: total time %g s, avg time per call %g ms\n",
+                ms->grid_expand_count, (double)ms->grid_expand_time / 1e6,
+                (double)ms->grid_expand_time / ms->grid_expand_count / 1e3);
+
+    }
 }
diff --git a/src/maximal_slicing_axi.h b/src/maximal_slicing_axi.h
index fc6c86f..3bb97aa 100644
--- a/src/maximal_slicing_axi.h
+++ b/src/maximal_slicing_axi.h
@@ -1,124 +1,23 @@
 
+#ifndef MAXIMAL_SLICING_AXI_H
+#define MAXIMAL_SLICING_AXI_H
+
 #include <inttypes.h>
 
 #include <cl.h>
 
 #include "cctk.h"
 
-#define SQR(x) ((x) * (x))
-#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
-#define MAX(x, y) ((x) > (y) ? (x) : (y))
-#define MIN(x, y) ((x) > (y) ? (y) : (x))
-#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr))
-
-/*
- * small number to avoid r=0 singularities
- */
-#define EPS 1E-08
+#include "ms_solve.h"
 
 #define SCALE_FACTOR scale_factor
 
-/* indices (in our code, not cactus structs) of the grid functions which we'll need to
- * interpolate on the pseudospectral grid */
-enum MetricVars {
-    GTXX = 0,
-    GTYY,
-    GTZZ,
-    GTXY,
-    GTXZ,
-    GTYZ,
-    PHI,
-    ATXX,
-    ATYY,
-    ATZZ,
-    ATXY,
-    ATXZ,
-    ATYZ,
-    K,
-    XTX,
-    XTY,
-    XTZ,
-    BETAX,
-    BETAY,
-    BETAZ,
-    NB_METRIC_VARS,
-};
-
-/* indices of the interpolated values of the above grid functions and their derivatives */
-enum InterpMetricVars {
-    I_GTXX = 0,
-    I_GTXX_DX,
-    I_GTXX_DY,
-    I_GTXX_DZ,
-    I_GTYY,
-    I_GTYY_DX,
-    I_GTYY_DY,
-    I_GTYY_DZ,
-    I_GTZZ,
-    I_GTZZ_DX,
-    I_GTZZ_DY,
-    I_GTZZ_DZ,
-    I_GTXY,
-    I_GTXY_DX,
-    I_GTXY_DY,
-    I_GTXY_DZ,
-    I_GTXZ,
-    I_GTXZ_DX,
-    I_GTXZ_DY,
-    I_GTXZ_DZ,
-    I_GTYZ,
-    I_GTYZ_DX,
-    I_GTYZ_DY,
-    I_GTYZ_DZ,
-    I_PHI,
-    I_PHI_DX,
-    I_PHI_DY,
-    I_PHI_DZ,
-    I_ATXX,
-    I_ATYY,
-    I_ATZZ,
-    I_ATXY,
-    I_ATXZ,
-    I_ATYZ,
-    I_K,
-    I_K_DX,
-    I_K_DY,
-    I_K_DZ,
-    I_XTX,
-    I_XTY,
-    I_XTZ,
-    I_BETAX,
-    I_BETAY,
-    I_BETAZ,
-    NB_INTERP_VARS,
-};
-
-/* a set of basis functions */
-typedef struct BasisSet {
-    /* evaluate the idx-th basis function at the specified point*/
-    double (*eval)      (double coord, int idx);
-    /* evaluate the first derivative of the idx-th basis function at the specified point*/
-    double (*eval_diff1)(double coord, int idx);
-    /* evaluate the second derivative of the idx-th basis function at the specified point*/
-    double (*eval_diff2)(double coord, int idx);
-    /**
-     * Get the idx-th collocation point for the specified order.
-     * idx runs from 0 to order - 1 (inclusive)
-     */
-    double (*colloc_point)(int order, int idx);
-} BasisSet;
-
-extern const BasisSet msa_cheb_basis;
-extern const BasisSet msa_full_basis;
-extern const BasisSet msa_tb_even_basis;
-extern const BasisSet msa_sb_even_basis;
-
 extern double scale_factor;
 
 /* precomputed values for a given refined grid */
 typedef struct CoordPatch {
     CCTK_REAL origin[3];
-    CCTK_REAL delta[3];
+    CCTK_INT delta[3];
     CCTK_INT size[3];
 
     // basis values on the grid
@@ -126,105 +25,25 @@ typedef struct CoordPatch {
     double *basis_val_z;
 
     double *transform_z;
+    double *transform_matrix;
+    double *transform_matrix1;
+    double *transform_tmp;
     double *one;
-} CoordPatch;
-
-/* state and scratch storage for the BiCGSTAB solver */
-typedef struct BiCGSTABContext {
-    double *p, *v, *y, *z, *t;
-    double *res, *res0;
-    double *k;
-
-    cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
-    cl_mem cl_res, cl_res0;
-    cl_mem cl_k, cl_mat;
-    cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
-    cl_mem cl_tmp, cl_tmp1;
 
-    int64_t solve_total;
-    int64_t iter_total;
-    int64_t time_total;
-} BiCGSTABContext;
+    int y_idx;
+} CoordPatch;
 
 typedef struct MaximalSlicingContext {
+    MSSolver *solver;
     cGH *gh;
-    const BasisSet *basis;
-
-    BiCGSTABContext bicgstab;
-    int steps_since_inverse;
-
-    int64_t lu_solves_total;
-    int64_t lu_solves_time;
-
-    // the grid of collocation points
-    CCTK_REAL *grid_x;
-    CCTK_REAL *grid_z;
-
-    // interpolation parameters
-    int coord_system;
-    int interp_operator;
-    int interp_params;
-
-    CCTK_REAL *interp_coords[3];
 
-    int interp_vars_indices[NB_METRIC_VARS];
-
-    CCTK_REAL *interp_values[NB_INTERP_VARS];
-    CCTK_INT   interp_value_codes[NB_INTERP_VARS];
-
-    CCTK_REAL *metric_u[6];
-
-    CCTK_REAL *kij_kij;
-    CCTK_REAL *trk;
-
-    int nb_coeffs_x;
-    int nb_coeffs_z;
-    int nb_coeffs;
-
-    int nb_colloc_points_x;
-    int nb_colloc_points_z;
-    int nb_colloc_points;
-
-    int colloc_grid_order_x;
-    int colloc_grid_order_z;
-
-    double *mat;
-    double *mat_f;
-    double *rhs;
-    double *coeffs;
-    int *ipiv;
-    double *basis_x_val;
-    double *basis_x_dval;
-    double *basis_x_d2val;
-
-    double *basis_z_val;
-    double *basis_z_dval;
-    double *basis_z_d2val;
-
-    double *basis_val_00;
-    double *basis_val_20;
-    double *basis_val_02;
-    double *basis_val_11;
-    double *basis_val_10;
-    double *basis_val_01;
+    uint64_t grid_expand_count;
+    uint64_t grid_expand_time;
 
     CoordPatch *patches;
     int nb_patches;
-
-    // OpenCL / CLBLAS stuff
-    cl_context cl_ctx;
-    cl_command_queue cl_queue;
-
-    cl_mem ocl_coeffs;
 } MaximalSlicingContext;
 
 int msa_maximal_solve(MaximalSlicingContext *ms);
 
-#include <sys/time.h>
-static inline int64_t gettime(void)
-{
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
+#endif /* MAXIMAL_SLICING_AXI_H */
diff --git a/src/ms_solve.c b/src/ms_solve.c
new file mode 100644
index 0000000..90502be
--- /dev/null
+++ b/src/ms_solve.c
@@ -0,0 +1,697 @@
+/*
+ * Maximal slicing -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include "cctk.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "basis.h"
+#include "pssolve.h"
+#include "ms_solve.h"
+
+#define NB_COEFFS(ms)        (ms->nb_coeffs[0]        * ms->nb_coeffs[1])
+#define NB_COLLOC_POINTS(ms) (ms->nb_colloc_points[0] * ms->nb_colloc_points[1])
+
+/* indices (in our code, not cactus structs) of the grid functions which we'll need to
+ * interpolate on the pseudospectral grid */
+enum MetricVars {
+    GTXX = 0,
+    GTYY,
+    GTZZ,
+    GTXY,
+    GTXZ,
+    GTYZ,
+    PHI,
+    ATXX,
+    ATYY,
+    ATZZ,
+    ATXY,
+    ATXZ,
+    ATYZ,
+    K,
+    XTX,
+    XTY,
+    XTZ,
+    BETAX,
+    BETAY,
+    BETAZ,
+    NB_METRIC_VARS,
+};
+
+/* indices of the interpolated values of the above grid functions and their derivatives */
+enum InterpMetricVars {
+    I_GTXX = 0,
+    I_GTYY,
+    I_GTZZ,
+    I_GTXY,
+    I_GTXZ,
+    I_GTYZ,
+    I_PHI,
+    I_PHI_DX,
+    I_PHI_DY,
+    I_PHI_DZ,
+    I_ATXX,
+    I_ATYY,
+    I_ATZZ,
+    I_ATXY,
+    I_ATXZ,
+    I_ATYZ,
+    I_K,
+    I_XTX,
+    I_XTY,
+    I_XTZ,
+    I_BETAX,
+    I_BETAY,
+    I_BETAZ,
+    NB_INTERP_VARS,
+};
+
+struct MSSolverPriv {
+    PSSolveContext *ps_ctx;
+    cGH *gh;
+
+    int colloc_grid_order[2];
+
+    double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB];
+    double *rhs;
+
+    double *coeff_scale;
+
+    // interpolation parameters
+    int coord_system;
+    int interp_operator;
+    int interp_params;
+
+    CCTK_REAL *interp_coords[3];
+
+    int        interp_vars_indices[NB_METRIC_VARS];
+    CCTK_REAL *interp_values[NB_INTERP_VARS];
+    CCTK_INT   interp_value_codes[NB_INTERP_VARS];
+
+#if HAVE_OPENCL
+    // OpenCL / CLBLAS stuff
+    cl_context       ocl_ctx;
+    cl_command_queue ocl_queue;
+#endif
+
+    uint64_t solve_count;
+    uint64_t solve_time;
+
+    uint64_t interp_geometry_count;
+    uint64_t interp_geometry_time;
+
+    uint64_t calc_eq_coeffs_count;
+    uint64_t calc_eq_coeffs_time;
+};
+
+/* mapping between our indices and thorn names */
+static const char *metric_vars[] = {
+#if MS_CCZ4
+    [GTXX]  = "ML_CCZ4::gt11",
+    [GTYY]  = "ML_CCZ4::gt22",
+    [GTZZ]  = "ML_CCZ4::gt33",
+    [GTXY]  = "ML_CCZ4::gt12",
+    [GTXZ]  = "ML_CCZ4::gt13",
+    [GTYZ]  = "ML_CCZ4::gt23",
+    [ATXX]  = "ML_CCZ4::At11",
+    [ATYY]  = "ML_CCZ4::At22",
+    [ATZZ]  = "ML_CCZ4::At33",
+    [ATXY]  = "ML_CCZ4::At12",
+    [ATXZ]  = "ML_CCZ4::At13",
+    [ATYZ]  = "ML_CCZ4::At23",
+    [PHI]   = "ML_CCZ4::phi",
+    [K]     = "ML_CCZ4::trK",
+    [XTX]   = "ML_CCZ4::Xt1",
+    [XTY]   = "ML_CCZ4::Xt2",
+    [XTZ]   = "ML_CCZ4::Xt3",
+    [BETAX] = "ML_CCZ4::beta1",
+    [BETAY] = "ML_CCZ4::beta2",
+    [BETAZ] = "ML_CCZ4::beta3",
+#else
+    [GTXX]  = "ML_BSSN::gt11",
+    [GTYY]  = "ML_BSSN::gt22",
+    [GTZZ]  = "ML_BSSN::gt33",
+    [GTXY]  = "ML_BSSN::gt12",
+    [GTXZ]  = "ML_BSSN::gt13",
+    [GTYZ]  = "ML_BSSN::gt23",
+    [ATXX]  = "ML_BSSN::At11",
+    [ATYY]  = "ML_BSSN::At22",
+    [ATZZ]  = "ML_BSSN::At33",
+    [ATXY]  = "ML_BSSN::At12",
+    [ATXZ]  = "ML_BSSN::At13",
+    [ATYZ]  = "ML_BSSN::At23",
+    [PHI]   = "ML_BSSN::phi",
+    [K]     = "ML_BSSN::trK",
+    [XTX]   = "ML_BSSN::Xt1",
+    [XTY]   = "ML_BSSN::Xt2",
+    [XTZ]   = "ML_BSSN::Xt3",
+    [BETAX] = "ML_BSSN::beta1",
+    [BETAY] = "ML_BSSN::beta2",
+    [BETAZ] = "ML_BSSN::beta3",
+#endif
+};
+
+/* mapping between the cactus grid values and interpolated values */
+static const CCTK_INT interp_operation_indices[] = {
+    [I_GTXX]     = GTXX,
+    [I_GTYY]     = GTYY,
+    [I_GTZZ]     = GTZZ,
+    [I_GTXY]     = GTXY,
+    [I_GTXZ]     = GTXZ,
+    [I_GTYZ]     = GTYZ,
+    [I_PHI]      = PHI,
+    [I_PHI_DX]   = PHI,
+    [I_PHI_DY]   = PHI,
+    [I_PHI_DZ]   = PHI,
+    [I_ATXX]     = ATXX,
+    [I_ATYY]     = ATYY,
+    [I_ATZZ]     = ATZZ,
+    [I_ATXY]     = ATXY,
+    [I_ATXZ]     = ATXZ,
+    [I_ATYZ]     = ATYZ,
+    [I_K]        = K,
+    [I_XTX]      = XTX,
+    [I_XTY]      = XTY,
+    [I_XTZ]      = XTZ,
+    [I_BETAX]    = BETAX,
+    [I_BETAY]    = BETAY,
+    [I_BETAZ]    = BETAZ,
+};
+
+/* the operation (plain value or x/y/z-derivative) to apply during interpolation */
+static const CCTK_INT interp_operation_codes[] = {
+    [I_GTXX]     = 0,
+    [I_GTYY]     = 0,
+    [I_GTZZ]     = 0,
+    [I_GTXY]     = 0,
+    [I_GTXZ]     = 0,
+    [I_GTYZ]     = 0,
+    [I_PHI]      = 0,
+    [I_PHI_DX]   = 1,
+    [I_PHI_DY]   = 2,
+    [I_PHI_DZ]   = 3,
+    [I_ATXX]     = 0,
+    [I_ATYY]     = 0,
+    [I_ATZZ]     = 0,
+    [I_ATXY]     = 0,
+    [I_ATXZ]     = 0,
+    [I_ATYZ]     = 0,
+    [I_K]        = 0,
+    [I_XTX]      = 0,
+    [I_XTY]      = 0,
+    [I_XTZ]      = 0,
+    [I_BETAX]    = 0,
+    [I_BETAY]    = 0,
+    [I_BETAZ]    = 0,
+};
+
+/* interpolate the cactus gridfunctions onto the pseudospectral grid */
+static int interp_geometry(MSSolver *ctx)
+{
+    MSSolverPriv *s = ctx->priv;
+    int ret;
+
+    ret = CCTK_InterpGridArrays(s->gh, 3, s->interp_operator, s->interp_params,
+                                s->coord_system, NB_COLLOC_POINTS(ctx), CCTK_VARIABLE_REAL,
+                                (const void * const *)s->interp_coords, ARRAY_ELEMS(s->interp_vars_indices), s->interp_vars_indices,
+                                ARRAY_ELEMS(s->interp_values), s->interp_value_codes, (void * const *)s->interp_values);
+    if (ret < 0)
+        CCTK_WARN(0, "Error interpolating");
+
+    return 0;
+}
+
+/* evaluate the equation coefficients at the collocation points */
+static int calc_eq_coeffs(MSSolver *ctx)
+{
+    MSSolverPriv *s = ctx->priv;
+
+#pragma omp parallel for
+    for (int i = 0; i < NB_COLLOC_POINTS(ctx); i++) {
+        CCTK_REAL Am[3][3], gtu[3][3];
+        CCTK_REAL a2;
+
+        CCTK_REAL gtxx = s->interp_values[I_GTXX][i];
+        CCTK_REAL gtyy = s->interp_values[I_GTYY][i];
+        CCTK_REAL gtzz = s->interp_values[I_GTZZ][i];
+        CCTK_REAL gtxy = s->interp_values[I_GTXY][i];
+        CCTK_REAL gtxz = s->interp_values[I_GTXZ][i];
+        CCTK_REAL gtyz = s->interp_values[I_GTYZ][i];
+
+        CCTK_REAL Atxx = s->interp_values[I_ATXX][i];
+        CCTK_REAL Atyy = s->interp_values[I_ATYY][i];
+        CCTK_REAL Atzz = s->interp_values[I_ATZZ][i];
+        CCTK_REAL Atxy = s->interp_values[I_ATXY][i];
+        CCTK_REAL Atxz = s->interp_values[I_ATXZ][i];
+        CCTK_REAL Atyz = s->interp_values[I_ATYZ][i];
+
+        CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz },
+                              { Atxy, Atyy, Atyz },
+                              { Atxz, Atyz, Atzz }};
+
+        CCTK_REAL trK  = s->interp_values[I_K][i];
+
+        CCTK_REAL Xtx  = s->interp_values[I_XTX][i];
+        CCTK_REAL Xtz  = s->interp_values[I_XTZ][i];
+
+        CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+        // \tilde{γ}^{ij}
+        gtu[0][0] =  (gtyy * gtzz - SQR(gtyz)) / det;
+        gtu[1][1] =  (gtxx * gtzz - SQR(gtxz)) / det;
+        gtu[2][2] =  (gtxx * gtyy - SQR(gtxy)) / det;
+        gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+        gtu[0][2] =  (gtxy * gtyz - gtyy * gtxz) / det;
+        gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+        gtu[1][0] = gtu[0][1];
+        gtu[2][0] = gtu[0][2];
+        gtu[2][1] = gtu[1][2];
+
+        // \tilde{A}_{i}^j
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += gtu[j][l] * At[l][k];
+                Am[j][k] = val;
+            }
+
+        // K_{ij} K^{ij}
+        a2 = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                a2 += Am[j][k] * Am[k][j];
+
+        {
+            double x = s->interp_coords[0][i];
+            double z = s->interp_coords[2][i];
+
+            const double gtuxx         = gtu[0][0];
+            const double gtuyy         = gtu[1][1];
+            const double gtuzz         = gtu[2][2];
+            const double gtuxz         = gtu[0][2];
+
+            const double phi           = s->interp_values[I_PHI][i];
+            const double phi_dx        = s->interp_values[I_PHI_DX][i];
+            const double phi_dz        = s->interp_values[I_PHI_DZ][i];
+
+            const double Xtx           = s->interp_values[I_XTX][i];
+            const double Xtz           = s->interp_values[I_XTZ][i];
+
+            const double k2            = a2 + SQR(trK) / 3.;
+
+            const double betax         = s->interp_values[I_BETAX][i];
+            const double betaz         = s->interp_values[I_BETAZ][i];
+
+            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+            s->eq_coeffs[PSSOLVE_DIFF_ORDER_20][i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0));
+            s->eq_coeffs[PSSOLVE_DIFF_ORDER_02][i] = SQR(phi) * gtuzz;
+            s->eq_coeffs[PSSOLVE_DIFF_ORDER_11][i] = SQR(phi) * gtuxz * 2;
+            s->eq_coeffs[PSSOLVE_DIFF_ORDER_10][i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0);
+            s->eq_coeffs[PSSOLVE_DIFF_ORDER_01][i] = -Xz;
+            s->eq_coeffs[PSSOLVE_DIFF_ORDER_00][i] = -k2;
+            s->rhs[i]                              = k2 + trK;
+        }
+    }
+
+    return 0;
+}
+
+int ms_solver_solve(MSSolver *ctx)
+{
+    MSSolverPriv *s = ctx->priv;
+    int ret;
+    int64_t start, totaltime_start;
+
+    totaltime_start = gettime();
+
+    /* interpolate the metric values and construct the quantities we'll need */
+    CCTK_TimerStart("MaximalSlicing_interp_geometry");
+    start = gettime();
+
+    ret = interp_geometry(ctx);
+
+    s->interp_geometry_time += gettime() - start;
+    s->interp_geometry_count++;
+    CCTK_TimerStop("MaximalSlicing_interp_geometry");
+    if (ret < 0)
+        return ret;
+
+    CCTK_TimerStart("MaximalSlicing_calc_eq_coeffs");
+    start = gettime();
+
+    ret = calc_eq_coeffs(ctx);
+
+    s->calc_eq_coeffs_time += gettime() - start;
+    s->calc_eq_coeffs_count++;
+    CCTK_TimerStop("MaximalSlicing_calc_eq_coeffs");
+    if (ret < 0)
+        return ret;
+
+    ret = ms_pssolve_solve(s->ps_ctx, (const double * const *)s->eq_coeffs,
+                           s->rhs, ctx->coeffs);
+    if (ret < 0)
+        return ret;
+
+    for (int i = 0; i < NB_COEFFS(ctx); i++)
+        ctx->coeffs[i] *= s->coeff_scale[i];
+
+    s->solve_count++;
+    s->solve_time += gettime() - totaltime_start;
+
+    return 0;
+}
+
+void ms_solver_print_stats(MSSolver *ctx)
+{
+    MSSolverPriv *s = ctx->priv;
+
+    fprintf(stderr,
+            "%g%% interpolate geometry: %lu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->interp_geometry_time * 100 / s->solve_time,
+            s->interp_geometry_count, (double)s->interp_geometry_time / 1e6,
+            (double)s->interp_geometry_time / s->interp_geometry_count / 1e3);
+    fprintf(stderr,
+            "%g%% calc equation coefficients: %lu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->calc_eq_coeffs_time * 100 / s->solve_time,
+            s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6,
+            (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3);
+    fprintf(stderr,
+            "%g%% pseudospectral matrix construction: %lu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time,
+            s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6,
+            (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3);
+    fprintf(stderr,
+            "%g%% BiCGSTAB %lu solves, "
+            "%lu iterations, total time %g s, "
+            "avg iterations per solve %g, avg time per solve %g ms, "
+            "avg time per iteration %g ms\n",
+            (double)s->ps_ctx->cg_time_total * 100 / s->solve_time,
+            s->ps_ctx->cg_solve_count, s->ps_ctx->cg_iter_count, (double)s->ps_ctx->cg_time_total / 1e6,
+            (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3);
+    fprintf(stderr,
+            "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n",
+            (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time,
+            s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6,
+            (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3);
+}
+
+static void init_opencl(MSSolver *ctx)
+#if HAVE_OPENCL
+{
+    MSSolverPriv *s = ctx->priv;
+    int err, count;
+    cl_platform_id platform;
+    cl_context_properties props[3];
+    cl_device_id ocl_device;
+
+    err = clGetPlatformIDs(1, &platform, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL platform ID\n");
+        return;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL device ID\n");
+        return;
+    }
+
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties)platform;
+    props[2] = 0;
+
+    s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err);
+    if (err != CL_SUCCESS || !s->ocl_ctx) {
+        fprintf(stderr, "Could not create an OpenCL context\n");
+        return;
+    }
+
+    s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err);
+    if (err != CL_SUCCESS || !s->ocl_queue) {
+        fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
+        goto fail;
+    }
+
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error setting up clBLAS\n");
+        goto fail;
+    }
+
+    return;
+fail:
+    if (s->ocl_queue)
+        clReleaseCommandQueue(s->ocl_queue);
+    s->ocl_queue = 0;
+
+    if (s->ocl_ctx)
+        clReleaseContext(s->ocl_ctx);
+    s->ocl_ctx = 0;
+}
+#else
+{
+}
+#endif
+
+int ms_solver_init(MSSolver **pctx,
+                    cGH *cctkGH,
+                    int basis_order_r, int basis_order_z,
+                    double sf, double filter_power, double input_filter_power)
+{
+    MSSolver *ctx;
+    MSSolverPriv *s;
+    int ret;
+
+    ctx = calloc(1, sizeof(*ctx));
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->priv = calloc(1, sizeof(*ctx->priv));
+    if (!ctx->priv)
+        goto fail;
+    s = ctx->priv;
+
+    s->gh = cctkGH;
+
+    ctx->basis[0] = &ms_sb_even_basis;
+#if MS_POLAR
+    ctx->basis[1] = &ms_cos_even_basis;
+#else
+    ctx->basis[1] = &ms_sb_even_basis;
+#endif
+
+    ctx->nb_coeffs[0] = basis_order_r;
+    ctx->nb_coeffs[1] = basis_order_z;
+
+    ctx->nb_colloc_points[0] = basis_order_r;
+    ctx->nb_colloc_points[1] = basis_order_z;
+
+    if (NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx))
+        CCTK_WARN(0, "Non-square collocation matrix");
+
+    s->colloc_grid_order[0] = ctx->nb_colloc_points[0];
+    s->colloc_grid_order[1] = ctx->nb_colloc_points[1];
+
+    ret  = posix_memalign((void**)&ctx->coeffs, 32, sizeof(*ctx->coeffs) * NB_COEFFS(ctx));
+    ret |= posix_memalign((void**)&s->rhs,      32, sizeof(*s->rhs)      * NB_COLLOC_POINTS(ctx));
+    if (ret)
+        goto fail;
+
+    //FIXME
+    scale_factor = 1.0;
+    scale_factor = (64.0 / ctx->basis[0]->colloc_point(s->colloc_grid_order[0], ctx->nb_colloc_points[0] - 1));
+    fprintf(stderr, "scale factor %16.16g\n", scale_factor);
+
+    init_opencl(ctx);
+
+    ret = ms_pssolve_context_alloc(&s->ps_ctx);
+    if (ret < 0)
+        CCTK_WARN(0, "Error allocating the pseudospectral solver");
+
+    s->ps_ctx->basis[0]       = ctx->basis[0];
+    s->ps_ctx->basis[1]       = ctx->basis[1];
+    s->ps_ctx->solve_order[0] = basis_order_r;
+    s->ps_ctx->solve_order[1] = basis_order_z;
+#if HAVE_OPENCL
+    s->ps_ctx->ocl_ctx        = s->ocl_ctx;
+    s->ps_ctx->ocl_queue      = s->ocl_queue;
+#endif
+
+    ret = ms_pssolve_context_init(s->ps_ctx);
+    if (ret < 0)
+        CCTK_WARN(0, "Error initializing the pseudospectral solver");
+
+    for (int i = 0; i < MAX(s->ps_ctx->solve_order[0], s->ps_ctx->solve_order[1]); i++) {
+        fprintf(stderr, "%d ", i);
+        if (i < s->ps_ctx->solve_order[0])
+            fprintf(stderr, "%g\t", s->ps_ctx->colloc_grid[0][i]);
+        else
+            fprintf(stderr, "\t\t");
+        if (i < s->ps_ctx->solve_order[1])
+            fprintf(stderr, "%g\t", s->ps_ctx->colloc_grid[1][i]);
+        fprintf(stderr, "\n");
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->eq_coeffs); i++) {
+        ret = posix_memalign((void**)&s->eq_coeffs[i], 32,
+                             NB_COLLOC_POINTS(ctx) * sizeof(*s->eq_coeffs[i]));
+        if (ret)
+            goto fail;
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->interp_coords); i++) {
+        ret |= posix_memalign((void**)&s->interp_coords[i], 32,
+                              NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_coords[i]));
+    }
+    if (ret)
+        goto fail;
+
+    for (int i = 0; i < ctx->nb_colloc_points[1]; i++) {
+        for (int j = 0; j < ctx->nb_colloc_points[0]; j++) {
+#if MS_POLAR
+            double phi = s->ps_ctx->colloc_grid[1][i];
+            double r   = s->ps_ctx->colloc_grid[0][j];
+
+            double x = r * cos(phi);
+            double z = r * sin(phi);
+#else
+            double x = s->ps_ctx->colloc_grid[0][j];
+            double z = s->ps_ctx->colloc_grid[1][i];
+#endif
+
+            s->interp_coords[0][i * ctx->nb_colloc_points[0] + j] = x;
+            s->interp_coords[1][i * ctx->nb_colloc_points[0] + j] = 0;
+            s->interp_coords[2][i * ctx->nb_colloc_points[0] + j] = z;
+        }
+    }
+
+    ret = posix_memalign((void**)&s->coeff_scale, 32, NB_COEFFS(ctx) * sizeof(*s->coeff_scale));
+    if (ret)
+        goto fail;
+    for (int j = 0; j < ctx->nb_coeffs[1]; j++)
+        for (int i = 0; i < ctx->nb_coeffs[0]; i++) {
+            s->coeff_scale[j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) *
+                                                        exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power));
+        }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->interp_values); i++) {
+        ret = posix_memalign((void**)&s->interp_values[i], 32,
+                             NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_values[i]));
+        if (ret)
+            goto fail;
+        s->interp_value_codes[i] = CCTK_VARIABLE_REAL;
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) {
+        s->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
+        if (s->interp_vars_indices[i] < 0)
+            CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]);
+    }
+
+    s->coord_system = CCTK_CoordSystemHandle("cart3d");
+    if (s->coord_system < 0)
+        CCTK_WARN(0, "Error getting the coordinate system");
+
+    s->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)");
+    if (s->interp_operator < 0)
+        CCTK_WARN(0, "Error getting the interpolation operator");
+
+    s->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1");
+    if (s->interp_params < 0)
+        CCTK_WARN(0, "Error creating interpolation parameters table");
+
+    ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+                                interp_operation_codes, "operation_codes");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operation codes");
+
+    ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+                                interp_operation_indices, "operand_indices");
+    if (ret < 0)
+        CCTK_WARN(0, "Error setting operand indices");
+
+    CCTK_TimerCreate("MaximalSlicing_Solve");
+    CCTK_TimerCreate("MaximalSlicing_Expand");
+    CCTK_TimerCreate("MaximalSlicing_interp_geometry");
+    CCTK_TimerCreate("MaximalSlicing_calc_eq_coeffs");
+    CCTK_TimerCreate("MaximalSlicing_construct_matrix");
+    CCTK_TimerCreate("MaximalSlicing_solve_LU");
+    CCTK_TimerCreate("MaximalSlicing_solve_BiCGSTAB");
+
+    *pctx = ctx;
+    return 0;
+fail:
+    ms_solver_free(&ctx);
+    return -ENOMEM;
+}
+
+void ms_solver_free(MSSolver **pctx)
+{
+    MSSolver *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    if (ctx->priv) {
+        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_coords); i++)
+            free(ctx->priv->interp_coords[i]);
+        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->interp_values); i++)
+            free(ctx->priv->interp_values[i]);
+        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->eq_coeffs); i++)
+            free(ctx->priv->eq_coeffs[i]);
+        free(ctx->priv->rhs);
+        free(ctx->priv->coeff_scale);
+
+        ms_pssolve_context_free(&ctx->priv->ps_ctx);
+
+#if HAVE_OPENCL
+        if (ctx->priv->ocl_queue)
+            clReleaseCommandQueue(ctx->priv->ocl_queue);
+        if (ctx->priv->ocl_ctx)
+            clReleaseContext(ctx->priv->ocl_ctx);
+#endif
+    }
+
+    free(ctx->priv);
+
+    free(ctx->coeffs);
+
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/src/ms_solve.h b/src/ms_solve.h
new file mode 100644
index 0000000..2ebf1e1
--- /dev/null
+++ b/src/ms_solve.h
@@ -0,0 +1,52 @@
+/*
+ * Maximal slicing -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MS_SOLVE_H
+#define MS_SOLVE_H
+
+#include "common.h"
+
+#include "cctk.h"
+
+#include "basis.h"
+
+typedef struct MSSolverPriv MSSolverPriv;
+
+typedef struct MSSolver {
+    MSSolverPriv *priv;
+
+    const BasisSet *basis[2];
+
+    int nb_coeffs[2];
+    int nb_colloc_points[2];
+
+    double *coeffs;
+} MSSolver;
+
+int ms_solver_init(MSSolver **ctx,
+                   cGH *cctkGH,
+                   int basis_order_r, int basis_order_z,
+                   double sf, double filter_power, double input_filter_power);
+
+void ms_solver_free(MSSolver **ctx);
+
+int ms_solver_solve(MSSolver *ctx);
+
+void ms_solver_print_stats(MSSolver *ctx);
+
+#endif /* MS_SOLVE_H */
diff --git a/src/pssolve.c b/src/pssolve.c
new file mode 100644
index 0000000..9d6124c
--- /dev/null
+++ b/src/pssolve.c
@@ -0,0 +1,389 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include "bicgstab.h"
+#include "pssolve.h"
+
+#define NB_COEFFS(priv)        (priv->nb_coeffs[0]        * priv->nb_coeffs[1])
+#define NB_COLLOC_POINTS(priv) (priv->nb_colloc_points[0] * priv->nb_colloc_points[1])
+
+struct PSSolvePriv {
+    BiCGStabContext *bicgstab;
+    int steps_since_inverse;
+
+    int nb_coeffs[2];
+    int nb_colloc_points[2];
+    int colloc_grid_order[2];
+
+    double *basis_val[PSSOLVE_DIFF_ORDER_NB];
+
+    int *ipiv;
+    double *mat;
+};
+
+static int construct_matrix(PSSolveContext *ctx,
+                            const double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB])
+{
+    double *mat = ctx->priv->mat;
+    int idx_coeff, idx_grid;
+
+#pragma omp parallel for simd
+    for (idx_coeff = 0; idx_coeff < NB_COEFFS(ctx->priv); idx_coeff++)
+        for (idx_grid = 0; idx_grid < NB_COLLOC_POINTS(ctx->priv); idx_grid++) {
+            const int idx      = idx_grid + NB_COLLOC_POINTS(ctx->priv) * idx_coeff;
+            double val = 0.0;
+
+            for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++)
+                val += eq_coeffs[i][idx_grid] * ctx->priv->basis_val[i][idx];
+
+            mat[idx] = val;
+        }
+
+    return 0;
+}
+
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+    char  equed = 'N';
+    double cond, ferr, berr, rpivot;
+
+    double *mat_f, *x;
+    int ret = 0;
+#if 0
+    LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+                  mat, N, ipiv, rhs, N);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+#else
+    mat_f = malloc(SQR(N) * sizeof(*mat_f));
+    x     = malloc(N * sizeof(*x));
+
+    //{
+    //    int i, j;
+    //    for (i = 0; i < N; i++) {
+    //        for (j = 0; j < N; j++)
+    //            fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
+    //        fprintf(stderr, "\n");
+    //    }
+    //}
+    //{
+    //    double *mat_copy = malloc(SQR(N) * sizeof(double));
+    //    double *svd = malloc(N * sizeof(double));
+    //    double *rhs_copy = malloc(N * sizeof(double));
+    //    int rank;
+
+    //    memcpy(mat_copy, mat, SQR(N) * sizeof(double));
+    //    memcpy(rhs_copy, rhs, N * sizeof(double));
+
+    //    LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
+    //                   svd, 1e-13, &rank);
+
+    //    free(mat_copy);
+    //    for (int i = 0; i < N; i++) {
+    //        if (i > 5 && i < N - 5)
+    //            continue;
+
+    //        fprintf(stderr, "%g\t", svd[i]);
+    //    }
+    //    fprintf(stderr, "\n rank %d\n", rank);
+    //    free(svd);
+    //    free(rhs_copy);
+
+    //    if (rank < N)
+    //        ret = 1;
+    //}
+
+    //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+    //              mat, N, ipiv, rhs, N);
+    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+                   mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+                   rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+    memcpy(rhs, x, N * sizeof(double));
+    memcpy(mat, mat_f, SQR(N) * sizeof(double));
+
+    fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: "
+            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+            N, N, cond, ferr, berr);
+
+    free(mat_f);
+    free(x);
+#endif
+
+    return ret;
+}
+
+int ms_pssolve_solve(PSSolveContext *ctx,
+                     const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB],
+                     const double *rhs, double *coeffs)
+{
+    PSSolvePriv *s = ctx->priv;
+    const int N = NB_COEFFS(s);
+    double rhs_max;
+    int64_t start;
+
+    int ret = 0;
+
+    /* fill the matrix */
+    CCTK_TimerStart("QuasiMaximalSlicing_construct_matrix");
+    start = gettime();
+    ret = construct_matrix(ctx, eq_coeffs);
+    ctx->construct_matrix_time += gettime() - start;
+    ctx->construct_matrix_count++;
+    CCTK_TimerStop("QuasiMaximalSlicing_construct_matrix");
+    if (ret < 0)
+        return ret;
+
+#if 0
+    if (rhs_max < EPS) {
+        fprintf(stderr, "zero rhs\n");
+        memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+        if (ms->cl_queue) {
+            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
+                                 ms->coeffs, 0, NULL, NULL);
+        }
+        return 0;
+    }
+#endif
+
+    /* solve for the coeffs */
+    if (s->steps_since_inverse < 1024) {
+        int64_t start;
+
+        start = gettime();
+
+        CCTK_TimerStart("QuasiMaximalSlicing_solve_BiCGSTAB");
+        ret = ms_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs);
+        CCTK_TimerStop("QuasiMaximalSlicing_solve_BiCGSTAB");
+
+        if (ret >= 0) {
+            ctx->cg_time_total += gettime() - start;
+            ctx->cg_solve_count++;
+            ctx->cg_iter_count += ret + 1;
+            s->steps_since_inverse++;
+
+        }
+    } else
+        ret = -1;
+
+    if (ret < 0) {
+        int64_t start;
+
+        CCTK_TimerStart("QuasiMaximalSlicing_solve_LU");
+        start = gettime();
+
+        memcpy(coeffs, rhs, N * sizeof(*rhs));
+
+        ret = lu_invert(N, s->mat, coeffs, s->ipiv);
+        ctx->lu_solves_time += gettime() - start;
+        ctx->lu_solves_count++;
+        CCTK_TimerStop("QuasiMaximalSlicing_solve_LU");
+
+        ret = ms_bicgstab_init(s->bicgstab, s->mat, coeffs);
+
+        s->steps_since_inverse = 0;
+    }
+
+    return ret;
+}
+
+int ms_pssolve_context_init(PSSolveContext *ctx)
+{
+    PSSolvePriv *s = ctx->priv;
+    double *basis_val[2][3] = { { NULL } };
+
+    int ret = 0;
+
+    if (ctx->solve_order[0] <= 0 || ctx->solve_order[1] <= 0)
+        return -EINVAL;
+    s->nb_coeffs[0]         = ctx->solve_order[0];
+    s->nb_coeffs[1]         = ctx->solve_order[1];
+    s->nb_colloc_points[0]  = ctx->solve_order[0];
+    s->nb_colloc_points[1]  = ctx->solve_order[1];
+    s->colloc_grid_order[0] = ctx->solve_order[0];
+    s->colloc_grid_order[1] = ctx->solve_order[1];
+
+    s->steps_since_inverse = INT_MAX;
+
+    /* init the BiCGStab solver */
+    ret = ms_bicgstab_context_alloc(&s->bicgstab, NB_COEFFS(s), ctx->ocl_ctx,
+                                    ctx->ocl_queue);
+    if (ret < 0)
+        return ret;
+
+    /* compute the collocation grid */
+    posix_memalign((void**)&ctx->colloc_grid[0], 32, s->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[0]));
+    posix_memalign((void**)&ctx->colloc_grid[1], 32, s->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[1]));
+    if (!ctx->colloc_grid[0] || !ctx->colloc_grid[1])
+        return -ENOMEM;
+
+    for (int i = 0; i < s->nb_colloc_points[0]; i++)
+        ctx->colloc_grid[0][i] = ctx->basis[0]->colloc_point(s->colloc_grid_order[0], i);
+    for (int i = 0; i < s->nb_colloc_points[1]; i++)
+        ctx->colloc_grid[1][i] = ctx->basis[1]->colloc_point(s->colloc_grid_order[1], i);
+
+    /* precompute the basis values we will need */
+    for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) {
+        for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) {
+            int ret = posix_memalign((void**)&basis_val[i][j], 32,
+                                     sizeof(*basis_val[i][j]) * s->nb_coeffs[i] * s->nb_colloc_points[i]);
+            if (ret) {
+                ret = -ENOMEM;
+                goto fail;
+            }
+        }
+
+        for (int j = 0; j < s->nb_colloc_points[i]; j++) {
+            double coord = ctx->colloc_grid[i][j];
+            for (int k = 0; k < s->nb_coeffs[i]; k++) {
+                basis_val[i][0][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval      (coord, k);
+                basis_val[i][1][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff1(coord, k);
+                basis_val[i][2][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff2(coord, k);
+            }
+        }
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->basis_val); i++) {
+        ret = posix_memalign((void**)&s->basis_val[i], 32, NB_COLLOC_POINTS(s) * NB_COEFFS(s) * sizeof(*s->basis_val[i]));
+        if (ret) {
+            ret = -ENOMEM;
+            goto fail;
+        }
+    }
+
+    for (int i = 0; i < s->nb_colloc_points[1]; i++) {
+        const double *basis_z   = basis_val[1][0] + i * s->nb_coeffs[1];
+        const double *dbasis_z  = basis_val[1][1] + i * s->nb_coeffs[1];
+        const double *d2basis_z = basis_val[1][2] + i * s->nb_coeffs[1];
+
+        for (int j = 0; j < s->nb_colloc_points[0]; j++) {
+            const double *basis_x   = basis_val[0][0] + j * s->nb_coeffs[0];
+            const double *dbasis_x  = basis_val[0][1] + j * s->nb_coeffs[0];
+            const double *d2basis_x = basis_val[0][2] + j * s->nb_coeffs[0];
+            const int idx_grid = i * s->nb_colloc_points[0] + j;
+
+#if MS_POLAR
+            double r     = ctx->colloc_grid[0][j];
+            double theta = ctx->colloc_grid[1][i];
+
+            double x = r * cos(theta);
+            double z = r * sin(theta);
+#else
+            double x = ctx->colloc_grid[0][j];
+            double z = ctx->colloc_grid[1][i];
+#endif
+
+            for (int k = 0; k < s->nb_coeffs[1]; k++)
+                for (int l = 0; l < s->nb_coeffs[0]; l++) {
+                    const int idx_coeff = k * s->nb_coeffs[0] + l;
+                    const int idx = idx_grid + NB_COLLOC_POINTS(s) * idx_coeff;
+                    s->basis_val[PSSOLVE_DIFF_ORDER_00][idx] =   basis_x[l] *   basis_z[k];
+#if MS_POLAR
+                    s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] =  ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] =  ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] =  ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                           + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k]
+                                                           + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
+                                                           - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] =  ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                           + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k]
+                                                           - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
+                                                           + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] =  ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                           - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k]
+                                                           - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k]
+                                                           + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+#else
+                    s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] =  dbasis_x[l] *   basis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] =   basis_x[l] *  dbasis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = d2basis_x[l] *   basis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] =   basis_x[l] * d2basis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] =  dbasis_x[l] *  dbasis_z[k];
+#endif
+                }
+        }
+    }
+
+    ret  = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * NB_COEFFS(s));
+    ret |= posix_memalign((void**)&s->mat,  32, sizeof(*s->mat)  * NB_COEFFS(s) * NB_COLLOC_POINTS(s));
+    if (ret) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+fail:
+    for (int i = 0; i < ARRAY_ELEMS(basis_val); i++)
+        for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++)
+            free(basis_val[i][j]);
+
+    return ret;
+}
+
+int ms_pssolve_context_alloc(PSSolveContext **pctx)
+{
+    PSSolveContext *ctx = calloc(1, sizeof(*ctx));
+
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->priv = calloc(1, sizeof(*ctx->priv));
+    if (!ctx->priv)
+        goto fail;
+
+    *pctx = ctx;
+    return 0;
+fail:
+    ms_pssolve_context_free(&ctx);
+    return -ENOMEM;
+}
+
+void ms_pssolve_context_free(PSSolveContext **pctx)
+{
+    PSSolveContext *ctx = *pctx;
+
+    if (!ctx)
+        return;
+
+    if (ctx->priv) {
+        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++)
+            free(ctx->priv->basis_val[i]);
+
+        free(ctx->priv->ipiv);
+        free(ctx->priv->mat);
+
+        ms_bicgstab_context_free(&ctx->priv->bicgstab);
+    }
+
+    free(ctx->priv);
+
+    free(ctx->colloc_grid[0]);
+    free(ctx->colloc_grid[1]);
+
+    free(ctx);
+    *pctx = NULL;
+}
diff --git a/src/pssolve.h b/src/pssolve.h
new file mode 100644
index 0000000..544988b
--- /dev/null
+++ b/src/pssolve.h
@@ -0,0 +1,116 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MS_PSSOLVE_H
+#define MS_PSSOLVE_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+#include <stdint.h>
+
+#include "basis.h"
+
+enum PSSolveDiffOrder {
+    PSSOLVE_DIFF_ORDER_00,
+    PSSOLVE_DIFF_ORDER_10,
+    PSSOLVE_DIFF_ORDER_01,
+    PSSOLVE_DIFF_ORDER_11,
+    PSSOLVE_DIFF_ORDER_20,
+    PSSOLVE_DIFF_ORDER_02,
+    PSSOLVE_DIFF_ORDER_NB,
+};
+
+typedef struct PSSolvePriv PSSolvePriv;
+
+typedef struct PSSolveContext {
+    /**
+     * Solver private data, not to be touched by the caller.
+     */
+    PSSolvePriv *priv;
+
+    /**
+     * The basis sets to be used in each direction.
+     * Set by the caller before ms_pssolve_context_init().
+     */
+    const BasisSet *basis[2];
+
+    /**
+     * Order of the solver in each direction.
+     * Set by the caller before ms_pssolve_context_init().
+     */
+    int solve_order[2];
+
+    /**
+     * Locations of the collocation points in each direction. The equation
+     * coefficients passed to ms_pssolve_solve() should be evaluated at those
+     * grid positions.
+     *
+     * Set by the solver after ms_pssolve_context_init(). Each array is
+     * solve_order[i]-sized.
+     */
+    double *colloc_grid[2];
+
+    cl_context       ocl_ctx;
+    cl_command_queue ocl_queue;
+
+    uint64_t lu_solves_count;
+    uint64_t lu_solves_time;
+
+    uint64_t cg_solve_count;
+    uint64_t cg_iter_count;
+    uint64_t cg_time_total;
+
+    uint64_t construct_matrix_count;
+    uint64_t construct_matrix_time;
+} PSSolveContext;
+
+/**
+ * Allocate a new solver.
+ */
+int ms_pssolve_context_alloc(PSSolveContext **ctx);
+
+/**
+ * Initialize the solver for use after all the context options have been set.
+ */
+int ms_pssolve_context_init(PSSolveContext *ctx);
+
+/**
+ * Free the solver and all its internal state.
+ */
+void ms_pssolve_context_free(PSSolveContext **ctx);
+
+/**
+ * Solve a second order linear PDE in 2D with a pseudospectral method.
+ *
+ * @param eq_coeffs the coefficients of each derivative term at the collocation
+ *                  points.
+ * @param rhs the right-hand side of the equation at the collocation points.
+ * @param coeffs the spectral coefficients of the solution will be written here.
+ */
+int ms_pssolve_solve(PSSolveContext *ctx,
+                      const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB],
+                      const double *rhs, double *coeffs);
+
+#endif /* MS_PSSOLVE_H */
diff --git a/src/solve.c b/src/solve.c
index 610dc0d..a1bee5e 100644
--- a/src/solve.c
+++ b/src/solve.c
@@ -1,4 +1,5 @@
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -11,7 +12,7 @@
 
 #include "maximal_slicing_axi.h"
 
-static int construct_matrix(MaximalSlicingContext *ms, double *mat,
+static int construct_matrix_cylindric(MaximalSlicingContext *ms, double *mat,
                             double *rhs, double *prhs_max)
 {
     int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
@@ -29,7 +30,7 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat,
 #pragma omp parallel for reduction(max : rhs_max)
     for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) {
         for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) {
-            CCTK_REAL x_physical = ms->grid_x[idx_grid_x];
+            CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
             int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
 
             const double gtuxx         = ms->metric_u[0][idx_grid];
@@ -56,47 +57,48 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat,
             const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
             const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
 
-            const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy);
+            //const double tr_ric        = ms->interp_values[I_TR_R][idx_grid];
+
+            const double coeff_20 = SQR(phi) * (gtuxx + ((x_physical <= EPS) ? gtuyy : 0.0));
             const double coeff_02 = SQR(phi) * gtuzz;
             const double coeff_11 = SQR(phi) * gtuxz * 2;
-            const double coeff_10 = -Xx + (x_physical > EPS) * SQR(phi) * gtuyy / x_physical;
+            const double coeff_10 = -Xx + ((x_physical > EPS) ? SQR(phi) * gtuyy / x_physical : 0.0);
             const double coeff_01 = -Xz;
+
             const double coeff_00 = -k2;
+            //const double coeff_00 = -(tr_ric + SQR(trk));
 
 #if 1
             for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
                 for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                     const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
 
-                    //double d2alpha =     gtuxx * D2BASIS_X *   BASIS_Z
-                    //               +     gtuzz *   BASIS_X * D2BASIS_Z
-                    //               + 2 * gtuxz *  DBASIS_X *  DBASIS_Z;
-                    //if (x_physical > EPS)
-                    //    d2alpha += gtuyy *  DBASIS_X *   BASIS_Z / x_physical;
-                    //else
-                    //    d2alpha += gtuyy * D2BASIS_X * BASIS_Z;
+#if 0
+                    double d2alpha =     gtuxx * D2BASIS_X *   BASIS_Z
+                                   +     gtuzz *   BASIS_X * D2BASIS_Z
+                                   + 2 * gtuxz *  DBASIS_X *  DBASIS_Z;
+                    if (x_physical > EPS)
+                        d2alpha += gtuyy *  DBASIS_X *   BASIS_Z / x_physical;
+                    else
+                        d2alpha += gtuyy * D2BASIS_X * BASIS_Z;
 
-                    //double curv_term = Xx * DBASIS_X * BASIS_Z + Xz * BASIS_X * DBASIS_Z;
+                    double curv_term = (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi) * DBASIS_X *  BASIS_Z +
+                                       (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi) *  BASIS_X * DBASIS_Z;
 
 
-                    //double D2alpha = SQR(phi) * d2alpha - curv_term;
+                    double D2alpha = SQR(phi) * (d2alpha - curv_term);
 
-                    //mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2;
+                    mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2;
+#else
                     mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 *   BASIS_X *   BASIS_Z +
                                                                        coeff_10 *  DBASIS_X *   BASIS_Z +
                                                                        coeff_01 *   BASIS_X *  DBASIS_Z +
                                                                        coeff_11 *  DBASIS_X *  DBASIS_Z +
                                                                        coeff_20 * D2BASIS_X *   BASIS_Z +
                                                                        coeff_02 *   BASIS_X * D2BASIS_Z;
+#endif
                 }
 #else
-
-            const double coeff_20 = SQR(phi) * (gtuxx + (x_physical <= EPS) * gtuyy);
-            const double coeff_02 = SQR(phi) * gtuzz;
-            const double coeff_11 = SQR(phi) * gtuxz * 2;
-            const double coeff_10 = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi + (x_physical > EPS) * gtuyy);
-            const double coeff_01 = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
-            const double coeff_00 = -k2;
             cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
             cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
             cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
@@ -105,8 +107,11 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat,
             cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
 #endif
 
-            rhs[idx_grid] = k2 + trk ;// betax * trk_dx + betaz * trk_dz;
-            //rhs[idx_grid] = k2;
+            rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz;
+            //rhs[idx_grid] = 0.0;
+
+            //rhs[idx_grid] = tr_ric + SQR(trk) + betax * trk_dx + betaz * trk_dz;
+
             rhs_max = MAX(rhs_max, fabs(rhs[idx_grid]));
             //rhs_max = fabs(rhs[idx_grid]);
         }
@@ -123,8 +128,220 @@ static int construct_matrix(MaximalSlicingContext *ms, double *mat,
     return 0;
 }
 
+static int construct_matrix_polar(MaximalSlicingContext *ms, double *mat,
+                                  double *rhs, double *prhs_max)
+{
+    int idx_coeff, idx_grid;
+
+#pragma omp parallel for
+    for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++)
+        for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) {
+            const int idx      = idx_grid + ms->nb_colloc_points * idx_coeff;
+
+            mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] +
+                       ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] +
+                       ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] +
+                       ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] +
+                       ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] +
+                       ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx];
+        }
+
+    //*prhs_max = ms->rhs[cblas_idamax(ms->nb_colloc_points, ms->rhs, 1)];
+
+    return 0;
+}
+
+static int construct_matrix_boundary(MaximalSlicingContext *ms, double *mat,
+                                     double *rhs, double *prhs_max)
+{
+    int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
+    double rhs_max = 0.0;
+
+#define   BASIS_X (ms->basis_x_val  [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define  DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
+#define   BASIS_Z (ms->basis_z_val  [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+#define  DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
+
+#pragma omp parallel for
+    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
+        for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
+            CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
+            int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+            const double gtuxx         = ms->metric_u[0][idx_grid];
+            const double gtuyy         = ms->metric_u[1][idx_grid];
+            const double gtuzz         = ms->metric_u[2][idx_grid];
+            const double gtuxz         = ms->metric_u[4][idx_grid];
+
+            const double phi           = ms->interp_values[I_PHI][idx_grid];
+            const double phi_dx        = ms->interp_values[I_PHI_DX][idx_grid];
+            const double phi_dz        = ms->interp_values[I_PHI_DZ][idx_grid];
+
+            const double Xtx           = ms->interp_values[I_XTX][idx_grid];
+            const double Xtz           = ms->interp_values[I_XTZ][idx_grid];
+
+            const double k2            = ms->kij_kij[idx_grid];
+            const double trk           = ms->interp_values[I_K][idx_grid];
+
+            const double trk_dx        = ms->interp_values[I_K_DX][idx_grid];
+            const double trk_dz        = ms->interp_values[I_K_DZ][idx_grid];
+
+            const double betax         = ms->interp_values[I_BETAX][idx_grid];
+            const double betaz         = ms->interp_values[I_BETAZ][idx_grid];
+
+            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+            const double coeff_20 = SQR(phi) * (gtuxx + ((fabs(x_physical) <= EPS) ? gtuyy : 0));
+            const double coeff_02 = SQR(phi) * gtuzz;
+            const double coeff_11 = SQR(phi) * gtuxz * 2;
+            const double coeff_10 = -Xx + ((fabs(x_physical) > EPS) ? (SQR(phi) * gtuyy / x_physical) : 0);
+            const double coeff_01 = -Xz;
+
+            const double coeff_00 = -k2;
+
+            for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+                for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                    const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                    mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 *   BASIS_X *   BASIS_Z +
+                                                                       coeff_10 *  DBASIS_X *   BASIS_Z +
+                                                                       coeff_01 *   BASIS_X *  DBASIS_Z +
+                                                                       coeff_11 *  DBASIS_X *  DBASIS_Z +
+                                                                       coeff_20 * D2BASIS_X *   BASIS_Z +
+                                                                       coeff_02 *   BASIS_X * D2BASIS_Z;
+                }
+
+            rhs[idx_grid] = 0.0;
+            //rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz;
+
+            //rhs_max = MAX(rhs_max, fabs(rhs[idx_grid]));
+            //rhs_max = fabs(rhs[idx_grid]);
+        }
+    }
+
+#if 1
+    // z = 0 axis
+    idx_grid_z = 0;
+    for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
+        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+        int idx_grid1        = (ms->nb_colloc_points_z - 2) * ms->nb_colloc_points_x + idx_grid_x;
+
+        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = BASIS_X * DBASIS_Z;
+            }
+        rhs[idx_grid1] = 0.0;
+    }
+
+    // x = 0 axis
+    idx_grid_x = 0;
+    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
+        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+        int idx_grid1        = idx_grid_z * ms->nb_colloc_points_x + ms->nb_colloc_points_x - 2;
+
+        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z;
+            }
+        rhs[idx_grid1] = 0.0;
+    }
+
+    // z = z_max axis
+    idx_grid_z = ms->nb_colloc_points_z - 1;
+    for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
+        CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z];
+
+        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z; //+ z_physical * BASIS_X * DBASIS_Z;
+            }
+        rhs[idx_grid] = 1.0;
+    }
+
+    // x = x_max axis
+    idx_grid_x = ms->nb_colloc_points_x - 1;
+    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
+        CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
+
+        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+                mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;// + x_physical * DBASIS_X * BASIS_Z;
+            }
+        rhs[idx_grid] = 1.0;
+    }
+
+    //idx_grid_x = 0;
+    //idx_grid_z = 0;
+    //{
+    //    int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+    //    for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+    //        for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+    //            const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+    //            mat[idx_grid + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z + BASIS_X * DBASIS_Z;
+    //        }
+    //    rhs[idx_grid] = 0.0;
+    //}
+
+    //idx_grid_x = ms->nb_colloc_points_x - 1;
+    //idx_grid_z = ms->nb_colloc_points_z - 1;
+    //{
+    //    CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z];
+    //    CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
+    //    CCTK_REAL r2 = SQR(z_physical) + SQR(x_physical);
+    //    int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
+
+    //    for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
+    //        for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
+    //            const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
+
+    //            mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z + 0.5 * r2 / z_physical * BASIS_X * DBASIS_Z + 0.5 * r2 / x_physical * DBASIS_X * BASIS_Z;
+    //        }
+    //    rhs[idx_grid] = 1.0;
+    //}
+#endif
+
+    *prhs_max = rhs_max;
+
+#if 0
+    fprintf(stderr, "matrix\n");
+    for (int i = 0; i < ms->nb_colloc_points; i++) {
+        for (int j = 0; j < ms->nb_coeffs; j++)
+            fprintf(stderr, "%+08.04g ", mat[i + ms->nb_colloc_points * j]);
+        fprintf(stderr, "\n");
+    }
+
+    fprintf(stderr, "rhs\n");
+    for (int i = 0; i < ms->nb_colloc_points; i++)
+        fprintf(stderr, "%+08.04g ", rhs[i]);
+    fprintf(stderr, "\n");
+#endif
 
-static int calc_geometry(MaximalSlicingContext *ms)
+    return 0;
+}
+
+static int construct_matrix(MaximalSlicingContext *ms)
+{
+    return construct_matrix_polar(ms, ms->mat, NULL, NULL);
+}
+/* interpolate the cactus gridfunctions onto the pseudospectral grid */
+static int interp_geometry(MaximalSlicingContext *ms)
 {
     int ret;
 
@@ -135,7 +352,22 @@ static int calc_geometry(MaximalSlicingContext *ms)
     if (ret < 0)
         CCTK_WARN(0, "Error interpolating");
 
-#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x)
+    //CCTK_TimerStart("MaximalSlicingAxi_filter_input");
+    //{
+    //    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    //                ms->nb_colloc_points, ARRAY_ELEMS(ms->interp_values), ms->nb_colloc_points,
+    //                1.0, ms->input_filter, ms->nb_colloc_points, ms->interp_buffer_prefilter, ms->nb_colloc_points,
+    //                0.0, ms->interp_buffer, ms->nb_colloc_points);
+    //}
+    //CCTK_TimerStop("MaximalSlicingAxi_filter_input");
+
+    return 0;
+}
+
+static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max)
+{
+    double rhs_max = 0.0;
+#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max)
     for (int i = 0; i < ms->nb_colloc_points; i++) {
         CCTK_REAL Am[3][3], gtu[3][3];
         CCTK_REAL a2;
@@ -191,16 +423,56 @@ static int calc_geometry(MaximalSlicingContext *ms)
             for (int k = 0; k < 3; k++)
                 a2 += Am[j][k] * Am[k][j];
 
-        ms->metric_u[0][i] = gtu[0][0];
-        ms->metric_u[1][i] = gtu[1][1];
-        ms->metric_u[2][i] = gtu[2][2];
-        ms->metric_u[3][i] = gtu[0][1];
-        ms->metric_u[4][i] = gtu[0][2];
-        ms->metric_u[5][i] = gtu[1][2];
+        {
+            double x = ms->interp_coords[0][i];
+            double z = ms->interp_coords[2][i];
+
+            const double gtuxx         = gtu[0][0];
+            const double gtuyy         = gtu[1][1];
+            const double gtuzz         = gtu[2][2];
+            const double gtuxz         = gtu[0][2];
+
+            const double phi           = ms->interp_values[I_PHI][i];
+            const double phi_dx        = ms->interp_values[I_PHI_DX][i];
+            const double phi_dz        = ms->interp_values[I_PHI_DZ][i];
+
+            const double Xtx           = ms->interp_values[I_XTX][i];
+            const double Xtz           = ms->interp_values[I_XTZ][i];
+
+            const double k2            = a2 + SQR(trK) / 3.;
+
+            const double trk_dx        = ms->interp_values[I_K_DX][i];
+            const double trk_dz        = ms->interp_values[I_K_DZ][i];
+
+            const double betax         = ms->interp_values[I_BETAX][i];
+            const double betaz         = ms->interp_values[I_BETAZ][i];
+
+            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+            ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0));
+            ms->eq_coeff_02[i] = SQR(phi) * gtuzz;
+            ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2;
+            ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0);
+            ms->eq_coeff_01[i] = -Xz;
+            ms->eq_coeff_00[i] = -k2;
+            ms->rhs[i]         = k2 + trK + betax * trk_dx + betaz * trk_dz;
+
+            rhs_max = MAX(rhs_max, fabs(ms->rhs[i]));
+        }
+
+        //ms->metric_u[0][i] = gtu[0][0];
+        //ms->metric_u[1][i] = gtu[1][1];
+        //ms->metric_u[2][i] = gtu[2][2];
+        //ms->metric_u[3][i] = gtu[0][1];
+        //ms->metric_u[4][i] = gtu[0][2];
+        //ms->metric_u[5][i] = gtu[1][2];
 
-        ms->kij_kij[i] = a2 + SQR(trK) / 3.;
+        //ms->kij_kij[i] = a2 + SQR(trK) / 3.;
     }
 
+    *prhs_max = rhs_max;
+
     return 0;
 }
 
@@ -274,8 +546,6 @@ static int solve_bicgstab(BiCGSTABContext *ctx, const int N,
     if (i == MAXITER)
         return -1;
 
-    ctx->iter_total += i + 1;
-
     return i;
 }
 
@@ -293,12 +563,9 @@ static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
 
     cl_event events[8];
 
-    // upload the matrix and the RHS to the GPU
-    // k and x are assumed to be already uploaded
-    clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double),
-                         rhs, 0, NULL, &events[0]);
-    clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double),
-                         mat, 0, NULL, &events[1]);
+    // the matrix, rhs, k and x are assumed to be already uploaded
+    clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs,     0, NULL, &events[0]);
+    clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
 
     // initialize the residual
     clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
@@ -393,18 +660,77 @@ static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
     if (i == MAXITER)
         return -1;
 
-    ctx->iter_total += i + 1;
-
     return i;
 }
 
 static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
 {
+    char  equed = 'N';
+    double cond, ferr, berr, rpivot;
+
+    double *mat_f, *x;
+    int ret = 0;
+#if 1
     LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
                   mat, N, ipiv, rhs, N);
     LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+#else
+    mat_f = malloc(SQR(N) * sizeof(*mat_f));
+    x     = malloc(N * sizeof(*x));
+
+    //{
+    //    int i, j;
+    //    for (i = 0; i < N; i++) {
+    //        for (j = 0; j < N; j++)
+    //            fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
+    //        fprintf(stderr, "\n");
+    //    }
+    //}
+    {
+        double *mat_copy = malloc(SQR(N) * sizeof(double));
+        double *svd = malloc(N * sizeof(double));
+        double *rhs_copy = malloc(N * sizeof(double));
+        int rank;
+
+        memcpy(mat_copy, mat, SQR(N) * sizeof(double));
+        memcpy(rhs_copy, rhs, N * sizeof(double));
+
+        LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
+                       svd, 1e-13, &rank);
+
+        free(mat_copy);
+        for (int i = 0; i < N; i++) {
+            if (i > 5 && i < N - 5)
+                continue;
+
+            fprintf(stderr, "%g\t", svd[i]);
+        }
+        fprintf(stderr, "\n rank %d\n", rank);
+        free(svd);
+        free(rhs_copy);
 
-    return 0;
+        if (rank < N)
+            ret = 1;
+    }
+
+    //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+    //              mat, N, ipiv, rhs, N);
+    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+                   mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+                   rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+    memcpy(rhs, x, N * sizeof(double));
+    memcpy(mat, mat_f, SQR(N) * sizeof(double));
+
+    fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: "
+            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+            N, N, cond, ferr, berr);
+
+    free(mat_f);
+    free(x);
+#endif
+
+    return ret;
 }
 
 /*
@@ -420,23 +746,40 @@ int msa_maximal_solve(MaximalSlicingContext *ms)
 {
     const int N = ms->nb_coeffs;
     double rhs_max;
+    int64_t start;
 
     int ret = 0;
 
     /* interpolate the metric values and construct the quantities we'll need */
-    CCTK_TimerStart("MaximalSlicingAxi_calc_geometry");
-    ret = calc_geometry(ms);
-    CCTK_TimerStop("MaximalSlicingAxi_calc_geometry");
+    CCTK_TimerStart("MaximalSlicingAxi_interp_geometry");
+    start = gettime();
+    ret = interp_geometry(ms);
+    ms->interp_geometry_time += gettime() - start;
+    ms->interp_geometry_count++;
+    CCTK_TimerStop("MaximalSlicingAxi_interp_geometry");
+    if (ret < 0)
+        return ret;
+
+    CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs");
+    start = gettime();
+    ret = calc_eq_coeffs(ms, &rhs_max);
+    ms->calc_eq_coeffs_time += gettime() - start;
+    ms->calc_eq_coeffs_count++;
+    CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs");
     if (ret < 0)
         return ret;
 
     /* fill the matrix */
     CCTK_TimerStart("MaximalSlicingAxi_construct_matrix");
-    ret = construct_matrix(ms, ms->mat, ms->rhs, &rhs_max);
+    start = gettime();
+    ret = construct_matrix(ms);
+    ms->construct_matrix_time += gettime() - start;
+    ms->construct_matrix_count++;
     CCTK_TimerStop("MaximalSlicingAxi_construct_matrix");
     if (ret < 0)
         return ret;
 
+#if 1
     if (rhs_max < EPS) {
         memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
         if (ms->cl_queue) {
@@ -445,9 +788,10 @@ int msa_maximal_solve(MaximalSlicingContext *ms)
         }
         return 0;
     }
+#endif
 
     /* solve for the coeffs */
-    if (ms->steps_since_inverse < 128) {
+    if (ms->steps_since_inverse < 1024) {
         BiCGSTABContext *b = &ms->bicgstab;
         int64_t start = gettime();
 
@@ -461,15 +805,11 @@ int msa_maximal_solve(MaximalSlicingContext *ms)
         CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB");
 
         if (ret >= 0) {
-            b->time_total += gettime() - start;
-            b->solve_total++;
+            ms->cg_time_total += gettime() - start;
+            ms->cg_solve_count++;
+            ms->cg_iter_count += ret + 1;
             ms->steps_since_inverse++;
 
-            if (!(b->solve_total & 127)) {
-                fprintf(stderr, "BiCGSTAB %ld solves, %ld iterations, total time %ld, avg iterations per solve %g, avg time per solve %g, avg time per iteration %g\n",
-                        b->solve_total, b->iter_total, b->time_total, (double)b->iter_total / b->solve_total, (double)b->time_total / b->solve_total, (double)b->time_total / b->iter_total);
-                fprintf(stderr, "LU %ld solves, total time %ld, avg time per solve %g\n", ms->lu_solves_total, ms->lu_solves_time, (double)ms->lu_solves_time / ms->lu_solves_total);
-            }
 #if 0
             {
                 double min, max;
@@ -493,9 +833,9 @@ int msa_maximal_solve(MaximalSlicingContext *ms)
         CCTK_TimerStart("MaximalSlicingAxi_solve_LU");
         start = gettime();
 
-        lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
+        ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
         ms->lu_solves_time += gettime() - start;
-        ms->lu_solves_total++;
+        ms->lu_solves_count++;
         CCTK_TimerStop("MaximalSlicingAxi_solve_LU");
 
         tmpv = ms->coeffs;
@@ -506,6 +846,11 @@ int msa_maximal_solve(MaximalSlicingContext *ms)
         ms->mat = ms->bicgstab.k;
         ms->bicgstab.k = tmpm;
 
+        if (ret == 1) {
+            memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+            ms->coeffs[0] = 1.0;
+        }
+
         if (ms->cl_queue) {
             cl_event events[2];
             clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double),
@@ -518,6 +863,10 @@ int msa_maximal_solve(MaximalSlicingContext *ms)
         ms->steps_since_inverse = 0;
     }
 
+    for (int i = 0; i < N; i++)
+        ms->coeffs[i] *= ms->coeff_scale[i];
+
+    //fprintf(stderr, "solve %g %g\n", ms->gh->cctk_time, ms->coeffs[0]);
 
     return ret;
 }
-- 
cgit v1.2.3