#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cblas.h>
#include <lapacke.h>

#include <cl.h>
#include <clBLAS.h>

#include "maximal_slicing_axi.h"

static int construct_matrix_cylindric(MaximalSlicingContext *ms, double *mat,
                            double *rhs, double *prhs_max)
{
    int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
    double rhs_max = 0.0;

#define   BASIS_X (ms->basis_x_val  [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
#define  DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
#define   BASIS_Z (ms->basis_z_val  [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
#define  DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])

    //memset(mat, 0, sizeof(*mat) * ms->nb_coeffs * ms->nb_colloc_points);

#pragma omp parallel for reduction(max : rhs_max)
    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z; idx_grid_z++) {
        for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x; idx_grid_x++) {
            CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
            int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;

            const double gtuxx         = ms->metric_u[0][idx_grid];
            const double gtuyy         = ms->metric_u[1][idx_grid];
            const double gtuzz         = ms->metric_u[2][idx_grid];
            const double gtuxz         = ms->metric_u[4][idx_grid];

            const double phi           = ms->interp_values[I_PHI][idx_grid];
            const double phi_dx        = ms->interp_values[I_PHI_DX][idx_grid];
            const double phi_dz        = ms->interp_values[I_PHI_DZ][idx_grid];

            const double Xtx           = ms->interp_values[I_XTX][idx_grid];
            const double Xtz           = ms->interp_values[I_XTZ][idx_grid];

            const double k2            = ms->kij_kij[idx_grid];
            const double trk           = ms->interp_values[I_K][idx_grid];

            const double trk_dx        = ms->interp_values[I_K_DX][idx_grid];
            const double trk_dz        = ms->interp_values[I_K_DZ][idx_grid];

            const double betax         = ms->interp_values[I_BETAX][idx_grid];
            const double betaz         = ms->interp_values[I_BETAZ][idx_grid];

            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);

            //const double tr_ric        = ms->interp_values[I_TR_R][idx_grid];

            const double coeff_20 = SQR(phi) * (gtuxx + ((x_physical <= EPS) ? gtuyy : 0.0));
            const double coeff_02 = SQR(phi) * gtuzz;
            const double coeff_11 = SQR(phi) * gtuxz * 2;
            const double coeff_10 = -Xx + ((x_physical > EPS) ? SQR(phi) * gtuyy / x_physical : 0.0);
            const double coeff_01 = -Xz;

            const double coeff_00 = -k2;
            //const double coeff_00 = -(tr_ric + SQR(trk));

#if 1
            for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
                for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                    const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

#if 0
                    double d2alpha =     gtuxx * D2BASIS_X *   BASIS_Z
                                   +     gtuzz *   BASIS_X * D2BASIS_Z
                                   + 2 * gtuxz *  DBASIS_X *  DBASIS_Z;
                    if (x_physical > EPS)
                        d2alpha += gtuyy *  DBASIS_X *   BASIS_Z / x_physical;
                    else
                        d2alpha += gtuyy * D2BASIS_X * BASIS_Z;

                    double curv_term = (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi) * DBASIS_X *  BASIS_Z +
                                       (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi) *  BASIS_X * DBASIS_Z;


                    double D2alpha = SQR(phi) * (d2alpha - curv_term);

                    mat[idx_grid + ms->nb_colloc_points * idx_coeff] = D2alpha - BASIS_X * BASIS_Z * k2;
#else
                    mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 *   BASIS_X *   BASIS_Z +
                                                                       coeff_10 *  DBASIS_X *   BASIS_Z +
                                                                       coeff_01 *   BASIS_X *  DBASIS_Z +
                                                                       coeff_11 *  DBASIS_X *  DBASIS_Z +
                                                                       coeff_20 * D2BASIS_X *   BASIS_Z +
                                                                       coeff_02 *   BASIS_X * D2BASIS_Z;
#endif
                }
#else
            cblas_daxpy(ms->nb_coeffs, coeff_20, ms->basis_val_20 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
            cblas_daxpy(ms->nb_coeffs, coeff_02, ms->basis_val_02 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
            cblas_daxpy(ms->nb_coeffs, coeff_11, ms->basis_val_11 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
            cblas_daxpy(ms->nb_coeffs, coeff_10, ms->basis_val_10 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
            cblas_daxpy(ms->nb_coeffs, coeff_01, ms->basis_val_01 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
            cblas_daxpy(ms->nb_coeffs, coeff_00, ms->basis_val_00 + idx_grid, ms->nb_colloc_points, mat + idx_grid, ms->nb_colloc_points);
#endif

            rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz;
            //rhs[idx_grid] = 0.0;

            //rhs[idx_grid] = tr_ric + SQR(trk) + betax * trk_dx + betaz * trk_dz;

            rhs_max = MAX(rhs_max, fabs(rhs[idx_grid]));
            //rhs_max = fabs(rhs[idx_grid]);
        }
    }

    //memcpy(rhs, ms->kij_kij, sizeof(*rhs) * ms->nb_colloc_points);
    //cblas_daxpy(ms->nb_colloc_points, 1.0, ms->interp_values[I_K], 1, rhs, 1);
    //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAX], 1, ms->interp_values[I_K_DX], 1, 1.0, rhs, 1);
    //cblas_dsbmv(CblasColMajor, CblasUpper, ms->nb_colloc_points, 0, 1.0, ms->interp_values[I_BETAZ], 1, ms->interp_values[I_K_DZ], 1, 1.0, rhs, 1);

    //*prhs_max = rhs[cblas_idamax(ms->nb_colloc_points, rhs, 1)];
    *prhs_max = rhs_max;

    return 0;
}

static int construct_matrix_polar(MaximalSlicingContext *ms, double *mat,
                                  double *rhs, double *prhs_max)
{
    int idx_coeff, idx_grid;

#pragma omp parallel for
    for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++)
        for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) {
            const int idx      = idx_grid + ms->nb_colloc_points * idx_coeff;

            mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] +
                       ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] +
                       ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] +
                       ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] +
                       ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] +
                       ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx];
        }

    //*prhs_max = ms->rhs[cblas_idamax(ms->nb_colloc_points, ms->rhs, 1)];

    return 0;
}

static int construct_matrix_boundary(MaximalSlicingContext *ms, double *mat,
                                     double *rhs, double *prhs_max)
{
    int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
    double rhs_max = 0.0;

#define   BASIS_X (ms->basis_x_val  [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
#define  DBASIS_X (ms->basis_x_dval [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
#define D2BASIS_X (ms->basis_x_d2val[idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
#define   BASIS_Z (ms->basis_z_val  [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
#define  DBASIS_Z (ms->basis_z_dval [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
#define D2BASIS_Z (ms->basis_z_d2val[idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])

#pragma omp parallel for
    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
        for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
            CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
            int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;

            const double gtuxx         = ms->metric_u[0][idx_grid];
            const double gtuyy         = ms->metric_u[1][idx_grid];
            const double gtuzz         = ms->metric_u[2][idx_grid];
            const double gtuxz         = ms->metric_u[4][idx_grid];

            const double phi           = ms->interp_values[I_PHI][idx_grid];
            const double phi_dx        = ms->interp_values[I_PHI_DX][idx_grid];
            const double phi_dz        = ms->interp_values[I_PHI_DZ][idx_grid];

            const double Xtx           = ms->interp_values[I_XTX][idx_grid];
            const double Xtz           = ms->interp_values[I_XTZ][idx_grid];

            const double k2            = ms->kij_kij[idx_grid];
            const double trk           = ms->interp_values[I_K][idx_grid];

            const double trk_dx        = ms->interp_values[I_K_DX][idx_grid];
            const double trk_dz        = ms->interp_values[I_K_DZ][idx_grid];

            const double betax         = ms->interp_values[I_BETAX][idx_grid];
            const double betaz         = ms->interp_values[I_BETAZ][idx_grid];

            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);

            const double coeff_20 = SQR(phi) * (gtuxx + ((fabs(x_physical) <= EPS) ? gtuyy : 0));
            const double coeff_02 = SQR(phi) * gtuzz;
            const double coeff_11 = SQR(phi) * gtuxz * 2;
            const double coeff_10 = -Xx + ((fabs(x_physical) > EPS) ? (SQR(phi) * gtuyy / x_physical) : 0);
            const double coeff_01 = -Xz;

            const double coeff_00 = -k2;

            for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
                for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                    const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

                    mat[idx_grid + ms->nb_colloc_points * idx_coeff] = coeff_00 *   BASIS_X *   BASIS_Z +
                                                                       coeff_10 *  DBASIS_X *   BASIS_Z +
                                                                       coeff_01 *   BASIS_X *  DBASIS_Z +
                                                                       coeff_11 *  DBASIS_X *  DBASIS_Z +
                                                                       coeff_20 * D2BASIS_X *   BASIS_Z +
                                                                       coeff_02 *   BASIS_X * D2BASIS_Z;
                }

            rhs[idx_grid] = 0.0;
            //rhs[idx_grid] = k2 + trk + betax * trk_dx + betaz * trk_dz;

            //rhs_max = MAX(rhs_max, fabs(rhs[idx_grid]));
            //rhs_max = fabs(rhs[idx_grid]);
        }
    }

#if 1
    // z = 0 axis
    idx_grid_z = 0;
    for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
        int idx_grid1        = (ms->nb_colloc_points_z - 2) * ms->nb_colloc_points_x + idx_grid_x;

        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

                mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = BASIS_X * DBASIS_Z;
            }
        rhs[idx_grid1] = 0.0;
    }

    // x = 0 axis
    idx_grid_x = 0;
    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
        int idx_grid1        = idx_grid_z * ms->nb_colloc_points_x + ms->nb_colloc_points_x - 2;

        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

                mat[idx_grid1 + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z;
            }
        rhs[idx_grid1] = 0.0;
    }

    // z = z_max axis
    idx_grid_z = ms->nb_colloc_points_z - 1;
    for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
        CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z];

        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;

        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

                mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z; //+ z_physical * BASIS_X * DBASIS_Z;
            }
        rhs[idx_grid] = 1.0;
    }

    // x = x_max axis
    idx_grid_x = ms->nb_colloc_points_x - 1;
    for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
        CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];

        int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;

        for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
            for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
                const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

                mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;// + x_physical * DBASIS_X * BASIS_Z;
            }
        rhs[idx_grid] = 1.0;
    }

    //idx_grid_x = 0;
    //idx_grid_z = 0;
    //{
    //    int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;

    //    for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
    //        for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
    //            const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

    //            mat[idx_grid + ms->nb_colloc_points * idx_coeff] = DBASIS_X * BASIS_Z + BASIS_X * DBASIS_Z;
    //        }
    //    rhs[idx_grid] = 0.0;
    //}

    //idx_grid_x = ms->nb_colloc_points_x - 1;
    //idx_grid_z = ms->nb_colloc_points_z - 1;
    //{
    //    CCTK_REAL z_physical = ms->colloc_grid[1][idx_grid_z];
    //    CCTK_REAL x_physical = ms->colloc_grid[0][idx_grid_x];
    //    CCTK_REAL r2 = SQR(z_physical) + SQR(x_physical);
    //    int idx_grid         = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;

    //    for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
    //        for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
    //            const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;

    //            mat[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z + 0.5 * r2 / z_physical * BASIS_X * DBASIS_Z + 0.5 * r2 / x_physical * DBASIS_X * BASIS_Z;
    //        }
    //    rhs[idx_grid] = 1.0;
    //}
#endif

    *prhs_max = rhs_max;

#if 0
    fprintf(stderr, "matrix\n");
    for (int i = 0; i < ms->nb_colloc_points; i++) {
        for (int j = 0; j < ms->nb_coeffs; j++)
            fprintf(stderr, "%+08.04g ", mat[i + ms->nb_colloc_points * j]);
        fprintf(stderr, "\n");
    }

    fprintf(stderr, "rhs\n");
    for (int i = 0; i < ms->nb_colloc_points; i++)
        fprintf(stderr, "%+08.04g ", rhs[i]);
    fprintf(stderr, "\n");
#endif

    return 0;
}

static int construct_matrix(MaximalSlicingContext *ms)
{
    return construct_matrix_polar(ms, ms->mat, NULL, NULL);
}
/* interpolate the cactus gridfunctions onto the pseudospectral grid */
static int interp_geometry(MaximalSlicingContext *ms)
{
    int ret;

    ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params,
                                ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL,
                                (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices,
                                ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values);
    if (ret < 0)
        CCTK_WARN(0, "Error interpolating");

    //CCTK_TimerStart("MaximalSlicingAxi_filter_input");
    //{
    //    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
    //                ms->nb_colloc_points, ARRAY_ELEMS(ms->interp_values), ms->nb_colloc_points,
    //                1.0, ms->input_filter, ms->nb_colloc_points, ms->interp_buffer_prefilter, ms->nb_colloc_points,
    //                0.0, ms->interp_buffer, ms->nb_colloc_points);
    //}
    //CCTK_TimerStop("MaximalSlicingAxi_filter_input");

    return 0;
}

static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max)
{
    double rhs_max = 0.0;
#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max)
    for (int i = 0; i < ms->nb_colloc_points; i++) {
        CCTK_REAL Am[3][3], gtu[3][3];
        CCTK_REAL a2;

        CCTK_REAL gtxx = ms->interp_values[I_GTXX][i];
        CCTK_REAL gtyy = ms->interp_values[I_GTYY][i];
        CCTK_REAL gtzz = ms->interp_values[I_GTZZ][i];
        CCTK_REAL gtxy = ms->interp_values[I_GTXY][i];
        CCTK_REAL gtxz = ms->interp_values[I_GTXZ][i];
        CCTK_REAL gtyz = ms->interp_values[I_GTYZ][i];

        CCTK_REAL Atxx = ms->interp_values[I_ATXX][i];
        CCTK_REAL Atyy = ms->interp_values[I_ATYY][i];
        CCTK_REAL Atzz = ms->interp_values[I_ATZZ][i];
        CCTK_REAL Atxy = ms->interp_values[I_ATXY][i];
        CCTK_REAL Atxz = ms->interp_values[I_ATXZ][i];
        CCTK_REAL Atyz = ms->interp_values[I_ATYZ][i];

        CCTK_REAL At[3][3] = {{ Atxx, Atxy, Atxz },
                              { Atxy, Atyy, Atyz },
                              { Atxz, Atyz, Atzz }};

        CCTK_REAL trK  = ms->interp_values[I_K][i];

        CCTK_REAL Xtx  = ms->interp_values[I_XTX][i];
        CCTK_REAL Xtz  = ms->interp_values[I_XTZ][i];

        CCTK_REAL det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);

        // \tilde{γ}^{ij}
        gtu[0][0] =  (gtyy * gtzz - SQR(gtyz)) / det;
        gtu[1][1] =  (gtxx * gtzz - SQR(gtxz)) / det;
        gtu[2][2] =  (gtxx * gtyy - SQR(gtxy)) / det;
        gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
        gtu[0][2] =  (gtxy * gtyz - gtyy * gtxz) / det;
        gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
        gtu[1][0] = gtu[0][1];
        gtu[2][0] = gtu[0][2];
        gtu[2][1] = gtu[1][2];

        // \tilde{A}_{i}^j
        for (int j = 0; j < 3; j++)
            for (int k = 0; k < 3; k++) {
                double val = 0.0;
                for (int l = 0; l < 3; l++)
                    val += gtu[j][l] * At[l][k];
                Am[j][k] = val;
            }

        // K_{ij} K^{ij}
        a2 = 0.0;
        for (int j = 0; j < 3; j++)
            for (int k = 0; k < 3; k++)
                a2 += Am[j][k] * Am[k][j];

        {
            double x = ms->interp_coords[0][i];
            double z = ms->interp_coords[2][i];

            const double gtuxx         = gtu[0][0];
            const double gtuyy         = gtu[1][1];
            const double gtuzz         = gtu[2][2];
            const double gtuxz         = gtu[0][2];

            const double phi           = ms->interp_values[I_PHI][i];
            const double phi_dx        = ms->interp_values[I_PHI_DX][i];
            const double phi_dz        = ms->interp_values[I_PHI_DZ][i];

            const double Xtx           = ms->interp_values[I_XTX][i];
            const double Xtz           = ms->interp_values[I_XTZ][i];

            const double k2            = a2 + SQR(trK) / 3.;

            const double trk_dx        = ms->interp_values[I_K_DX][i];
            const double trk_dz        = ms->interp_values[I_K_DZ][i];

            const double betax         = ms->interp_values[I_BETAX][i];
            const double betaz         = ms->interp_values[I_BETAZ][i];

            const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
            const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);

            ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0));
            ms->eq_coeff_02[i] = SQR(phi) * gtuzz;
            ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2;
            ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0);
            ms->eq_coeff_01[i] = -Xz;
            ms->eq_coeff_00[i] = -k2;
            ms->rhs[i]         = k2 + trK + betax * trk_dx + betaz * trk_dz;

            rhs_max = MAX(rhs_max, fabs(ms->rhs[i]));
        }

        //ms->metric_u[0][i] = gtu[0][0];
        //ms->metric_u[1][i] = gtu[1][1];
        //ms->metric_u[2][i] = gtu[2][2];
        //ms->metric_u[3][i] = gtu[0][1];
        //ms->metric_u[4][i] = gtu[0][2];
        //ms->metric_u[5][i] = gtu[1][2];

        //ms->kij_kij[i] = a2 + SQR(trK) / 3.;
    }

    *prhs_max = rhs_max;

    return 0;
}

// based on the wikipedia article
// and http://www.netlib.org/templates/matlab/bicgstab.m
static int solve_bicgstab(BiCGSTABContext *ctx, const int N,
                          double *mat, double *rhs, double *x)
{
    const double rhs_norm = cblas_dnrm2(N, rhs, 1);

    double rho, rho_prev = 1.0;
    double omega = 1.0;
    double alpha = 1.0;

    double err;
    int i;

    double *k = ctx->k;
    double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
    double *res = ctx->res, *res0 = ctx->res0;

    // initialize the residual
    memcpy(res, rhs, N * sizeof(*res));
    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
                mat, N, x, 1, 1.0, res, 1);

    memcpy(res0, res, N * sizeof(*res0));
    memcpy(p,    res, N * sizeof(*p));

#define MAXITER 16
#define TOL (1e-15)
    for (i = 0; i < MAXITER; i++) {
        rho = cblas_ddot(N, res, 1, res0, 1);

        if (i) {
            double beta = (rho / rho_prev) * (alpha / omega);

            cblas_daxpy(N, -omega, v, 1, p, 1);
            cblas_dscal(N, beta, p, 1);
            cblas_daxpy(N, 1, res, 1, p, 1);
        }

        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    k, N, p, 1, 0.0, y, 1);

        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    mat, N, y, 1, 0.0, v, 1);

        alpha = rho / cblas_ddot(N, res0, 1, v, 1);

        cblas_daxpy(N, -alpha, v, 1, res, 1);

        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    k, N, res, 1, 0.0, z, 1);
        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    mat, N, z, 1, 0.0, t, 1);

        omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);

        cblas_daxpy(N, alpha, y, 1, x, 1);
        cblas_daxpy(N, omega, z, 1, x, 1);

        cblas_daxpy(N, -omega, t, 1, res, 1);

        err = cblas_dnrm2(N, res, 1) / rhs_norm;
        if (err < TOL)
            break;

        rho_prev = rho;
    }
    if (i == MAXITER)
        return -1;

    return i;
}

static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
                             const int N, double *mat, double *rhs, cl_mem ocl_x)
{
    const double rhs_norm = cblas_dnrm2(N, rhs, 1);

    double rho, rho_prev = 1.0;
    double omega[2] = { 1.0 };
    double alpha = 1.0;

    double err;
    int i;

    cl_event events[8];

    // the matrix, rhs, k and x are assumed to be already uploaded
    clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs,     0, NULL, &events[0]);
    clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);

    // initialize the residual
    clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
                ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
                1, &cl_q, 2, events, &events[2]);
    clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
                        1, &events[2], &events[3]);
    clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
                        1, &events[2], &events[4]);

    clWaitForEvents(5, events);
    // BARRIER

#define MAXITER 16
#define TOL (1e-15)
    for (i = 0; i < MAXITER; i++) {
        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
                   ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]);
        clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
                            1, &events[0], NULL);
        // BARRIER

        if (i) {
            double beta = (rho / rho_prev) * (alpha / omega[0]);

            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
                        1, &cl_q, 0, NULL, &events[0]);
            clblasDscal(N, beta, ctx->cl_p, 0, 1,
                        1, &cl_q, 1, &events[0], &events[1]);
            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
                        1, &cl_q, 1, &events[1], &events[0]);
            clWaitForEvents(1, &events[0]);
            // BARRIER
        }

        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
                    1, &cl_q, 0, NULL, &events[0]);

        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
                    1, &cl_q, 1, &events[0], &events[1]);

        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
                   ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]);
        clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
                            1, &events[0], NULL);
        // BARRIER

        alpha = rho / alpha;

        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
                    1, &cl_q, 0, NULL, &events[0]);

        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
                    1, &cl_q, 1, &events[0], &events[1]);
        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
                    1, &cl_q, 1, &events[1], &events[0]);

        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
                   ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]);
        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
                   ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]);

        clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
                            2, &events[1], NULL);
        // BARRIER

        omega[0] /= omega[1];

        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1,
                    1, &cl_q, 0, NULL, &events[0]);
        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1,
                    1, &cl_q, 1, &events[0], &events[1]);

        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
                    1, &cl_q, 0, NULL, &events[0]);
        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
                    1, &cl_q, 1, &events[0], &events[2]);
        clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
                            1, &events[2], NULL);
        clWaitForEvents(1, &events[1]);
        // BARRIER

        if (err < TOL)
            break;

        rho_prev = rho;
    }
    if (i == MAXITER)
        return -1;

    return i;
}

static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
{
    char  equed = 'N';
    double cond, ferr, berr, rpivot;

    double *mat_f, *x;
    int ret = 0;
#if 1
    LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
                  mat, N, ipiv, rhs, N);
    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
#else
    mat_f = malloc(SQR(N) * sizeof(*mat_f));
    x     = malloc(N * sizeof(*x));

    //{
    //    int i, j;
    //    for (i = 0; i < N; i++) {
    //        for (j = 0; j < N; j++)
    //            fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
    //        fprintf(stderr, "\n");
    //    }
    //}
    {
        double *mat_copy = malloc(SQR(N) * sizeof(double));
        double *svd = malloc(N * sizeof(double));
        double *rhs_copy = malloc(N * sizeof(double));
        int rank;

        memcpy(mat_copy, mat, SQR(N) * sizeof(double));
        memcpy(rhs_copy, rhs, N * sizeof(double));

        LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
                       svd, 1e-13, &rank);

        free(mat_copy);
        for (int i = 0; i < N; i++) {
            if (i > 5 && i < N - 5)
                continue;

            fprintf(stderr, "%g\t", svd[i]);
        }
        fprintf(stderr, "\n rank %d\n", rank);
        free(svd);
        free(rhs_copy);

        if (rank < N)
            ret = 1;
    }

    //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
    //              mat, N, ipiv, rhs, N);
    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
                   mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
                   rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
    memcpy(rhs, x, N * sizeof(double));
    memcpy(mat, mat_f, SQR(N) * sizeof(double));

    fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: "
            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
            N, N, cond, ferr, berr);

    free(mat_f);
    free(x);
#endif

    return ret;
}

/*
 * Solve the equation
 * D²α - KᵢⱼKⁱʲα = -K
 * for the coefficients of spectral approximation of α:
 * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z)
 * where i =  { 0, ... , ms->nb_coeffs_x };
 *       j =  { 0, ... , ms->nb_coeffs_z };
 * Tᵢ(x) are defined by ms->basis.
 */
int msa_maximal_solve(MaximalSlicingContext *ms)
{
    const int N = ms->nb_coeffs;
    double rhs_max;
    int64_t start;

    int ret = 0;

    /* interpolate the metric values and construct the quantities we'll need */
    CCTK_TimerStart("MaximalSlicingAxi_interp_geometry");
    start = gettime();
    ret = interp_geometry(ms);
    ms->interp_geometry_time += gettime() - start;
    ms->interp_geometry_count++;
    CCTK_TimerStop("MaximalSlicingAxi_interp_geometry");
    if (ret < 0)
        return ret;

    CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs");
    start = gettime();
    ret = calc_eq_coeffs(ms, &rhs_max);
    ms->calc_eq_coeffs_time += gettime() - start;
    ms->calc_eq_coeffs_count++;
    CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs");
    if (ret < 0)
        return ret;

    /* fill the matrix */
    CCTK_TimerStart("MaximalSlicingAxi_construct_matrix");
    start = gettime();
    ret = construct_matrix(ms);
    ms->construct_matrix_time += gettime() - start;
    ms->construct_matrix_count++;
    CCTK_TimerStop("MaximalSlicingAxi_construct_matrix");
    if (ret < 0)
        return ret;

#if 1
    if (rhs_max < EPS) {
        memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
        if (ms->cl_queue) {
            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
                                 ms->coeffs, 0, NULL, NULL);
        }
        return 0;
    }
#endif

    /* solve for the coeffs */
    if (ms->steps_since_inverse < 1024) {
        BiCGSTABContext *b = &ms->bicgstab;
        int64_t start = gettime();

        CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB");
        if (ms->cl_queue) {
            ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs);
            clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N,
                                ms->coeffs, 0, NULL, NULL);
        } else
            ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs);
        CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB");

        if (ret >= 0) {
            ms->cg_time_total += gettime() - start;
            ms->cg_solve_count++;
            ms->cg_iter_count += ret + 1;
            ms->steps_since_inverse++;

#if 0
            {
                double min, max;
                gsl_vector_memcpy(b->y, ms->rhs);
                cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0,
                            ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1);
                gsl_vector_minmax(b->y, &min, &max);
                if (fabs(min) > 1e-11 || fabs(max) > 1e-11)
                    abort();
            }
#endif
        }
    } else
        ret = -1;

    if (ret < 0) {
        double *tmpv;
        double *tmpm;
        int64_t start;

        CCTK_TimerStart("MaximalSlicingAxi_solve_LU");
        start = gettime();

        ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
        ms->lu_solves_time += gettime() - start;
        ms->lu_solves_count++;
        CCTK_TimerStop("MaximalSlicingAxi_solve_LU");

        tmpv = ms->coeffs;
        ms->coeffs = ms->rhs;
        ms->rhs = tmpv;

        tmpm = ms->mat;
        ms->mat = ms->bicgstab.k;
        ms->bicgstab.k = tmpm;

        if (ret == 1) {
            memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
            ms->coeffs[0] = 1.0;
        }

        if (ms->cl_queue) {
            cl_event events[2];
            clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double),
                                 ms->bicgstab.k, 0, NULL, &events[0]);
            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double),
                                 ms->coeffs, 0, NULL, &events[1]);
            clWaitForEvents(2, events);
        }

        ms->steps_since_inverse = 0;
    }

    for (int i = 0; i < N; i++)
        ms->coeffs[i] *= ms->coeff_scale[i];

    //fprintf(stderr, "solve %g %g\n", ms->gh->cctk_time, ms->coeffs[0]);

    return ret;
}