#include <ctype.h>
#include <errno.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_linalg.h>

#include <lapacke.h>

#include "cctk.h"
#include "cctk_Arguments.h"
#include "cctk_Parameters.h"

#define ACC_TEST 1

#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) > (y) ? (y) : (x))
#define SQR(x) ((x) * (x))
#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)

/*
 * small number to avoid r=0 singularities
 */
#define EPS 1E-08

/* a set of basis functions */
typedef struct BasisSet {
    /* evaluate the idx-th basis function at the specified point*/
    long double (*eval)      (long double coord, int idx);
    /* evaluate the first derivative of the idx-th basis function at the specified point*/
    long double (*eval_diff1)(long double coord, int idx);
    /* evaluate the second derivative of the idx-th basis function at the specified point*/
    long double (*eval_diff2)(long double coord, int idx);
    /**
     * Get the idx-th collocation point for the specified order.
     * idx runs from 0 to order - 1 (inclusive)
     */
    long double (*colloc_point)(int order, int idx);
} BasisSet;

/*
 * The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
 * SB(x, n) = sin((n + 1) arccot(|x| / L))
 * They are symmetric wrt origin and decay as 1/x in infinity.
 */

static CCTK_REAL scale_factor;

#define SCALE_FACTOR scale_factor

static long double sb_even_eval(long double coord, int idx)
{
    long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));

    idx *= 2;   // even only

    return sinl((idx + 1) * val);
}

static long double sb_even_eval_diff1(long double coord, int idx)
{
    long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));

    idx *= 2;   // even only

    return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cosl((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
}

static long double sb_even_eval_diff2(long double coord, int idx)
{
    long double val = (coord == 0.0) ? M_PI_2 : atanl(SCALE_FACTOR / fabsl(coord));

    idx *= 2;   // even only

    return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabsl(coord) * cosl((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sinl((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
}

static long double sb_even_colloc_point(int order, int idx)
{
    long double t;

    idx = order - idx - 1;
    order *= 2;

    t = (idx + 2) * M_PI / (order + 4);
    return SCALE_FACTOR / tanl(t);
}

static const BasisSet sb_even_basis = {
    .eval         = sb_even_eval,
    .eval_diff1   = sb_even_eval_diff1,
    .eval_diff2   = sb_even_eval_diff2,
    .colloc_point = sb_even_colloc_point,
};

typedef struct BrillDataContext {
    /* options */
    CCTK_REAL amplitude;

    double (*q_func)(struct BrillDataContext *bd, double rho, double z);
    double (*laplace_q_func)(struct BrillDataContext *bd, double rho, double z);

    /* state */
    const BasisSet *basis;

    int nb_coeffs_x;
    int nb_coeffs_z;
    int nb_coeffs;

    int nb_colloc_points_x;
    int nb_colloc_points_z;
    int nb_colloc_points;

    int colloc_grid_order_x;
    int colloc_grid_order_z;

    gsl_vector *psi_coeffs;
} BrillDataContext;

static int brill_construct_matrix(BrillDataContext *bd, gsl_matrix *mat,
                                  gsl_vector *rhs)
{
    CCTK_REAL *basis_val, *basis_dval, *basis_d2val;
    int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;

    /* precompute the basis values we will need */
    // FIXME: assumes same number of points in x and z directions
    basis_val   = malloc(sizeof(*basis_val)   * bd->nb_colloc_points_x * bd->nb_coeffs_x);
    basis_dval  = malloc(sizeof(*basis_dval)  * bd->nb_colloc_points_x * bd->nb_coeffs_x);
    basis_d2val = malloc(sizeof(*basis_d2val) * bd->nb_colloc_points_x * bd->nb_coeffs_x);
    for (int i = 0; i < bd->nb_colloc_points_x; i++) {
        CCTK_REAL coord = bd->basis->colloc_point(bd->colloc_grid_order_x, i);
        for (int j = 0; j < bd->nb_coeffs_x; j++) {
            basis_val  [i * bd->nb_coeffs_x + j] = bd->basis->eval(coord, j);
            basis_dval [i * bd->nb_coeffs_x + j] = bd->basis->eval_diff1(coord, j);
            basis_d2val[i * bd->nb_coeffs_x + j] = bd->basis->eval_diff2(coord, j);
        }
    }

    for (idx_grid_z = 0; idx_grid_z < bd->nb_colloc_points_z; idx_grid_z++) {
        CCTK_REAL z_physical = bd->basis->colloc_point(bd->colloc_grid_order_z, idx_grid_z);

        for (idx_grid_x = 0; idx_grid_x < bd->nb_colloc_points_x; idx_grid_x++) {
            CCTK_REAL x_physical = bd->basis->colloc_point(bd->colloc_grid_order_x, idx_grid_x);
            CCTK_REAL d2q        = bd->laplace_q_func(bd, x_physical, z_physical);
            int idx_grid         = idx_grid_z * bd->nb_colloc_points_z + idx_grid_x;

            for (idx_coeff_z = 0; idx_coeff_z < bd->nb_coeffs_z; idx_coeff_z++)
                for (idx_coeff_x = 0; idx_coeff_x < bd->nb_coeffs_x; idx_coeff_x++) {
                    int idx_coeff = idx_coeff_z * bd->nb_coeffs_z + idx_coeff_x;

                    mat->data[idx_grid + mat->tda * idx_coeff] = basis_d2val[idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * basis_val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] * x_physical +
                                                                 basis_dval [idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * basis_val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] +
                                                                 basis_d2val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] * basis_val[idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * x_physical +
                                                                 basis_val  [idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * basis_val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] * 0.25 * d2q * x_physical;
                }
            rhs->data[idx_grid] = -0.25 * d2q * x_physical;
        }
    }

    free(basis_val);
    free(basis_dval);
    free(basis_d2val);

    return 0;
}

static int solve_linear(gsl_matrix *mat, gsl_vector **px, gsl_vector **prhs)
{
    int *ipiv;
    gsl_matrix *mat_f;
    double cond, ferr, berr, rpivot;

    gsl_vector   *x = *px;
    gsl_vector *rhs = *prhs;
    char      equed = 'N';
    int         ret = 0;

    ipiv  = malloc(mat->size1 * sizeof(*ipiv));
    mat_f = gsl_matrix_alloc(mat->size1, mat->size2);
    if (!ipiv || !mat_f) {
        ret = -ENOMEM;
        goto fail;
    }

    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', mat->size1, 1,
                   mat->data, mat->tda, mat_f->data, mat_f->tda, ipiv, &equed,
                   NULL, NULL, rhs->data, rhs->size, x->data, x->size,
                   &cond, &ferr, &berr, &rpivot);

    CCTK_VInfo(CCTK_THORNSTRING, "LU factorization solution to a %zdx%zd matrix: "
               "condition number %16.16g; forward error %16.16g backward error %16.16g",
               mat->size1, mat->size2, cond, ferr, berr);
fail:
    gsl_matrix_free(mat_f);
    free(ipiv);

    return ret;
}

static int solve_svd(gsl_matrix *mat, gsl_vector **px, gsl_vector **prhs)
{
    double *sv;
    int rank;

    gsl_vector   *x = *px;
    gsl_vector *rhs = *prhs;

    sv = malloc(sizeof(*sv) * x->size);
    if (!sv)
        return -ENOMEM;

    LAPACKE_dgelsd(LAPACK_COL_MAJOR, rhs->size, x->size, 1, mat->data, mat->tda,
                   rhs->data, rhs->size, sv, -1, &rank);

    CCTK_VInfo(CCTK_THORNSTRING, "Least squares SVD solution to a %zdx%zd matrix: "
               "rank %d, condition number %16.16g", mat->size1, mat->size2,
               rank, sv[x->size - 1] / sv[0]);

    gsl_vector_free(x);

    *px         = rhs;
    (*px)->size = mat->size1;
    *prhs       = NULL;

    free(sv);

    return 0;
}

/*
 * Solve the equation
 * Δψ + ¼ ψ (∂²q/∂r² + ∂²q/∂z²) = 0
 * for the coefficients of spectral approximation of ψ:
 * ψ(r, z) = 1 + ΣaᵢⱼTᵢ(r)Tⱼ(z)
 * where i =  { 0, ... , bd->nb_coeffs_x };
 *       j =  { 0, ... , bd->nb_coeffs_z };
 * q(r, z) and Tᵢ(x) are defined by bd->q_func, bd->laplace_q_func and
 * bd->basis.
 *
 * The cofficients are exported in bd->psi_coeffs
 */
static int brill_solve(BrillDataContext *bd)
{
    gsl_matrix *mat = NULL;
    gsl_vector *coeffs = NULL, *rhs = NULL;

#if ACC_TEST
    gsl_vector *rhs_bkp = NULL;
    gsl_matrix *mat_bkp = NULL;
#endif

    int ret = 0;

    /* allocate and fill the pseudospectral matrix */
    mat    = gsl_matrix_alloc (bd->nb_coeffs, bd->nb_colloc_points); // inverted order for lapack
    coeffs = gsl_vector_alloc (bd->nb_coeffs);
    rhs    = gsl_vector_calloc(bd->nb_colloc_points);
    if (!mat || !coeffs || !rhs) {
        ret = -ENOMEM;
        goto fail;
    }

    /* fill the matrix */
    ret = brill_construct_matrix(bd, mat, rhs);
    if (ret < 0)
        goto fail;

#if ACC_TEST
    /* make backups of the matrix and RHS, since they might be destroyed later */
    mat_bkp = gsl_matrix_alloc(mat->size2, mat->size1);
    rhs_bkp = gsl_vector_alloc(rhs->size);
    if (!mat_bkp || !rhs_bkp) {
        ret = -ENOMEM;
        goto fail;
    }

    gsl_vector_memcpy(rhs_bkp, rhs);
    gsl_matrix_transpose_memcpy(mat_bkp, mat);
#endif

    /* solve for the coeffs */
    if (bd->nb_colloc_points > bd->nb_coeffs)
        ret = solve_svd(mat, &coeffs, &rhs);
    else
        ret = solve_linear(mat, &coeffs, &rhs);

    /* export the result to the caller */
    bd->psi_coeffs = coeffs;

#if ACC_TEST
    {
        double min, max, rmin, rmax;

        gsl_vector_minmax(rhs_bkp, &rmin, &rmax);
        gsl_blas_dgemv(CblasNoTrans, 1.0, mat_bkp, coeffs, -1.0, rhs_bkp);
        gsl_vector_minmax(rhs_bkp, &min, &max);

        CCTK_VInfo(CCTK_THORNSTRING, "max(|A·x - rhs|) / max(|rhs|): %16.16g",
                   MAX(fabs(min), fabs(max)) / MAX(fabs(rmin), fabs(rmax)));
    }
#endif

fail:

#if ACC_TEST
    gsl_matrix_free(mat_bkp);
    gsl_vector_free(rhs_bkp);
#endif

    if (ret < 0)
        gsl_vector_free(coeffs);

    gsl_vector_free(rhs);
    gsl_matrix_free(mat);

    return ret;
}

// q function form from PHYSICAL REVIEW D 88, 103009 (2013)
// with σ and ρ_0 hardcoded to 1 for now
static double q_gundlach(BrillDataContext *bd, double rho, double z)
{
    return bd->amplitude * SQR(rho) * exp(- (SQR(rho) + SQR(z)));
}

static double laplace_q_gundlach(BrillDataContext *bd, double rho, double z)
{
    double r2 = SQR(rho);
    double z2 = SQR(z);
    double e  = exp(-r2 - z2);

    return 2 * bd->amplitude * e * (1.0 + 2 * r2 * (r2 + z2 - 3));
}

static BrillDataContext *init_bd(cGH *cctkGH, CCTK_REAL amplitude,
                                 int basis_order_r, int basis_order_z,
                                 int colloc_offset_r, int colloc_offset_z,
                                 double sf, int overdet)
{
    BrillDataContext *bd;

    if (basis_order_r != basis_order_z)
        CCTK_WARN(0, "Different r and z basis orders are not supported.");

    bd = malloc(sizeof(*bd));

    bd->q_func         = q_gundlach;
    bd->laplace_q_func = laplace_q_gundlach;
    bd->amplitude      = amplitude;

    bd->basis      = &sb_even_basis;

    bd->nb_coeffs_x = basis_order_r;
    bd->nb_coeffs_z = basis_order_z;

    bd->nb_coeffs = bd->nb_coeffs_x * bd->nb_coeffs_z;

    bd->nb_colloc_points_x = basis_order_r + overdet;
    bd->nb_colloc_points_z = basis_order_z + overdet;

    bd->nb_colloc_points = bd->nb_colloc_points_x * bd->nb_colloc_points_z;

    bd->colloc_grid_order_x = basis_order_r + colloc_offset_r;
    bd->colloc_grid_order_z = basis_order_z + colloc_offset_z;

    scale_factor = sf;

    return bd;
}

void brill_data(CCTK_ARGUMENTS)
{
    static BrillDataContext *prev_bd;

    DECLARE_CCTK_ARGUMENTS;
    DECLARE_CCTK_PARAMETERS;

    BrillDataContext *bd;

    long double *basis_val_r, *basis_val_z;

    int64_t grid_size = CCTK_GFINDEX3D(cctkGH,
                                       cctk_lsh[0] - 1,
                                       cctk_lsh[1] - 1,
                                       cctk_lsh[2] - 1) + 1;

    /* on the first run, solve the equation for ψ */
    if (!prev_bd) {
        bd = init_bd(cctkGH, amplitude, basis_order_r, basis_order_z,
                     colloc_grid_offset_r, colloc_grid_offset_z,
                     scale_factor, overdet);

        brill_solve(bd);

        if (export_coeffs)
            memcpy(brill_coeffs, bd->psi_coeffs->data, sizeof(*brill_coeffs) * bd->nb_coeffs);

        prev_bd = bd;
    } else
        bd = prev_bd;

    memset(kxx, 0, sizeof(*kxx) * grid_size);
    memset(kyy, 0, sizeof(*kyy) * grid_size);
    memset(kzz, 0, sizeof(*kzz) * grid_size);
    memset(kxy, 0, sizeof(*kxy) * grid_size);
    memset(kxz, 0, sizeof(*kxz) * grid_size);
    memset(kyz, 0, sizeof(*kyz) * grid_size);
    memset(gxz, 0, sizeof(*kxz) * grid_size);
    memset(gyz, 0, sizeof(*kyz) * grid_size);

    /* precompute the basis values on the grid points */
    basis_val_r = malloc(sizeof(*basis_val_r) * bd->nb_coeffs_x * cctk_lsh[1] * cctk_lsh[0]);
    basis_val_z = malloc(sizeof(*basis_val_z) * bd->nb_coeffs_z * cctk_lsh[2]);

    for (int i = 0; i < cctk_lsh[2]; i++) {
        CCTK_REAL zz = z[CCTK_GFINDEX3D(cctkGH, 0, 0, i)];
        for (int j = 0; j < bd->nb_coeffs_z; j++)
            basis_val_z[i * bd->nb_coeffs_z + j] = bd->basis->eval(zz, j);
    }

    for (int j = 0; j < cctk_lsh[1]; j++)
        for (int i = 0; i < cctk_lsh[0]; i++) {
            CCTK_REAL xx = x[CCTK_GFINDEX3D(cctkGH, i, j, 0)];
            CCTK_REAL yy = y[CCTK_GFINDEX3D(cctkGH, i, j, 0)];
            CCTK_REAL r = sqrt(SQR(xx) + SQR(yy));

            for (int k = 0; k < bd->nb_coeffs_x; k++)
                basis_val_r[(j * cctk_lsh[0] + i) * bd->nb_coeffs_x + k] = bd->basis->eval(r, k);
        }

#pragma omp parallel for
    for (int k = 0; k < cctk_lsh[2]; k++)
        for (int j = 0; j < cctk_lsh[1]; j++)
            for (int i = 0; i < cctk_lsh[0]; i++) {
                int    index = CCTK_GFINDEX3D(cctkGH, i, j, k);
                long double xx = x[index], yy = y[index], zz = z[index];

                long double r2 = SQR(xx) + SQR(yy);
                long double r  = sqrtl(r2);

                long double q   = bd->q_func(bd, r, zz);
                long double e2q = expl(2 * q);

                long double psi = 1.0, psi4;

                for (int n = 0; n < bd->nb_coeffs_z; n++)
                    for (int m = 0; m < bd->nb_coeffs_x; m++)
                        psi += bd->psi_coeffs->data[n * bd->nb_coeffs_z + m] * basis_val_r[(j * cctk_lsh[0] + i) * bd->nb_coeffs_x + m] * basis_val_z[k * bd->nb_coeffs_z + n];

                psi4 = SQR(SQR(psi));

                if (r2 < EPS)
                    r2 = EPS;

                gxx[index] = psi4 * (e2q + (1 - e2q) * SQR(yy) / r2);
                gyy[index] = psi4 * (e2q + (1 - e2q) * SQR(xx) / r2);
                gzz[index] = psi4 * e2q;
                gxy[index] = psi4 * (-(1.0 - e2q) * xx * yy / r2);
            }
}