/*
 * Copyright 2014-2015 Anton Khirnov <anton@khirnov.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * @file
 * the actual pseudo-spectral solver code
 */

#include <cblas.h>
#include <lapacke.h>

#include <errno.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include <stdio.h>

#include "brill_data.h"
#include "internal.h"
#include "qfunc.h"

static int brill_construct_matrix_cylindrical(const BDContext *bd, double *mat,
                                  double *rhs)
{
    BDPriv *s = bd->priv;

    double *basis_val, *basis_dval, *basis_d2val;
    int idx_coeff_rho, idx_coeff_z, idx_grid_rho, idx_grid_z;

    double *basis[2][3] = { { NULL } };
    int ret = 0;

    /* precompute the basis values we will need */
    for (int i = 0; i < ARRAY_ELEMS(basis); i++) {
        for (int j = 0; j < ARRAY_ELEMS(basis[i]); j++) {
            basis[i][j] = malloc(sizeof(*basis[i][j]) * s->nb_colloc_points[i] * s->nb_coeffs[i]);
            if (!basis[i][j]) {
                ret = -ENOMEM;
                goto fail;
            }
        }
        for (int j = 0; j < s->nb_colloc_points[i]; j++) {
            double coord = bdi_basis_colloc_point(s->basis[i], j, s->nb_coeffs[i]);
            for (int k = 0; k < s->nb_coeffs[i]; k++) {
                basis[i][0][j * s->nb_coeffs[i] + k] = bdi_basis_eval(s->basis[i], BS_EVAL_TYPE_VALUE, coord, k);
                basis[i][1][j * s->nb_coeffs[i] + k] = bdi_basis_eval(s->basis[i], BS_EVAL_TYPE_DIFF1, coord, k);
                basis[i][2][j * s->nb_coeffs[i] + k] = bdi_basis_eval(s->basis[i], BS_EVAL_TYPE_DIFF2, coord, k);
            }
        }
    }

#define   BASIS_RHO (basis[0][0][idx_grid_rho * s->nb_coeffs[0] + idx_coeff_rho])
#define  DBASIS_RHO (basis[0][1][idx_grid_rho * s->nb_coeffs[0] + idx_coeff_rho])
#define D2BASIS_RHO (basis[0][2][idx_grid_rho * s->nb_coeffs[0] + idx_coeff_rho])
#define   BASIS_Z   (basis[1][0][idx_grid_z   * s->nb_coeffs[1] + idx_coeff_z])
#define  DBASIS_Z   (basis[1][1][idx_grid_z   * s->nb_coeffs[1] + idx_coeff_z])
#define D2BASIS_Z   (basis[1][2][idx_grid_z   * s->nb_coeffs[1] + idx_coeff_z])

    for (idx_grid_z = 0; idx_grid_z < s->nb_colloc_points[1]; idx_grid_z++) {
        double z_val = bdi_basis_colloc_point(s->basis[1], idx_grid_z, s->nb_coeffs[1]);

        for (idx_grid_rho = 0; idx_grid_rho < s->nb_colloc_points[0]; idx_grid_rho++) {
            double x_val = bdi_basis_colloc_point(s->basis[0], idx_grid_rho, s->nb_coeffs[0]);
            double d2q = bdi_qfunc_eval(s->qfunc, x_val, z_val, 2) +
                         bdi_qfunc_eval(s->qfunc, x_val, z_val, 6);
            int idx_grid = idx_grid_z * s->nb_colloc_points[0] + idx_grid_rho;

            for (idx_coeff_z = 0; idx_coeff_z < s->nb_coeffs[1]; idx_coeff_z++)
                for (idx_coeff_rho = 0; idx_coeff_rho < s->nb_coeffs[0]; idx_coeff_rho++) {
                    int idx_coeff = idx_coeff_z * s->nb_coeffs[0] + idx_coeff_rho;

                    double val  = D2BASIS_RHO * BASIS_Z + D2BASIS_Z * BASIS_RHO + BASIS_RHO * BASIS_Z * 0.25 * d2q;
                    if (fabs(x_val) > EPS)
                        val += DBASIS_RHO * BASIS_Z / fabs(x_val);
                    else
                        val += D2BASIS_RHO * BASIS_Z;

                    mat[idx_grid + NB_COLLOC_POINTS(s) * idx_coeff] = val;
                }
            rhs[idx_grid] = -0.25 * d2q;
        }
    }

fail:
    for (int i = 0; i < ARRAY_ELEMS(basis); i++)
        for (int j = 0; j < ARRAY_ELEMS(basis[i]); j++)
            free(basis[i][j]);

    return ret;
}

static int brill_construct_matrix_polar(const BDContext *bd, double *mat,
                                        double *rhs)
{
    BDPriv *s = bd->priv;

    double *basis_val, *basis_dval, *basis_d2val;
    int idx_coeff_0, idx_coeff_1, idx_grid_0, idx_grid_1;

    double *basis[2][3] = { { NULL } };
    int ret = 0;

    /* precompute the basis values we will need */
    for (int i = 0; i < ARRAY_ELEMS(basis); i++) {
        for (int j = 0; j < ARRAY_ELEMS(basis[i]); j++) {
            basis[i][j] = malloc(sizeof(*basis[i][j]) * s->nb_colloc_points[i] * s->nb_coeffs[i]);
            if (!basis[i][j]) {
                ret = -ENOMEM;
                goto fail;
            }
        }
        for (int j = 0; j < s->nb_colloc_points[i]; j++) {
            double coord = bdi_basis_colloc_point(s->basis[i], j, s->nb_coeffs[i]);
            for (int k = 0; k < s->nb_coeffs[i]; k++) {
                basis[i][0][j * s->nb_coeffs[i] + k] = bdi_basis_eval(s->basis[i], BS_EVAL_TYPE_VALUE, coord, k);
                basis[i][1][j * s->nb_coeffs[i] + k] = bdi_basis_eval(s->basis[i], BS_EVAL_TYPE_DIFF1, coord, k);
                basis[i][2][j * s->nb_coeffs[i] + k] = bdi_basis_eval(s->basis[i], BS_EVAL_TYPE_DIFF2, coord, k);
            }
        }
    }

#define   BASIS0 (basis[0][0][idx_grid_0 * s->nb_coeffs[0] + idx_coeff_0])
#define  DBASIS0 (basis[0][1][idx_grid_0 * s->nb_coeffs[0] + idx_coeff_0])
#define D2BASIS0 (basis[0][2][idx_grid_0 * s->nb_coeffs[0] + idx_coeff_0])
#define   BASIS1 (basis[1][0][idx_grid_1 * s->nb_coeffs[1] + idx_coeff_1])
#define  DBASIS1 (basis[1][1][idx_grid_1 * s->nb_coeffs[1] + idx_coeff_1])
#define D2BASIS1 (basis[1][2][idx_grid_1 * s->nb_coeffs[1] + idx_coeff_1])

    for (idx_grid_1 = 0; idx_grid_1 < s->nb_colloc_points[1]; idx_grid_1++) {
        double phi = bdi_basis_colloc_point(s->basis[1], idx_grid_1, s->nb_coeffs[1]);

        for (idx_grid_0 = 0; idx_grid_0 < s->nb_colloc_points[0]; idx_grid_0++) {
            double r = bdi_basis_colloc_point(s->basis[0], idx_grid_0, s->nb_coeffs[0]);

            double x = r * cos(phi);
            double z = r * sin(phi);

            double d2q = bdi_qfunc_eval(s->qfunc, x, z, 2) +
                         bdi_qfunc_eval(s->qfunc, x, z, 6);
            int idx_grid  = idx_grid_1 * s->nb_colloc_points[0] + idx_grid_0;

            for (idx_coeff_1 = 0; idx_coeff_1 < s->nb_coeffs[1]; idx_coeff_1++)
                for (idx_coeff_0 = 0; idx_coeff_0 < s->nb_coeffs[0]; idx_coeff_0++) {
                    int idx_coeff = idx_coeff_1 * s->nb_coeffs[0] + idx_coeff_0;

                    double basis_val_20 =  ((r > EPS) ? (SQR(x / r) * D2BASIS0 * BASIS1 + SQR(z / SQR(r)) * BASIS0 * D2BASIS1
                                                         + (1.0 - SQR(x / r)) / r * DBASIS0 * BASIS1
                                                         + 2 * x * z / SQR(SQR(r)) * BASIS0 * DBASIS1
                                                         - 2 * z * x / (r * SQR(r)) * DBASIS0 * DBASIS1) : 0.0);
                    double basis_val_02 =  ((r > EPS) ? (SQR(z / r) * D2BASIS0 * BASIS1 + SQR(x / SQR(r)) * BASIS0 * D2BASIS1
                                                           + (1.0 - SQR(z / r)) / r * DBASIS0 * BASIS1
                                                           - 2 * x * z / SQR(SQR(r)) * BASIS0 * DBASIS1
                                                           + 2 * z * x / (r * SQR(r)) * DBASIS0 * DBASIS1) : 0.0);
                    double basis_val_10 = ((r > EPS) ? (DBASIS0 * BASIS1 * x / r - BASIS0 * DBASIS1 * z / SQR(r)) : 0.0);

                    double val  = basis_val_20 + basis_val_02 + BASIS0 * BASIS1 * 0.25 * d2q;
                    if (fabs(x) > EPS)
                        val += basis_val_10 / fabs(x);
                    else
                        val += basis_val_20;

                    mat[idx_grid + NB_COLLOC_POINTS(s) * idx_coeff] = val;
                }
            rhs[idx_grid] = -0.25 * d2q;
        }
    }

fail:
    for (int i = 0; i < ARRAY_ELEMS(basis); i++)
        for (int j = 0; j < ARRAY_ELEMS(basis[i]); j++)
            free(basis[i][j]);

    return ret;
}

static int brill_solve_linear(const BDContext *bd, double *mat, double **px, double **prhs)
{
    const BDPriv *s = bd->priv;
    const int stride = NB_COEFFS(s);
    int *ipiv;
    double *mat_f;
    double cond, ferr, berr, rpivot;

    double   *x = *px;
    double *rhs = *prhs;
    char  equed = 'N';
    int     ret = 0;

    ipiv  = malloc(stride * sizeof(*ipiv));
    mat_f = malloc(SQR(stride) * sizeof(*mat_f));
    if (!ipiv || !mat_f) {
        ret = -ENOMEM;
        goto fail;
    }

    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', stride, 1,
                   mat, stride, mat_f, stride, ipiv, &equed,
                   NULL, NULL, rhs, stride, x, stride,
                   &cond, &ferr, &berr, &rpivot);

    bdi_log(bd, 0, "LU factorization solution to a %zdx%zd matrix: "
            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
            stride, stride, cond, ferr, berr);
fail:
    free(mat_f);
    free(ipiv);

    return ret;
}

static int brill_solve_svd(const BDContext *bd, double *mat,
                           double **px, double **prhs)
{
    const BDPriv *s = bd->priv;
    double *sv;
    int rank;

    double   *x = *px;
    double *rhs = *prhs;

    sv = malloc(sizeof(*sv) * NB_COEFFS(s));
    if (!sv)
        return -ENOMEM;

    LAPACKE_dgelsd(LAPACK_COL_MAJOR, NB_COLLOC_POINTS(s), NB_COEFFS(s), 1,
                   mat, NB_COLLOC_POINTS(s), rhs,
                   MAX(NB_COEFFS(s), NB_COLLOC_POINTS(s)),
                   sv, -1, &rank);

    bdi_log(bd, 0, "Least squares SVD solution to a %zdx%zd matrix: "
            "rank %d, condition number %16.16g\n", NB_COLLOC_POINTS(s), NB_COEFFS(s),
            rank, sv[NB_COEFFS(s) - 1] / sv[0]);

    free(sv);

    *px         = rhs;
    *prhs       = x;

    return 0;
}

static int brill_export_coeffs(BDPriv *s, const double *coeffs)
{
    int alignment = REQ_ALIGNMENT(*coeffs);
    int nb_coeffs_aligned[2] = { ALIGN(s->nb_coeffs[0], alignment), ALIGN(s->nb_coeffs[1], alignment) };

    int ret, i;

    ret = posix_memalign((void**)&s->coeffs, REQ_ALIGNMENT(char), nb_coeffs_aligned[0] * nb_coeffs_aligned[1] * sizeof(*s->coeffs));
    if (ret)
        return -ret;

    memset(s->coeffs, 0, nb_coeffs_aligned[0] * nb_coeffs_aligned[1] * sizeof(*s->coeffs));
    for (i = 0; i < s->nb_coeffs[1]; i++)
        memcpy(s->coeffs + i * nb_coeffs_aligned[0], coeffs + i * s->nb_coeffs[0], sizeof(*coeffs) * s->nb_coeffs[0]);

    s->coeffs_stride = nb_coeffs_aligned[0];

    return 0;
}

/*
 * Solve the equation
 * Δψ + ¼ ψ (∂²q/∂r² + ∂²q/∂z²) = 0
 * for the coefficients of spectral approximation of ψ:
 * ψ(r, z) = 1 + ΣaᵢⱼTᵢ(r)Tⱼ(z)
 * where i =  { 0, ... , bd->nb_coeffs_rho };
 *       j =  { 0, ... , bd->nb_coeffs_z };
 * q(r, z) and Tᵢ(x) are defined by bd->q_func, bd->laplace_q_func and
 * bd->basis.
 *
 * The cofficients are exported in bd->priv->coeffs
 */
int bdi_solve(BDContext *bd)
{
    BDPriv *s = bd->priv;

    const int vecsize = MAX(NB_COEFFS(s), NB_COLLOC_POINTS(s));

    double *mat = NULL;
    double *rhs = NULL, *coeffs = NULL;

    int ret = 0;

    /* allocate and fill the pseudospectral matrix */
    mat    = malloc(sizeof(*mat) * NB_COEFFS(s) * NB_COLLOC_POINTS(s));
    coeffs = malloc(sizeof(*coeffs) * vecsize);
    rhs    = malloc(sizeof(*rhs)    * vecsize);
    if (!mat || !coeffs || !rhs) {
        ret = -ENOMEM;
        goto fail;
    }

    /* fill the matrix */
    switch (s->coord_system) {
    case COORD_SYSTEM_CYLINDRICAL: ret = brill_construct_matrix_cylindrical(bd, mat, rhs);
                                   break;
    case COORD_SYSTEM_RADIAL:      ret = brill_construct_matrix_polar(bd, mat, rhs);
                                   break;
    }
    if (ret < 0)
        goto fail;

    /* solve for the coeffs */
    if (s->nb_colloc_points > s->nb_coeffs)
        ret = brill_solve_svd(bd, mat, &coeffs, &rhs);
    else
        ret = brill_solve_linear(bd, mat, &coeffs, &rhs);

    /* export the result */
    ret = brill_export_coeffs(s, coeffs);
    if (ret < 0)
        goto fail;

fail:
    free(coeffs);
    free(rhs);
    free(mat);

    return ret;
}