/*
 * Copyright 2014-2015 Anton Khirnov <anton@khirnov.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * @file
 * the actual pseudo-spectral solver code
 */

#include <cblas.h>
#include <lapacke.h>

#include <errno.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include <stdio.h>

#include "brill_data.h"
#include "internal.h"

static int brill_construct_matrix(const BDContext *bd, double *mat,
                                  double *rhs)
{
    BDPriv *s = bd->priv;
    const double sfr = bd->basis_scale_factor_rho, sfz = bd->basis_scale_factor_z;

    double *basis_val, *basis_dval, *basis_d2val;
    int idx_coeff_rho, idx_coeff_z, idx_grid_rho, idx_grid_z;

    double *basis_r[3];
    double *basis_z[3];

    /* precompute the basis values we will need */
    for (int i = 0; i < ARRAY_ELEMS(basis_r); i++)
        basis_r[i] = malloc(sizeof(*basis_r[i]) * s->nb_colloc_points_rho * bd->nb_coeffs_rho);
    for (int i = 0; i < s->nb_colloc_points_rho; i++) {
        double coord = s->basis->colloc_point(s->colloc_grid_order_rho, i, sfr);
        for (int j = 0; j < bd->nb_coeffs_rho; j++) {
            basis_r[0][i * bd->nb_coeffs_rho + j] = s->basis->eval(coord, j, sfr);
            basis_r[1][i * bd->nb_coeffs_rho + j] = s->basis->eval_diff1(coord, j, sfr);
            basis_r[2][i * bd->nb_coeffs_rho + j] = s->basis->eval_diff2(coord, j, sfr);
        }
    }

    for (int i = 0; i < ARRAY_ELEMS(basis_z); i++)
        basis_z[i] = malloc(sizeof(*basis_z[i]) * s->nb_colloc_points_z * bd->nb_coeffs_z);
    for (int i = 0; i < s->nb_colloc_points_z; i++) {
        double coord = s->basis->colloc_point(s->colloc_grid_order_z, i, sfr);
        for (int j = 0; j < bd->nb_coeffs_z; j++) {
            basis_z[0][i * bd->nb_coeffs_z + j] = s->basis->eval(coord, j, sfr);
            basis_z[1][i * bd->nb_coeffs_z + j] = s->basis->eval_diff1(coord, j, sfr);
            basis_z[2][i * bd->nb_coeffs_z + j] = s->basis->eval_diff2(coord, j, sfr);
        }
    }

#define   BASIS_RHO (basis_r[0][idx_grid_rho * bd->nb_coeffs_rho + idx_coeff_rho])
#define  DBASIS_RHO (basis_r[1][idx_grid_rho * bd->nb_coeffs_rho + idx_coeff_rho])
#define D2BASIS_RHO (basis_r[2][idx_grid_rho * bd->nb_coeffs_rho + idx_coeff_rho])
#define   BASIS_Z   (basis_z[0][idx_grid_z * bd->nb_coeffs_z + idx_coeff_z])
#define  DBASIS_Z   (basis_z[1][idx_grid_z * bd->nb_coeffs_z + idx_coeff_z])
#define D2BASIS_Z   (basis_z[2][idx_grid_z * bd->nb_coeffs_z + idx_coeff_z])

    for (idx_grid_z = 0; idx_grid_z < s->nb_colloc_points_z; idx_grid_z++) {
        double z_physical = s->basis->colloc_point(s->colloc_grid_order_z, idx_grid_z, sfz);

        for (idx_grid_rho = 0; idx_grid_rho < s->nb_colloc_points_rho; idx_grid_rho++) {
            double x_physical = s->basis->colloc_point(s->colloc_grid_order_rho, idx_grid_rho, sfr);
            double d2q        = s->q_func->d2q_rho(bd, x_physical, z_physical) + s->q_func->d2q_z(bd, x_physical, z_physical);
            int idx_grid         = idx_grid_z * s->nb_colloc_points_rho + idx_grid_rho;

            for (idx_coeff_z = 0; idx_coeff_z < bd->nb_coeffs_z; idx_coeff_z++)
                for (idx_coeff_rho = 0; idx_coeff_rho < bd->nb_coeffs_rho; idx_coeff_rho++) {
                    int idx_coeff = idx_coeff_z * bd->nb_coeffs_rho + idx_coeff_rho;

                    double val  = D2BASIS_RHO * BASIS_Z + D2BASIS_Z * BASIS_RHO + BASIS_RHO * BASIS_Z * 0.25 * d2q;
                    if (fabs(x_physical) > EPS)
                        val += DBASIS_RHO * BASIS_Z / fabs(x_physical);
                    else
                        val += D2BASIS_RHO * BASIS_Z;

                    mat[idx_grid + s->nb_colloc_points * idx_coeff] = val;
                }
            rhs[idx_grid] = -0.25 * d2q;
        }
    }

    for (int i = 0; i < ARRAY_ELEMS(basis_r); i++)
        free(basis_r[i]);
    for (int i = 0; i < ARRAY_ELEMS(basis_z); i++)
        free(basis_z[i]);

    return 0;
}

static int brill_solve_linear(const BDContext *bd, double *mat, double **px, double **prhs)
{
    const BDPriv *s = bd->priv;
    const int stride = s->nb_coeffs;
    int *ipiv;
    double *mat_f;
    double cond, ferr, berr, rpivot;

    double   *x = *px;
    double *rhs = *prhs;
    char  equed = 'N';
    int     ret = 0;

    ipiv  = malloc(stride * sizeof(*ipiv));
    mat_f = malloc(SQR(stride) * sizeof(*mat_f));
    if (!ipiv || !mat_f) {
        ret = -ENOMEM;
        goto fail;
    }

    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', stride, 1,
                   mat, stride, mat_f, stride, ipiv, &equed,
                   NULL, NULL, rhs, stride, x, stride,
                   &cond, &ferr, &berr, &rpivot);

    bdi_log(bd, 0, "LU factorization solution to a %zdx%zd matrix: "
            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
            stride, stride, cond, ferr, berr);
fail:
    free(mat_f);
    free(ipiv);

    return ret;
}

static int brill_solve_svd(const BDContext *bd, double *mat,
                           double **px, double **prhs)
{
    const BDPriv *s = bd->priv;
    double *sv;
    int rank;

    double   *x = *px;
    double *rhs = *prhs;

    sv = malloc(sizeof(*sv) * s->nb_coeffs);
    if (!sv)
        return -ENOMEM;

    LAPACKE_dgelsd(LAPACK_COL_MAJOR, s->nb_colloc_points, s->nb_coeffs, 1,
                   mat, s->nb_colloc_points, rhs,
                   MAX(s->nb_coeffs, s->nb_colloc_points),
                   sv, -1, &rank);

    bdi_log(bd, 0, "Least squares SVD solution to a %zdx%zd matrix: "
            "rank %d, condition number %16.16g\n", s->nb_colloc_points, s->nb_coeffs,
            rank, sv[s->nb_coeffs - 1] / sv[0]);

    free(sv);

    *px         = rhs;
    *prhs       = x;

    return 0;
}

/*
 * Solve the equation
 * Δψ + ¼ ψ (∂²q/∂r² + ∂²q/∂z²) = 0
 * for the coefficients of spectral approximation of ψ:
 * ψ(r, z) = 1 + ΣaᵢⱼTᵢ(r)Tⱼ(z)
 * where i =  { 0, ... , bd->nb_coeffs_rho };
 *       j =  { 0, ... , bd->nb_coeffs_z };
 * q(r, z) and Tᵢ(x) are defined by bd->q_func, bd->laplace_q_func and
 * bd->basis.
 *
 * The cofficients are exported in bd->priv->coeffs
 */
int bdi_solve(BDContext *bd)
{
    BDPriv *s = bd->priv;

    const int vecsize = MAX(s->nb_coeffs, s->nb_colloc_points);

    double *mat = NULL;
    double *rhs = NULL, *coeffs = NULL;

    int ret = 0;

    /* allocate and fill the pseudospectral matrix */
    mat    = malloc(sizeof(*mat) * s->nb_coeffs * s->nb_colloc_points);
    coeffs = malloc(sizeof(*coeffs) * vecsize);
    rhs    = malloc(sizeof(*rhs)    * vecsize);
    if (!mat || !coeffs || !rhs) {
        ret = -ENOMEM;
        goto fail;
    }

    /* fill the matrix */
    ret = brill_construct_matrix(bd, mat, rhs);
    if (ret < 0)
        goto fail;

    /* solve for the coeffs */
    if (s->nb_colloc_points > s->nb_coeffs)
        ret = brill_solve_svd(bd, mat, &coeffs, &rhs);
    else
        ret = brill_solve_linear(bd, mat, &coeffs, &rhs);

    /* export the result */
    s->coeffs = coeffs;

fail:
    if (ret < 0)
        free(coeffs);

    free(rhs);
    free(mat);

    return ret;
}