/*
 * Pseudospectral 2nd order 2D linear PDE solver
 * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cblas.h>
#include <lapacke.h>

#include "bicgstab.h"
#include "pssolve.h"

#define NB_COEFFS(priv)        (priv->nb_coeffs[0]        * priv->nb_coeffs[1])
#define NB_COLLOC_POINTS(priv) (priv->nb_colloc_points[0] * priv->nb_colloc_points[1])

struct PSSolvePriv {
    BiCGStabContext *bicgstab;
    int steps_since_inverse;

    int nb_coeffs[2];
    int nb_colloc_points[2];
    int colloc_grid_order[2];

    double *basis_val[PSSOLVE_DIFF_ORDER_NB];

    int *ipiv;
    double *mat;
};

static int construct_matrix(PSSolveContext *ctx,
                            const double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB])
{
    double *mat = ctx->priv->mat;
    int idx_coeff, idx_grid;

//#pragma omp parallel for
    for (idx_coeff = 0; idx_coeff < NB_COEFFS(ctx->priv); idx_coeff++)
        for (idx_grid = 0; idx_grid < NB_COLLOC_POINTS(ctx->priv); idx_grid++) {
            const int idx      = idx_grid + NB_COLLOC_POINTS(ctx->priv) * idx_coeff;
            double val = 0.0;

            for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++)
                val += eq_coeffs[i][idx_grid] * ctx->priv->basis_val[i][idx];

            mat[idx] = val;
        }

    return 0;
}

static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
{
    char  equed = 'N';
    double cond, ferr, berr, rpivot;

    double *mat_f, *x;
    int ret = 0;
#if 0
    LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
                  mat, N, ipiv, rhs, N);
    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
#else
    mat_f = malloc(SQR(N) * sizeof(*mat_f));
    x     = malloc(N * sizeof(*x));

    //{
    //    int i, j;
    //    for (i = 0; i < N; i++) {
    //        for (j = 0; j < N; j++)
    //            fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
    //        fprintf(stderr, "\n");
    //    }
    //}
    //{
    //    double *mat_copy = malloc(SQR(N) * sizeof(double));
    //    double *svd = malloc(N * sizeof(double));
    //    double *rhs_copy = malloc(N * sizeof(double));
    //    int rank;

    //    memcpy(mat_copy, mat, SQR(N) * sizeof(double));
    //    memcpy(rhs_copy, rhs, N * sizeof(double));

    //    LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
    //                   svd, 1e-13, &rank);

    //    free(mat_copy);
    //    for (int i = 0; i < N; i++) {
    //        if (i > 5 && i < N - 5)
    //            continue;

    //        fprintf(stderr, "%g\t", svd[i]);
    //    }
    //    fprintf(stderr, "\n rank %d\n", rank);
    //    free(svd);
    //    free(rhs_copy);

    //    if (rank < N)
    //        ret = 1;
    //}

    //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
    //              mat, N, ipiv, rhs, N);
    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
                   mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
                   rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
    memcpy(rhs, x, N * sizeof(double));
    memcpy(mat, mat_f, SQR(N) * sizeof(double));

    fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: "
            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
            N, N, cond, ferr, berr);

    free(mat_f);
    free(x);
#endif

    return ret;
}

int qms_pssolve_solve(PSSolveContext *ctx,
                      const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB],
                      const double *rhs, double *coeffs)
{
    PSSolvePriv *s = ctx->priv;
    const int N = NB_COEFFS(s);
    double rhs_max;
    int64_t start;

    int ret = 0;

    /* fill the matrix */
    CCTK_TimerStart("QuasiMaximalSlicing_construct_matrix");
    start = gettime();
    ret = construct_matrix(ctx, eq_coeffs);
    ctx->construct_matrix_time += gettime() - start;
    ctx->construct_matrix_count++;
    CCTK_TimerStop("QuasiMaximalSlicing_construct_matrix");
    if (ret < 0)
        return ret;

#if 0
    if (rhs_max < EPS) {
        fprintf(stderr, "zero rhs\n");
        memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
        if (ms->cl_queue) {
            clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
                                 ms->coeffs, 0, NULL, NULL);
        }
        return 0;
    }
#endif

    /* solve for the coeffs */
    if (s->steps_since_inverse < 1024) {
        int64_t start;

        start = gettime();

        CCTK_TimerStart("QuasiMaximalSlicing_solve_BiCGSTAB");
        ret = qms_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs);
        CCTK_TimerStop("QuasiMaximalSlicing_solve_BiCGSTAB");

        if (ret >= 0) {
            ctx->cg_time_total += gettime() - start;
            ctx->cg_solve_count++;
            ctx->cg_iter_count += ret + 1;
            s->steps_since_inverse++;

        }
    } else
        ret = -1;

    if (ret < 0) {
        int64_t start;

        CCTK_TimerStart("QuasiMaximalSlicing_solve_LU");
        start = gettime();

        memcpy(coeffs, rhs, N * sizeof(*rhs));

        ret = lu_invert(N, s->mat, coeffs, s->ipiv);
        ctx->lu_solves_time += gettime() - start;
        ctx->lu_solves_count++;
        CCTK_TimerStop("QuasiMaximalSlicing_solve_LU");

        ret = qms_bicgstab_init(s->bicgstab, s->mat, coeffs);

        s->steps_since_inverse = 0;
    }

    return ret;
}

int qms_pssolve_context_init(PSSolveContext *ctx)
{
    PSSolvePriv *s = ctx->priv;
    double *basis_val[2][3] = { { NULL } };

    int ret = 0;

    if (ctx->solve_order[0] <= 0 || ctx->solve_order[1] <= 0)
        return -EINVAL;
    s->nb_coeffs[0]         = ctx->solve_order[0];
    s->nb_coeffs[1]         = ctx->solve_order[1];
    s->nb_colloc_points[0]  = ctx->solve_order[0];
    s->nb_colloc_points[1]  = ctx->solve_order[1];
    s->colloc_grid_order[0] = ctx->solve_order[0];
    s->colloc_grid_order[1] = ctx->solve_order[1];

    s->steps_since_inverse = INT_MAX;

    /* init the BiCGStab solver */
    ret = qms_bicgstab_context_alloc(&s->bicgstab, NB_COEFFS(s), ctx->ocl_ctx,
                                     ctx->ocl_queue);
    if (ret < 0)
        return ret;

    /* compute the collocation grid */
    posix_memalign((void**)&ctx->colloc_grid[0], 32, s->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[0]));
    posix_memalign((void**)&ctx->colloc_grid[1], 32, s->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[1]));
    if (!ctx->colloc_grid[0] || !ctx->colloc_grid[1])
        return -ENOMEM;

    for (int i = 0; i < s->nb_colloc_points[0]; i++)
        ctx->colloc_grid[0][i] = ctx->basis[0]->colloc_point(s->colloc_grid_order[0], i);
    for (int i = 0; i < s->nb_colloc_points[1]; i++)
        ctx->colloc_grid[1][i] = ctx->basis[1]->colloc_point(s->colloc_grid_order[1], i);

    /* precompute the basis values we will need */
    for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) {
        for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) {
            int ret = posix_memalign((void**)&basis_val[i][j], 32,
                                     sizeof(*basis_val[i][j]) * s->nb_coeffs[i] * s->nb_colloc_points[i]);
            if (ret) {
                ret = -ENOMEM;
                goto fail;
            }
        }

        for (int j = 0; j < s->nb_colloc_points[i]; j++) {
            double coord = ctx->colloc_grid[i][j];
            for (int k = 0; k < s->nb_coeffs[i]; k++) {
                basis_val[i][0][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval      (coord, k);
                basis_val[i][1][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff1(coord, k);
                basis_val[i][2][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff2(coord, k);
            }
        }
    }

    for (int i = 0; i < ARRAY_ELEMS(s->basis_val); i++) {
        ret = posix_memalign((void**)&s->basis_val[i], 32, NB_COLLOC_POINTS(s) * NB_COEFFS(s) * sizeof(*s->basis_val[i]));
        if (ret) {
            ret = -ENOMEM;
            goto fail;
        }
    }

    for (int i = 0; i < s->nb_colloc_points[1]; i++) {
        const double *basis_z   = basis_val[1][0] + i * s->nb_coeffs[1];
        const double *dbasis_z  = basis_val[1][1] + i * s->nb_coeffs[1];
        const double *d2basis_z = basis_val[1][2] + i * s->nb_coeffs[1];

        for (int j = 0; j < s->nb_colloc_points[0]; j++) {
            const double *basis_x   = basis_val[0][0] + j * s->nb_coeffs[0];
            const double *dbasis_x  = basis_val[0][1] + j * s->nb_coeffs[0];
            const double *d2basis_x = basis_val[0][2] + j * s->nb_coeffs[0];
            const int idx_grid = i * s->nb_colloc_points[0] + j;

#if QMS_POLAR
            double r     = ctx->colloc_grid[0][j];
            double theta = ctx->colloc_grid[1][i];

            double x = r * cos(theta);
            double z = r * sin(theta);
#else
            double x = ctx->colloc_grid[0][j];
            double z = ctx->colloc_grid[1][i];
#endif

            for (int k = 0; k < s->nb_coeffs[1]; k++)
                for (int l = 0; l < s->nb_coeffs[0]; l++) {
                    const int idx_coeff = k * s->nb_coeffs[0] + l;
                    const int idx = idx_grid + NB_COLLOC_POINTS(s) * idx_coeff;
                    s->basis_val[PSSOLVE_DIFF_ORDER_00][idx] =   basis_x[l] *   basis_z[k];
#if QMS_POLAR
                    s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] =  ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0);
                    s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] =  ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0);
                    s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] =  ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k]
                                                           + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k]
                                                           + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
                                                           - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
                    s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] =  ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k]
                                                           + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k]
                                                           - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
                                                           + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
                    s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] =  ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k]
                                                           - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k]
                                                           - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k]
                                                           + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
#else
                    s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] =  dbasis_x[l] *   basis_z[k];
                    s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] =   basis_x[l] *  dbasis_z[k];
                    s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = d2basis_x[l] *   basis_z[k];
                    s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] =   basis_x[l] * d2basis_z[k];
                    s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] =  dbasis_x[l] *  dbasis_z[k];
#endif
                }
        }
    }

    ret  = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * NB_COEFFS(s));
    ret |= posix_memalign((void**)&s->mat,  32, sizeof(*s->mat)  * NB_COEFFS(s) * NB_COLLOC_POINTS(s));
    if (ret) {
        ret = -ENOMEM;
        goto fail;
    }

fail:
    for (int i = 0; i < ARRAY_ELEMS(basis_val); i++)
        for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++)
            free(basis_val[i][j]);

    return ret;
}

int qms_pssolve_context_alloc(PSSolveContext **pctx)
{
    PSSolveContext *ctx = calloc(1, sizeof(*ctx));

    if (!ctx)
        return -ENOMEM;

    ctx->priv = calloc(1, sizeof(*ctx->priv));
    if (!ctx->priv)
        goto fail;

    *pctx = ctx;
    return 0;
fail:
    qms_pssolve_context_free(&ctx);
    return -ENOMEM;
}

void qms_pssolve_context_free(PSSolveContext **pctx)
{
    PSSolveContext *ctx = *pctx;

    if (!ctx)
        return;

    if (ctx->priv) {
        for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++)
            free(ctx->priv->basis_val[i]);

        free(ctx->priv->ipiv);
        free(ctx->priv->mat);

        qms_bicgstab_context_free(&ctx->priv->bicgstab);
    }

    free(ctx->priv);

    free(ctx->colloc_grid[0]);
    free(ctx->colloc_grid[1]);

    free(ctx);
    *pctx = NULL;
}