/*
 * BiCGStab iterative linear system solver
 * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#if HAVE_OPENCL
#include <cl.h>
#include <clBLAS.h>
#endif

#include <cblas.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "bicgstab.h"
#include "common.h"

#define BICGSTAB_MAXITER 16
#define BICGSTAB_TOL (1e-15)

struct BiCGStabContext {
    int N;

    double *x;
    double *p, *v, *y, *z, *t;
    double *res, *res0;
    double *k;

#if HAVE_OPENCL
    cl_context       ocl_ctx;
    cl_command_queue ocl_queue;

    cl_mem cl_x;
    cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
    cl_mem cl_res, cl_res0;
    cl_mem cl_k, cl_mat;
    cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
    cl_mem cl_tmp, cl_tmp1;
#endif
};

#if HAVE_OPENCL
static int solve_cl(BiCGStabContext *ctx,
                    const double *mat, const double *rhs, double *x)
{
    cl_command_queue ocl_q = ctx->ocl_queue;
    const int            N = ctx->N;
    const double rhs_norm = cblas_dnrm2(N, rhs, 1);

    double rho, rho_prev = 1.0;
    double omega[2] = { 1.0 };
    double alpha = 1.0;

    double err;
    int i;

    cl_event events[8];

    // upload the matrix and RHS
    clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double),     rhs, 0, NULL, &events[0]);
    clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);

    // initialize the residual
    clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
                ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
                1, &ocl_q, 2, events, &events[2]);
    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
                        1, &events[2], &events[3]);
    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
                        1, &events[2], &events[4]);

    clWaitForEvents(5, events);
    // BARRIER

    for (i = 0; i < MAXITER; i++) {
        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
                   ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]);
        clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
                            1, &events[0], NULL);
        // BARRIER

        if (i) {
            double beta = (rho / rho_prev) * (alpha / omega[0]);

            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
                        1, &ocl_q, 0, NULL, &events[0]);
            clblasDscal(N, beta, ctx->cl_p, 0, 1,
                        1, &ocl_q, 1, &events[0], &events[1]);
            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
                        1, &ocl_q, 1, &events[1], &events[0]);
            clWaitForEvents(1, &events[0]);
            // BARRIER
        }

        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
                    1, &ocl_q, 0, NULL, &events[0]);

        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
                    1, &ocl_q, 1, &events[0], &events[1]);

        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
                   ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]);
        clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
                            1, &events[0], NULL);
        // BARRIER

        alpha = rho / alpha;

        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
                    1, &ocl_q, 0, NULL, &events[0]);

        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
                    1, &ocl_q, 1, &events[0], &events[1]);
        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
                    1, &ocl_q, 1, &events[1], &events[0]);

        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
                   ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]);
        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
                   ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]);

        clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
                            2, &events[1], NULL);
        // BARRIER

        omega[0] /= omega[1];

        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1,
                    1, &ocl_q, 0, NULL, &events[0]);
        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1,
                    1, &ocl_q, 1, &events[0], &events[1]);

        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
                    1, &ocl_q, 0, NULL, &events[0]);
        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
                    1, &ocl_q, 1, &events[0], &events[2]);
        clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
                            1, &events[2], NULL);
        clWaitForEvents(1, &events[1]);
        // BARRIER

        if (err < BICGSTAB_TOL)
            break;

        rho_prev = rho;
    }
    if (i == BICGSTAB_MAXITER)
        return -1;

    clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N,
                        x, 0, NULL, NULL);
    return i;
}
#endif

// based on the wikipedia article
// and http://www.netlib.org/templates/matlab/bicgstab.m
static int solve_sw(BiCGStabContext *ctx,
                    const double *mat, const double *rhs, double *x)
{
    const int N = ctx->N;
    const double rhs_norm = cblas_dnrm2(N, rhs, 1);

    double rho, rho_prev = 1.0;
    double omega = 1.0;
    double alpha = 1.0;

    double err;
    int i;

    double *k = ctx->k;
    double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
    double *res = ctx->res, *res0 = ctx->res0;

    // initialize the residual
    memcpy(res, rhs, N * sizeof(*res));
    cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
                mat, N, ctx->x, 1, 1.0, res, 1);

    memcpy(res0, res, N * sizeof(*res0));
    memcpy(p,    res, N * sizeof(*p));

    for (i = 0; i < BICGSTAB_MAXITER; i++) {
        rho = cblas_ddot(N, res, 1, res0, 1);

        if (i) {
            double beta = (rho / rho_prev) * (alpha / omega);

            cblas_daxpy(N, -omega, v, 1, p, 1);
            cblas_dscal(N, beta, p, 1);
            cblas_daxpy(N, 1, res, 1, p, 1);
        }

        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    k, N, p, 1, 0.0, y, 1);

        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    mat, N, y, 1, 0.0, v, 1);

        alpha = rho / cblas_ddot(N, res0, 1, v, 1);

        cblas_daxpy(N, -alpha, v, 1, res, 1);

        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    k, N, res, 1, 0.0, z, 1);
        cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
                    mat, N, z, 1, 0.0, t, 1);

        omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);

        cblas_daxpy(N, alpha, y, 1, ctx->x, 1);
        cblas_daxpy(N, omega, z, 1, ctx->x, 1);

        cblas_daxpy(N, -omega, t, 1, res, 1);

        err = cblas_dnrm2(N, res, 1) / rhs_norm;
        if (err < BICGSTAB_TOL)
            break;

        rho_prev = rho;
    }
    if (i == BICGSTAB_MAXITER)
        return -1;

    memcpy(x, ctx->x, sizeof(*x) * ctx->N);

    return i;
}

int tdi_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x)
{
    int ret;

#if HAVE_OPENCL
    if (ctx->ocl_ctx)
        ret = solve_cl(ctx, mat, rhs, x);
    else
#endif
        ret = solve_sw(ctx, mat, rhs, x);
    if (ret < 0)
        return ret;

#if MD_VERIFY
    {
        int i;
        double *y;

        y = malloc(sizeof(*y) * ctx->N);
        memcpy(y, rhs, sizeof(*y) * ctx->N);
        cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0,
                    mat, ctx->N, x, 1, 1.0, y, 1);
        i = cblas_idamax(ctx->N, y, 1);
        if (fabs(y[i]) > 1e-11)
            abort();
    }
#endif

    return ret;
}

int tdi_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0)
{
#if HAVE_OPENCL
    if (ctx->ocl_ctx) {
        cl_event events[2];
        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double),
                             k, 0, NULL, &events[0]);
        clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double),
                             x0, 0, NULL, &events[1]);
        clWaitForEvents(2, events);
    } else
#endif
    {
        memcpy(ctx->x, x0, ctx->N * sizeof(*x0));
        memcpy(ctx->k, k,  ctx->N * ctx->N * sizeof(*k));
    }

    return 0;
}

int tdi_bicgstab_context_alloc(BiCGStabContext **pctx, int N,
                               cl_context ocl_ctx, cl_command_queue ocl_q)
{
    BiCGStabContext *ctx;
    int ret = 0;

    ctx = calloc(1, sizeof(*ctx));
    if (!ctx)
        return -ENOMEM;

    ctx->N = N;

#if HAVE_OPENCL
    if (ocl_ctx) {
        ctx->ocl_ctx   = ocl_ctx;
        ctx->ocl_queue = ocl_q;

#define ALLOC(dst, size)                                        \
do {                                                            \
    ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret);    \
    if (ret != CL_SUCCESS)                                      \
        goto fail;                                              \
} while (0)

        ALLOC(cl_x,    N * sizeof(double));
        ALLOC(cl_p,    N * sizeof(double));
        ALLOC(cl_v,    N * sizeof(double));
        ALLOC(cl_y,    N * sizeof(double));
        ALLOC(cl_z,    N * sizeof(double));
        ALLOC(cl_t,    N * sizeof(double));
        ALLOC(cl_res,  N * sizeof(double));
        ALLOC(cl_res0, N * sizeof(double));
        ALLOC(cl_tmp,  N * sizeof(double));
        ALLOC(cl_tmp1, N * 2 * sizeof(double));

        ALLOC(cl_k,   N * N * sizeof(double));
        ALLOC(cl_mat, N * N * sizeof(double));

        ALLOC(cl_rho,    sizeof(double));
        ALLOC(cl_alpha,  sizeof(double));
        ALLOC(cl_beta,   sizeof(double));
        ALLOC(cl_omega,  2 * sizeof(double));
        ALLOC(cl_omega1, sizeof(double));
    } else
#endif
    {
        ret |= posix_memalign((void**)&ctx->x,    32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->p,    32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->v,    32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->y,    32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->z,    32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->t,    32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->res,  32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N);
        ret |= posix_memalign((void**)&ctx->k,    32, sizeof(double) * N * N);
    }

fail:
    if (ret) {
        tdi_bicgstab_context_free(&ctx);
        return -ENOMEM;
    }

    *pctx = ctx;
    return 0;
}

void tdi_bicgstab_context_free(BiCGStabContext **pctx)
{
    BiCGStabContext *ctx = *pctx;

    if (!ctx)
        return;

    free(ctx->x);
    free(ctx->p);
    free(ctx->v);
    free(ctx->y);
    free(ctx->z);
    free(ctx->t);
    free(ctx->res);
    free(ctx->res0);
    free(ctx->k);

#if HAVE_OPENCL
    if (ctx->ocl_ctx) {
        clReleaseMemObject(ctx->cl_x);
        clReleaseMemObject(ctx->cl_p);
        clReleaseMemObject(ctx->cl_v);
        clReleaseMemObject(ctx->cl_y);
        clReleaseMemObject(ctx->cl_z);
        clReleaseMemObject(ctx->cl_t);
        clReleaseMemObject(ctx->cl_res);
        clReleaseMemObject(ctx->cl_res0);
        clReleaseMemObject(ctx->cl_tmp);
        clReleaseMemObject(ctx->cl_tmp1);

        clReleaseMemObject(ctx->cl_k);
        clReleaseMemObject(ctx->cl_mat);

        clReleaseMemObject(ctx->cl_rho);
        clReleaseMemObject(ctx->cl_alpha);
        clReleaseMemObject(ctx->cl_beta);
        clReleaseMemObject(ctx->cl_omega);
        clReleaseMemObject(ctx->cl_omega1);
    }
#endif

    free(ctx);
    *pctx = NULL;
}