/*
 * Copyright 2017 Anton Khirnov <anton@khirnov.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "config.h"

#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include <cblas.h>

#if HAVE_OPENCL
#include <cl.h>
#include <clBLAS.h>
#endif

#include "basis.h"
#include "common.h"
#include "cpu.h"
#include "log.h"
#include "nlsolve.h"
#include "td_constraints.h"
#include "teukolsky_data.h"
#include "threadpool.h"

#define NB_EQUATIONS 3

typedef struct TDPriv {
    unsigned int basis_order[NB_EQUATIONS][2];
    BasisSetContext *basis[NB_EQUATIONS][2];

    ThreadPoolContext *tp;
    TDLogger logger;

    double *coeffs;
    double *coeffs_tmp;
    ptrdiff_t coeffs_stride;

    double *coeffs_lapse;

    int cpu_flags;

    double (*scalarproduct_metric)(size_t len1, size_t len2, double *mat,
                                   double *vec1, double *vec2);
} TDPriv;

double tdi_scalarproduct_metric_fma3(size_t len1, size_t len2, double *mat,
                                     double *vec1, double *vec2);
double tdi_scalarproduct_metric_avx(size_t len1, size_t len2, double *mat,
                                    double *vec1, double *vec2);
double tdi_scalarproduct_metric_sse3(size_t len1, size_t len2, double *mat,
                                     double *vec1, double *vec2);
double tdi_scalarproduct_metric_c(size_t len1, size_t len2, double *mat,
                                  double *vec1, double *vec2);

static void init_opencl(TDPriv *s)
#if HAVE_OPENCL
{
    int err, count;
    cl_platform_id platform;
    cl_context_properties props[3];
    cl_device_id ocl_device;

    err = clGetPlatformIDs(1, &platform, &count);
    if (err != CL_SUCCESS || count < 1) {
        tdi_log(&s->logger, 0, "Could not get an OpenCL platform ID\n");
        return;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count);
    if (err != CL_SUCCESS || count < 1) {
        tdi_log(&s->logger, 0, "Could not get an OpenCL device ID\n");
        return;
    }

    props[0] = CL_CONTEXT_PLATFORM;
    props[1] = (cl_context_properties)platform;
    props[2] = 0;

    s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err);
    if (err != CL_SUCCESS || !s->ocl_ctx) {
        tdi_log(&s->logger, 0, "Could not create an OpenCL context\n");
        return;
    }

    s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err);
    if (err != CL_SUCCESS || !s->ocl_queue) {
        tdi_log(&s->logger, 0, "Could not create an OpenCL command queue: %d\n", err);
        goto fail;
    }

    err = clblasSetup();
    if (err != CL_SUCCESS) {
        tdi_log(&s->logger, 0, "Error setting up clBLAS\n");
        goto fail;
    }

    return;
fail:
    if (s->ocl_queue)
        clReleaseCommandQueue(s->ocl_queue);
    s->ocl_queue = 0;

    if (s->ocl_ctx)
        clReleaseContext(s->ocl_ctx);
    s->ocl_ctx = 0;
}
#else
{
}
#endif

static const enum BasisFamily basis_sets[NB_EQUATIONS][2] = {
    { BASIS_FAMILY_SB_EVEN,  BASIS_FAMILY_COS_EVEN },
    { BASIS_FAMILY_SB_EVEN,  BASIS_FAMILY_COS_EVEN },
    { BASIS_FAMILY_SB_EVEN,  BASIS_FAMILY_COS_EVEN },
};

static void log_callback(TDLogger *log, int level, const char *fmt, va_list vl)
{
    TDContext *td = log->opaque;
    td->log_callback(td, level, fmt, vl);
}

static int teukolsky_init_check_options(TDContext *td)
{
    TDPriv *s = td->priv;
    int ret;

    s->cpu_flags = tdi_init_cpu_flags();

    if (!td->nb_threads) {
        td->nb_threads = tdi_cpu_count();
        if (!td->nb_threads)
            td->nb_threads = 1;
    }

    ret = tdi_threadpool_init(&s->tp, td->nb_threads);
    if (ret < 0)
        return ret;

    init_opencl(s);

    s->logger.log    = log_callback;
    s->logger.opaque = td;

    //s->scalarproduct_metric = tdi_scalarproduct_metric_c;
    //if (EXTERNAL_SSE3(s->cpu_flags))
    //    s->scalarproduct_metric = tdi_scalarproduct_metric_sse3;
    //if (EXTERNAL_AVX(s->cpu_flags))
    //    s->scalarproduct_metric = tdi_scalarproduct_metric_avx;
    //if (EXTERNAL_FMA3(s->cpu_flags))
    //    s->scalarproduct_metric = tdi_scalarproduct_metric_fma3;
//    for (int i = 0; i < ctx->nb_equations; i++)
//        for (int j = 0; j < 2; j++) {
//            s->ps_ctx->basis[i][j]       = ctx->basis[i][j];
//            s->ps_ctx->solve_order[i][j] = basis_order[i][j];
//            max_order = MAX(max_order, basis_order[i][j]);
//        }
//
    //for (int i = 0; i < max_order; i++) {
    //    tdi_log(&s->logger, 2, "%d ", i);
    //    for (int j = 0; j < ctx->nb_equations; j++)
    //        for (int k = 0; k < 2; k++) {
    //            if (i < s->ps_ctx->solve_order[j][k])
    //                tdi_log(&s->logger, 2, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]);
    //            else
    //                tdi_log(&s->logger, 2, "        ");
    //        }
    //    tdi_log(&s->logger, 2, "\n");
    //}

    s->basis_order[0][0] = td->nb_coeffs[0];
    s->basis_order[0][1] = td->nb_coeffs[1];
    s->basis_order[1][0] = td->nb_coeffs[0];
    s->basis_order[1][1] = td->nb_coeffs[1];
    s->basis_order[2][0] = td->nb_coeffs[0];
    s->basis_order[2][1] = td->nb_coeffs[1];

    ret = posix_memalign((void**)&s->coeffs, 32,
                         sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]);
    if (ret)
        return -ENOMEM;
    memset(s->coeffs, 0, sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]);

    if (td->solution_branch > 0) {
        ret = posix_memalign((void**)&s->coeffs_tmp, 32,
                             sizeof(*s->coeffs_tmp) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]);
        if (ret)
            return -ENOMEM;
        memset(s->coeffs_tmp, 0, sizeof(*s->coeffs_tmp) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]);
    }

    for (int i = 0; i < NB_EQUATIONS; i++)
        td->coeffs[i] = s->coeffs + i * td->nb_coeffs[0] * td->nb_coeffs[1];

    for (int i = 0; i < ARRAY_ELEMS(basis_sets); i++)
        for (int j = 0; j < ARRAY_ELEMS(basis_sets[i]); j++) {
            double sf;

            ret = tdi_basis_init(&s->basis[i][j], basis_sets[i][j], td->basis_scale_factor[j]);
            if (ret < 0)
                return ret;
        }

    return 0;
}

static int constraint_eval_alloc(const TDContext *td, const unsigned int *nb_coords,
                                 const double * const *coords,
                                 double amplitude, TDConstraintEvalContext **pce)
{
    TDPriv *priv = td->priv;
    TDConstraintEvalContext *ce;
    int ret;

    ret = tdi_constraint_eval_alloc(&ce, td->family);
    if (ret < 0) {
        tdi_log(&priv->logger, 0, "Error allocating the constraints evaluator\n");
        return ret;
    }

    ce->logger       = priv->logger;
    ce->amplitude    = amplitude;
    ce->nb_coords[0] = nb_coords[0];
    ce->nb_coords[1] = nb_coords[1];
    ce->coords[0]    = coords[0];
    ce->coords[1]    = coords[1];

    ret = tdi_constraint_eval_init(ce);
    if (ret < 0) {
        tdi_constraint_eval_free(&ce);
        tdi_log(&priv->logger, 0, "Error initializing the constraints evaluator\n");
        return ret;
    }

    *pce = ce;
    return 0;
}

static int nlsolve_alloc(const TDContext *td, NLSolveContext **pnl)
{
    TDPriv *s = td->priv;
    NLSolveContext *nl;
    int ret;

    ret = tdi_nlsolve_context_alloc(&nl, ARRAY_ELEMS(basis_sets));
    if (ret < 0) {
        tdi_log(&s->logger, 0, "Error allocating the non-linear solver\n");
        return ret;
    }

    nl->logger  = s->logger;
    nl->tp      = s->tp;
    nl->maxiter = td->max_iter;
    nl->atol    = td->atol;

    memcpy(nl->basis,       s->basis,       sizeof(s->basis));
    memcpy(nl->solve_order, s->basis_order, sizeof(s->basis_order));

#if HAVE_OPENCL
    nl->ocl_ctx   = s->ocl_ctx;
    nl->ocl_queue = s->ocl_queue;
#endif

    ret = tdi_nlsolve_context_init(nl);
    if (ret < 0) {
        tdi_log(&s->logger, 0, "Error initializing the non-linear solver\n");
        goto fail;
    }

    *pnl = nl;
    return 0;
fail:
    tdi_nlsolve_context_free(&nl);
    return ret;
}

static int teukolsky_constraint_eval(void *opaque, unsigned int eq_idx,
                                     const unsigned int *colloc_grid_order,
                                     const double * const *colloc_grid,
                                     const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB],
                                     double *dst)
{
    TDConstraintEvalContext *ce = opaque;
    const double *amplitude = opaque;

    return tdi_constraint_eval(ce, eq_idx, vars, dst);
}

int td_solve(TDContext *td, double *coeffs_init[3])
{
    TDPriv *s = td->priv;
    NLSolveContext *nl;
    TDConstraintEvalContext *ce;
    double a0;
    int ret;

    ret = teukolsky_init_check_options(td);
    if (ret < 0)
        return ret;

    ret = nlsolve_alloc(td, &nl);
    if (ret < 0)
        goto fail;

    ret = constraint_eval_alloc(td, td->nb_coeffs, nl->colloc_grid[0], 0.0, &ce);
    if (ret < 0)
        goto fail;
    if (fabs(td->amplitude) >= ce->a_diverge) {
        tdi_log(&s->logger, 0,
                "Amplitude A=%16.16g is above the point A_{max}=%g, no solutions "
                "are known to exist there. Set solution_branch=1 to get to the "
                "second branch where mass increases with decreasing amplitude\n",
                td->amplitude, ce->a_converge);
    }
    a0 = ce->a_converge;
    tdi_constraint_eval_free(&ce);

    if (coeffs_init) {
        for (int i = 0; i < 3; i++) {
            memcpy(td->coeffs[i], coeffs_init[i], sizeof(*td->coeffs[i]) *
                   td->nb_coeffs[0] * td->nb_coeffs[1]);
        }
    }

    if (td->solution_branch == 0 || coeffs_init) {
        // direct solve with default (flat space) or user-provided initial guess
        ret = constraint_eval_alloc(td, td->nb_coeffs, nl->colloc_grid[0], td->amplitude, &ce);
        if (ret < 0)
            goto fail;

        ret = tdi_nlsolve_solve(nl, teukolsky_constraint_eval, NULL,
                                ce, s->coeffs, 0);
        tdi_constraint_eval_free(&ce);
        if (ret < 0) {
            tdi_log(&s->logger, 0, "tdi_nlsolve_solve() failed: %d", ret);
            goto fail;
        }
    } else {
        // second branch requested and no user-provided initial guess
        // execute two lower-branch solutions and extrapolate to get to the upper branch
        const int N = NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1];
        const double delta = 1e-5;
        const double a1 = a0 - delta;

        double cur_amplitude, new_amplitude, step;

        ret = constraint_eval_alloc(td, td->nb_coeffs, nl->colloc_grid[0], a0, &ce);
        if (ret < 0)
            goto fail;

        ret = tdi_nlsolve_solve(nl, teukolsky_constraint_eval, NULL,
                                ce, s->coeffs, 0);
        tdi_constraint_eval_free(&ce);
        if (ret < 0)
            goto fail;

        ret = constraint_eval_alloc(td, td->nb_coeffs, nl->colloc_grid[0], a1, &ce);
        if (ret < 0)
            goto fail;
        ret = tdi_nlsolve_solve(nl, teukolsky_constraint_eval, NULL,
                                ce, s->coeffs_tmp, 0);
        tdi_constraint_eval_free(&ce);
        if (ret < 0)
            goto fail;

        cblas_daxpy(N, -1.0, s->coeffs,     1, s->coeffs_tmp, 1);
        cblas_daxpy(N, -1.0, s->coeffs_tmp, 1, s->coeffs,     1);

        // obtain solution for a1 in the upper branch
        ret = constraint_eval_alloc(td, td->nb_coeffs, nl->colloc_grid[0], a1, &ce);
        if (ret < 0)
            goto fail;

        ret = tdi_nlsolve_solve(nl, teukolsky_constraint_eval, NULL,
                                ce, s->coeffs, 0);
        tdi_constraint_eval_free(&ce);
        if (ret < 0) {
            tdi_log(&s->logger, 0, "Failed to get into the upper branch\n");
            goto fail;
        }

        cur_amplitude = a1;
        step = (td->amplitude - cur_amplitude) / 128.0;

        while (fabs(cur_amplitude - td->amplitude) > DBL_EPSILON) {
            new_amplitude = cur_amplitude + step;
            if (SGN(step) != SGN(td->amplitude - new_amplitude))
                new_amplitude = td->amplitude;
            tdi_log(&s->logger, 2, "Trying amplitude %g\n", new_amplitude);

            ret = constraint_eval_alloc(td, td->nb_coeffs, nl->colloc_grid[0], new_amplitude, &ce);
            if (ret < 0)
                goto fail;

            memcpy(s->coeffs_tmp, s->coeffs, sizeof(*s->coeffs) * N);
            ret = tdi_nlsolve_solve(nl, teukolsky_constraint_eval, NULL,
                                    ce, s->coeffs_tmp, 1);
            tdi_constraint_eval_free(&ce);
            if (ret == -EDOM) {
                step *= 0.5;
                if (fabs(step) < 1e-5)
                    goto fail;
                continue;
            } else if (ret < 0)
                goto fail;

            if (ret <= nl->maxiter / 2)
                step *= 1.75;

            cur_amplitude = new_amplitude;
            memcpy(s->coeffs, s->coeffs_tmp, sizeof(*s->coeffs) * N);
        }
    }
finish:
    ret = 0;
    tdi_nlsolve_print_stats(nl);
fail:
    tdi_nlsolve_context_free(&nl);

    return ret;
}

static void log_default_callback(const TDContext *td, int level, const char *fmt, va_list vl)
{
    vfprintf(stderr, fmt, vl);
}

TDContext *td_context_alloc(void)
{
    TDContext *td = calloc(1, sizeof(*td));

    if (!td)
        return NULL;

    td->nb_threads = 1;

    td->family          = TD_FAMILY_TIME_ANTISYM_CUBIC;
    td->solution_branch = 0;
    td->amplitude       = 1.0;

    td->nb_coeffs[0] = 32;
    td->nb_coeffs[1] = 16;

    td->basis_scale_factor[0] = 3.0;
    td->basis_scale_factor[1] = 3.0;

    td->max_iter = 16;
    td->atol     = 1e-12;

    td->log_callback = log_default_callback;

    td->priv = calloc(1, sizeof(TDPriv));
    if (!td->priv) {
        free(td);
        return NULL;
    }

    return td;
}

void td_context_free(TDContext **ptd)
{
    TDContext *td = *ptd;
    TDPriv *s;

    if (!td)
        return;

    s = td->priv;

    tdi_threadpool_free(&s->tp);

#if HAVE_OPENCL
    if (s->ocl_queue)
        clReleaseCommandQueue(s->ocl_queue);
    if (s->ocl_ctx)
        clReleaseContext(s->ocl_ctx);
#endif

    for (int i = 0; i < ARRAY_ELEMS(s->basis); i++)
        for (int j = 0; j < ARRAY_ELEMS(s->basis[i]); j++)
            tdi_basis_free(&s->basis[i][j]);

    free(s->coeffs);
    free(s->coeffs_tmp);
    free(s->coeffs_lapse);

    free(td->priv);
    free(td);
    *ptd = NULL;
}

typedef struct MaximalSlicingEval {
    double *psi;
    double *dpsi_r;
    double *dpsi_t;
    double *krr;
    double *kpp;
    double *krt;
} MaximalSlicingEval;

static int maximal_slicing_eval(void *opaque, unsigned int eq_idx,
                                const unsigned int *colloc_grid_order,
                                const double * const *colloc_grid,
                                const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB],
                                double *dst)
{
    MaximalSlicingEval *max = opaque;

    for (int idx1 = 0; idx1 < colloc_grid_order[1]; idx1++) {
        const double theta = colloc_grid[1][idx1];
        const double st  = sin(theta);
        const double st2 = SQR(st);
        const double ct  = cos(theta);

        for (int idx0 = 0; idx0 < colloc_grid_order[0]; idx0++) {
            const double r  = colloc_grid[0][idx0];
            const double r2 = SQR(r);

            const int idx = idx1 * colloc_grid_order[0] + idx0;

            const double alpha      = vars[0][PSSOLVE_DIFF_ORDER_00][idx] + 1.0;
            const double dalpha_r   = vars[0][PSSOLVE_DIFF_ORDER_10][idx];
            const double dalpha_t   = vars[0][PSSOLVE_DIFF_ORDER_01][idx];
            const double d2alpha_rr = vars[0][PSSOLVE_DIFF_ORDER_20][idx];
            const double d2alpha_tt = vars[0][PSSOLVE_DIFF_ORDER_02][idx];
            const double d2alpha_rt = vars[0][PSSOLVE_DIFF_ORDER_11][idx];

            const double d2alpha[3][3] = {
                { d2alpha_rr, d2alpha_rt, 0.0 },
                { d2alpha_rt, d2alpha_tt, 0.0 },
                { 0.0, 0.0, 0.0 },
            };

            const double psi  = max->psi[idx];
            const double psi2 = SQR(psi);
            const double psi3 = psi * psi2;
            const double psi4 = SQR(psi2);

            const double dpsi_r = max->dpsi_r[idx];
            const double dpsi_t = max->dpsi_t[idx];

            const double g[3][3] = {
                { psi4, 0.0, 0.0 },
                { 0.0, psi4 * r2, 0.0 },
                { 0.0, 0.0, psi4 * r2 * st2},
            };
            const double gu[3][3] = {
                { 1.0 / psi4, 0.0, 0.0 },
                { 0.0, 1.0 / (psi4 * r2), 0.0 },
                { 0.0, 0.0, 1.0 / (psi4 * r2 * st2) },
            };

            const double dg[3][3][3] = {
                {
                    { 4.0 * dpsi_r * psi3, 0.0, 0.0 },
                    { 0.0, 4.0 * dpsi_r * psi3 * r2 + 2.0 * r * psi4, 0.0 },
                    { 0.0, 0.0, 4.0 * dpsi_r * psi3 * r2 * st2 + 2.0 * r * psi4 * st2 },
                },
                {
                    { 4.0 * dpsi_t * psi3, 0.0, 0.0 },
                    { 0.0, 4.0 * dpsi_t * psi3 * r2, 0.0 },
                    { 0.0, 0.0, 4.0 * dpsi_t * psi3 * r2 * st2 + 2.0 * psi4 * r2 * st * ct },

                },
                {
                    { 0.0, 0.0, 0.0 },
                    { 0.0, 0.0, 0.0 },
                    { 0.0, 0.0, 0.0 },
                },
            };

            const double krr  = max->krr[idx];
            const double kpp  = max->kpp[idx];
            const double krt  = max->krt[idx];

            const double Km[3][3] = {
                { krr, krt, 0.0 },
                { krt / r2, -(krr + kpp), 0.0 },
                { 0.0, 0.0, kpp },
            };

            double G[3][3][3], X[3];
            double laplace_alpha, k2;

            for (int i = 0; i < 3; i++)
                for (int j = 0; j < 3; j++)
                    for (int k = 0; k < 3; k++) {
                        double val = 0.0;

                        for (int l = 0; l < 3; l++)
                            val += gu[i][l] * (dg[k][j][l] + dg[j][k][l] - dg[l][j][k]);

                        G[i][j][k] = 0.5 * val;
                    }

            for (int i = 0; i < 3; i++) {
                double val = 0.0;
                for (int j = 0; j < 3; j++)
                    for (int k = 0; k < 3; k++)
                        val += gu[j][k] * G[i][j][k];
                X[i] = val;
            }

            laplace_alpha = 0.0;
            for (int i = 0; i < 3; i++)
                for (int j = 0; j < 3; j++)
                    laplace_alpha += gu[i][j] * d2alpha[i][j];

            laplace_alpha -= X[0] * dalpha_r + X[1] * dalpha_t;

            k2 = 0.0;
            for (int i = 0; i < 3; i++)
                for (int j = 0; j < 3; j++)
                    k2 += Km[i][j] * Km[j][i];

            dst[idx] = laplace_alpha - alpha * k2; 
        }
    }

    return 0;
}

static int lapse_solve_max(const TDContext *td)
{
    MaximalSlicingEval max = { NULL };
    TDPriv *priv = td->priv;
    NLSolveContext *nl;
    double *coords_r = NULL;
    double *coords_t = NULL;
    double *coeffs;
    int ret = 0;

    ret = tdi_nlsolve_context_alloc(&nl, 1);

    if (ret < 0) {
        tdi_log(&priv->logger, 0, "Error allocating the non-linear solver\n");
        return ret;
    }

    nl->logger  = priv->logger;
    nl->tp      = priv->tp;
    nl->maxiter = td->max_iter;
    nl->atol    = td->atol;

    nl->basis[0][0]       = priv->basis[0][0];
    nl->basis[0][1]       = priv->basis[0][1];
    nl->solve_order[0][0] = priv->basis_order[0][0];
    nl->solve_order[0][1] = priv->basis_order[0][1];

#if HAVE_OPENCL
    nl->ocl_ctx   = priv->ocl_ctx;
    nl->ocl_queue = priv->ocl_queue;
#endif

    ret = tdi_nlsolve_context_init(nl);
    if (ret < 0) {
        tdi_log(&priv->logger, 0, "Error initializing the non-linear solver\n");
        goto fail;
    }

    ret  = posix_memalign((void**)&max.psi,  32, sizeof(*max.psi) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&max.dpsi_r,  32, sizeof(*max.psi) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&max.dpsi_t,  32, sizeof(*max.psi) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&max.krr,  32, sizeof(*max.krr) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&max.kpp,  32, sizeof(*max.kpp) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&max.krt,  32, sizeof(*max.krt) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&coords_r, 32, sizeof(*coords_r) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&coords_t, 32, sizeof(*coords_t) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    ret |= posix_memalign((void**)&coeffs,   32, sizeof(*coeffs) * td->nb_coeffs[0] * td->nb_coeffs[1]);
    if (ret) {
        ret = -ENOMEM;
        goto fail;
    }

    for (int j = 0; j < td->nb_coeffs[1]; j++) {
        for (int i = 0; i < td->nb_coeffs[0]; i++) {
            coords_r[j * td->nb_coeffs[0] + i] = nl->colloc_grid[0][0][i];
            coords_t[j * td->nb_coeffs[0] + i] = nl->colloc_grid[0][1][j];
        }
    }

    td_eval_psi(td, td->nb_coeffs[0] * td->nb_coeffs[1], coords_r, coords_t,
                (const unsigned int [2]){ 0, 0 }, max.psi);
    td_eval_psi(td, td->nb_coeffs[0] * td->nb_coeffs[1], coords_r, coords_t,
                (const unsigned int [2]){ 1, 0 }, max.dpsi_r);
    td_eval_psi(td, td->nb_coeffs[0] * td->nb_coeffs[1], coords_r, coords_t,
                (const unsigned int [2]){ 0, 1 }, max.dpsi_t);
    td_eval_krr(td, td->nb_coeffs[0] * td->nb_coeffs[1], coords_r, coords_t,
                (const unsigned int [2]){ 0, 0 }, max.krr);
    td_eval_kpp(td, td->nb_coeffs[0] * td->nb_coeffs[1], coords_r, coords_t,
                (const unsigned int [2]){ 0, 0 }, max.kpp);
    td_eval_krt(td, td->nb_coeffs[0] * td->nb_coeffs[1], coords_r, coords_t,
                (const unsigned int [2]){ 0, 0 }, max.krt);

    ret = tdi_nlsolve_solve(nl, maximal_slicing_eval, NULL, &max, coeffs, 0);
    if (ret < 0) {
        tdi_log(&priv->logger, 0, "Error solving the maximal slicing equation\n");
        goto fail;
    }

    priv->coeffs_lapse = coeffs;
    coeffs             = NULL;

fail:
    free(coords_r);
    free(coords_t);
    free(coeffs);
    free(max.psi);
    free(max.dpsi_r);
    free(max.dpsi_t);
    free(max.krr);
    free(max.kpp);
    free(max.krt);

    tdi_nlsolve_context_free(&nl);
    return ret;
}

static double scalarproduct_metric_c(size_t len1, size_t len2, double *mat,
                                     double *vec1, double *vec2)
{
    double val = 0.0;
    for (int m = 0; m < len2; m++) {
        double tmp = 0.0;
        for (int n = 0; n < len1; n++)
            tmp += mat[m * len1 + n] * vec1[n];
        val += tmp * vec2[m];
    }
    return val;
}

static int eval_var(const TDContext *td, unsigned int var_idx,
                    size_t nb_coords, const double *r, const double *theta,
                    const unsigned int diff_order[2],
                    double *out)
{
    TDPriv *s = td->priv;
    double *basis_val[2] = { NULL };
    int ret = 0;

    if (diff_order[0] > 2 || diff_order[1] > 2) {
        tdi_log(&s->logger, 0, "Derivatives of higher order than 2 are not supported.\n");
        return -ENOSYS;
    }

    posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]);
    posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]);
    if (!basis_val[0] || !basis_val[1]) {
        ret = -ENOMEM;
        goto fail;
    }
    memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]);
    memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]);

    for (int i = 0; i < nb_coords; i++) {
        double theta_val = theta[i];
        double r_val     = r[i];

        double val = (var_idx == 0 && diff_order[0] == 0 && diff_order[1] == 0) ? 1.0 : 0.0;

        for (int k = 0; k < td->nb_coeffs[0]; k++)
            basis_val[0][k] = tdi_basis_eval(s->basis[var_idx][0], diff_order[0], r_val, k);
        for (int k = 0; k < td->nb_coeffs[1]; k++)
            basis_val[1][k] = tdi_basis_eval(s->basis[var_idx][1], diff_order[1], theta_val, k);

        val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], td->coeffs[var_idx],
                                      basis_val[0], basis_val[1]);

        out[i] = val;
    }


fail:
    free(basis_val[0]);
    free(basis_val[1]);

    return ret;
}

int td_eval_psi(const TDContext *td,
                size_t nb_coords, const double *r, const double *theta,
                const unsigned int diff_order[2],
                double *out)
{
    return eval_var(td, 0, nb_coords, r, theta, diff_order, out);
}

int td_eval_krr(const TDContext *td,
                size_t nb_coords, const double *r, const double *theta,
                const unsigned int diff_order[2],
                double *out)
{
    return eval_var(td, 1, nb_coords, r, theta, diff_order, out);
}
int td_eval_kpp(const TDContext *td,
                size_t nb_coords, const double *r, const double *theta,
                const unsigned int diff_order[2],
                double *out)
{
    return eval_var(td, 2, nb_coords, r, theta, diff_order, out);
}

int td_eval_krt(const TDContext *td,
                size_t nb_coords, const double *r, const double *theta,
                const unsigned int diff_order[2],
                double *out)
{
    static const double dummy_coord = 0.0;
    static const double *dummy_coords[2] = { &dummy_coord, &dummy_coord };
    static const unsigned int nb_dummy_coords[2] = { 1, 1 };

    TDConstraintEvalContext *ce;
    double (*eval)(TDConstraintEvalContext*, double, double);
    int ret;

    if (diff_order[0] == 0 && diff_order[1] == 0)
        eval = tdi_constraint_eval_k_rtheta;
    else if (diff_order[0] == 1 && diff_order[1] == 0)
        eval = tdi_constraint_eval_dk_rtheta_r;
    else if (diff_order[0] == 0 && diff_order[1] == 1)
        eval = tdi_constraint_eval_dk_rtheta_t;
    else
        return -ENOSYS;

    ret = constraint_eval_alloc(td, nb_dummy_coords, dummy_coords,
                                td->amplitude, &ce);
    if (ret < 0)
        return ret;

    for (int i = 0; i < nb_coords; i++) {
        double theta_val = theta[i];
        double r_val     = r[i];

        out[i] = eval(ce, r_val, theta_val);
    }

    tdi_constraint_eval_free(&ce);

    return 0;
}

int td_eval_lapse(const TDContext *td,
                  size_t nb_coords, const double *r, const double *theta,
                  const unsigned int diff_order[2],
                  double *out)
{
    TDPriv *priv = td->priv;
    double *basis_val[2] = { NULL };
    int ret;

    if (!priv->coeffs_lapse) {
        ret = lapse_solve_max(td);
        if (ret < 0)
            return ret;
    }

    if (diff_order[0] > 2 || diff_order[1] > 2) {
        tdi_log(&priv->logger, 0, "Derivatives of higher order than 2 are not supported.\n");
        return -ENOSYS;
    }

    posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]);
    posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]);
    if (!basis_val[0] || !basis_val[1]) {
        ret = -ENOMEM;
        goto fail;
    }
    memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]);
    memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]);

    for (int i = 0; i < nb_coords; i++) {
        double theta_val = theta[i];
        double r_val     = r[i];

        double val = (diff_order[0] == 0 && diff_order[1] == 0) ? 1.0 : 0.0;

        for (int k = 0; k < td->nb_coeffs[0]; k++)
            basis_val[0][k] = tdi_basis_eval(priv->basis[0][0], diff_order[0], r_val, k);
        for (int k = 0; k < td->nb_coeffs[1]; k++)
            basis_val[1][k] = tdi_basis_eval(priv->basis[0][1], diff_order[1], theta_val, k);

        val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], priv->coeffs_lapse,
                                      basis_val[0], basis_val[1]);

        out[i] = val;
    }

fail:
    free(basis_val[0]);
    free(basis_val[1]);

    return 0;
}