aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2017-11-16 13:11:07 +0100
committerAnton Khirnov <anton@khirnov.net>2017-12-29 15:29:52 +0100
commit56b797850884656ae28eef80560563ccf42db007 (patch)
tree341d1142c2387e0c7af9af675aebfe0682da6717
parent0007a0b0c11fa7c12b228883453368f105a4324b (diff)
Add the generic non-linear solver layer.
-rw-r--r--Makefile4
-rw-r--r--nlsolve.c529
-rw-r--r--nlsolve.h108
-rw-r--r--tests/nlsolve.c300
4 files changed, 940 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 6affc77..14ad028 100644
--- a/Makefile
+++ b/Makefile
@@ -9,10 +9,12 @@ OBJS = basis.o \
cpu.o \
cpuid.o \
log.o \
+ nlsolve.o \
pssolve.o \
threadpool.o \
-TESTPROGS = pssolve
+TESTPROGS = nlsolve \
+ pssolve
TESTPROGS := $(addprefix tests/,$(TESTPROGS))
diff --git a/nlsolve.c b/nlsolve.c
new file mode 100644
index 0000000..5d691e0
--- /dev/null
+++ b/nlsolve.c
@@ -0,0 +1,529 @@
+/*
+ * Newton-Kantorovich iterative solver for nonlinear PDE systems
+ * Copyright (C) 2017 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include "basis.h"
+#include "common.h"
+#include "log.h"
+#include "pssolve.h"
+#include "nlsolve.h"
+#include "threadpool.h"
+
+#define NB_COEFFS(td) (td->nb_coeffs[0] * td->nb_coeffs[1])
+#define NB_COLLOC_POINTS(td) (td->nb_colloc_points[0] * td->nb_colloc_points[1])
+#define SOLVE_ORDER(ctx, idx) (ctx->solve_order[idx][0] * ctx->solve_order[idx][1])
+#define VAR_EPS 1e-4
+
+/* per-equation state */
+typedef struct NLEquationContext {
+ /* eq_coeffs[i][j] is an array of coefficients at the collocation points
+ * for j-th derivative of i-th unknown function */
+ double *(*eq_coeffs)[PSSOLVE_DIFF_ORDER_NB];
+ const double *(*var_ptrs)[PSSOLVE_DIFF_ORDER_NB];
+ const double *(*var_mod_ptrs)[PSSOLVE_DIFF_ORDER_NB];
+
+ double *rhs;
+
+} NLEquationContext;
+
+/* per-variable state */
+typedef struct NLVarContext {
+ ptrdiff_t coeffs_offset;
+ double *vars[PSSOLVE_DIFF_ORDER_NB];
+ double *val_tmp[3];
+ double *eps;
+
+ double *transform_matrix[2][3];
+ double *transform_tmp;
+} NLVarContext;
+
+struct NLSolvePriv {
+ PSSolveContext *ps_ctx;
+
+ TDLogger logger;
+
+ double amplitude;
+
+ unsigned int nb_equations;
+ unsigned int nb_vars;
+ unsigned int solve_order;
+
+ NLEquationContext *eqs;
+ NLVarContext *vars;
+
+ const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB];
+ double *delta;
+ double *rhs;
+
+ ThreadPoolContext *tp;
+ ThreadPoolContext *tp_internal;
+
+ uint64_t solve_count;
+ uint64_t solve_time;
+
+ uint64_t calc_eq_coeffs_count;
+ uint64_t calc_eq_coeffs_time;
+};
+
+int tdi_nlsolve_solve(NLSolveContext *ctx, NLEqCallback eq_eval,
+ NLEqJacobianCallback eq_jac_eval, void *opaque, double *coeffs)
+{
+ NLSolvePriv *s = ctx->priv;
+ int64_t start, totaltime_start;
+ int ret = 0;
+
+ totaltime_start = gettime();
+
+ for (unsigned int it = 0; it < ctx->maxiter; it++) {
+ size_t max_idx;
+
+ start = gettime();
+
+ // evaluate the unknowns and their derivatives on the pseudospectral grid
+ for (int i = 0; i < s->nb_vars; i++) {
+ NLVarContext *var_ctx = &s->vars[i];
+
+ for (int j = 0; j < PSSOLVE_DIFF_ORDER_NB; j++) {
+ int diff0 = tdi_pssolve_diff_order(j, 0);
+ int diff1 = tdi_pssolve_diff_order(j, 1);
+
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+ ctx->solve_order[i][0], ctx->solve_order[i][1], ctx->solve_order[i][1], 1.0,
+ coeffs + var_ctx->coeffs_offset, ctx->solve_order[i][0],
+ var_ctx->transform_matrix[1][diff1], ctx->solve_order[i][1],
+ 0.0, var_ctx->transform_tmp, ctx->solve_order[i][0]);
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+ ctx->solve_order[i][0], ctx->solve_order[i][1], ctx->solve_order[i][0], 1.0,
+ var_ctx->transform_matrix[0][diff0], ctx->solve_order[i][0],
+ var_ctx->transform_tmp, ctx->solve_order[i][0], 0.0, var_ctx->vars[j], ctx->solve_order[i][0]);
+ }
+ }
+
+ // build equation coeffs
+ for (int i = 0; i < s->nb_equations; i++) {
+ NLEquationContext *eq_ctx = &s->eqs[i];
+ const int N = SOLVE_ORDER(ctx, i);
+
+ for (int j = 0; j < s->nb_vars; j++)
+ for (int k = 0; k < ARRAY_ELEMS(eq_ctx->var_ptrs[j]); k++)
+ eq_ctx->var_ptrs[j][k] = s->vars[j].vars[k];
+
+ ret = eq_eval(opaque, i, s->ps_ctx->solve_order[i],
+ s->ps_ctx->colloc_grid[i], eq_ctx->var_ptrs,
+ eq_ctx->rhs);
+ if (ret < 0)
+ return ret;
+ cblas_dscal(N, -1.0, eq_ctx->rhs, 1);
+
+ for (int j = 0; j < s->nb_vars; j++) {
+ NLVarContext *var_ctx = &s->vars[j];
+
+ for (int k = 0; k < PSSOLVE_DIFF_ORDER_NB; k++) {
+ double *dst = eq_ctx->eq_coeffs[j][k];
+
+ if (eq_jac_eval) {
+ ret = eq_jac_eval(opaque, i, j, k, s->ps_ctx->solve_order[i],
+ s->ps_ctx->colloc_grid[i], eq_ctx->var_ptrs, dst);
+ if (ret < 0)
+ return ret;
+
+ continue;
+ }
+
+ // jacobian not available -- try to approximate with
+ // finite differences
+ memcpy(eq_ctx->var_mod_ptrs, eq_ctx->var_ptrs, s->nb_vars * sizeof(*eq_ctx->var_ptrs));
+ memcpy(var_ctx->val_tmp[0], eq_ctx->var_ptrs[j][k], N * sizeof(*var_ctx->eps));
+
+ cblas_daxpy(N, 1.0, var_ctx->eps, 1, var_ctx->val_tmp[0], 1);
+ eq_ctx->var_mod_ptrs[j][k] = var_ctx->val_tmp[0];
+
+ ret = eq_eval(opaque, i,
+ s->ps_ctx->solve_order[i], s->ps_ctx->colloc_grid[i],
+ eq_ctx->var_mod_ptrs, dst);
+ if (ret < 0)
+ return ret;
+
+ cblas_daxpy(N, -2.0, var_ctx->eps, 1, var_ctx->val_tmp[0], 1);
+ ret = eq_eval(opaque, i,
+ s->ps_ctx->solve_order[i], s->ps_ctx->colloc_grid[i],
+ eq_ctx->var_mod_ptrs, var_ctx->val_tmp[1]);
+ if (ret < 0)
+ return ret;
+
+ cblas_daxpy(N, -1.0, var_ctx->val_tmp[1], 1, dst, 1);
+ cblas_dscal(N, 1.0 / (2.0 * VAR_EPS), dst, 1);
+ }
+ }
+ }
+
+ // solve for delta
+ ret = tdi_pssolve_solve(s->ps_ctx, s->eq_coeffs, s->rhs, s->delta);
+ if (ret < 0)
+ return ret;
+
+ cblas_daxpy(s->solve_order, 1.0, s->delta, 1, coeffs, 1);
+ max_idx = cblas_idamax(s->solve_order, s->delta, 1);
+ if (fabs(s->delta[max_idx]) < ctx->atol) {
+ tdi_log(&ctx->logger, 2, "Converged on iteration %d: max(delta) %g, tolerance %g\n",
+ it, s->delta[max_idx], ctx->atol);
+ ret = 0;
+ goto finish;
+ }
+
+ tdi_log(&ctx->logger, 3, "Iteration %d, max(delta) %g coeffs[0] %g\n",
+ it, s->delta[max_idx], coeffs[0]);
+ }
+
+ tdi_log(&ctx->logger, 0, "The solver did not converge\n");
+ ret = -EDOM;
+
+finish:
+ s->solve_count++;
+ s->solve_time += gettime() - totaltime_start;
+
+ return ret;
+}
+
+void tdi_solver_print_stats(NLSolveContext *ctx)
+{
+ NLSolvePriv *s = ctx->priv;
+
+ tdi_log(&ctx->logger, 2,
+ "%g%% calc equation coefficients: %lu, "
+ "total time %g s, avg time per call %g ms\n",
+ (double)s->calc_eq_coeffs_time * 100 / s->solve_time,
+ s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6,
+ (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3);
+ tdi_log(&ctx->logger, 2,
+ "%g%% pseudospectral matrix construction: %lu, "
+ "total time %g s, avg time per call %g ms\n",
+ (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time,
+ s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6,
+ (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3);
+ tdi_log(&ctx->logger, 2,
+ "%g%% BiCGSTAB %lu solves, "
+ "%lu iterations, total time %g s, "
+ "avg iterations per solve %g, avg time per solve %g ms, "
+ "avg time per iteration %g ms\n",
+ (double)s->ps_ctx->cg_time_total * 100 / s->solve_time,
+ s->ps_ctx->cg_solve_count, s->ps_ctx->cg_iter_count, (double)s->ps_ctx->cg_time_total / 1e6,
+ (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count,
+ (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3,
+ (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3);
+ tdi_log(&ctx->logger, 2,
+ "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n",
+ (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time,
+ s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6,
+ (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3);
+}
+
+static void eq_ctx_free(NLSolveContext *ctx, unsigned int eq_idx)
+{
+ NLSolvePriv *s = ctx->priv;
+ NLEquationContext *eq_ctx = &s->eqs[eq_idx];
+ int i, j;
+
+ if (eq_ctx->eq_coeffs) {
+ for (int i = 0; i < s->nb_vars; i++)
+ for (int j = 0; j < ARRAY_ELEMS(eq_ctx->eq_coeffs[i]); j++)
+ free(eq_ctx->eq_coeffs[i][j]);
+ }
+ free(eq_ctx->eq_coeffs);
+ free(eq_ctx->var_ptrs);
+ free(eq_ctx->var_mod_ptrs);
+}
+
+static int eq_ctx_init(NLSolveContext *ctx, unsigned int eq_idx)
+{
+ NLSolvePriv *s = ctx->priv;
+ NLEquationContext *eq_ctx = &s->eqs[eq_idx];
+ int ret;
+
+ /* allocate the equation coefficients */
+ eq_ctx->eq_coeffs = calloc(s->nb_vars, sizeof(*eq_ctx->eq_coeffs));
+ if (!eq_ctx->eq_coeffs)
+ return -ENOMEM;
+ for (int i = 0; i < s->nb_vars; i++)
+ for (int j = 0; j < ARRAY_ELEMS(eq_ctx->eq_coeffs[i]); j++) {
+ ret = posix_memalign((void**)&eq_ctx->eq_coeffs[i][j], 32,
+ SOLVE_ORDER(ctx, i) * sizeof(*eq_ctx->eq_coeffs[i][j]));
+ if (ret)
+ return -ENOMEM;
+ }
+
+ eq_ctx->var_ptrs = calloc(s->nb_vars, sizeof(*eq_ctx->var_ptrs));
+ eq_ctx->var_mod_ptrs = calloc(s->nb_vars, sizeof(*eq_ctx->var_mod_ptrs));
+ if (!eq_ctx->var_ptrs || !eq_ctx->var_mod_ptrs)
+ return -ENOMEM;
+
+ /* setup the RHS pointer */
+ if (eq_idx == 0)
+ eq_ctx->rhs = s->rhs;
+ else
+ eq_ctx->rhs = s->eqs[eq_idx - 1].rhs + SOLVE_ORDER(ctx, eq_idx - 1);
+
+ return 0;
+}
+
+static void var_ctx_free(NLSolveContext *ctx, unsigned int var_idx)
+{
+ NLSolvePriv *s = ctx->priv;
+ NLVarContext *var_ctx = &s->vars[var_idx];
+
+ for (int i = 0; i < ARRAY_ELEMS(var_ctx->vars); i++)
+ free(var_ctx->vars[i]);
+
+ for (int i = 0; i < ARRAY_ELEMS(var_ctx->transform_matrix); i++)
+ for (int j = 0; j < ARRAY_ELEMS(var_ctx->transform_matrix[i]); j++)
+ free(var_ctx->transform_matrix[i][j]);
+
+ for (int i = 0; i < ARRAY_ELEMS(var_ctx->val_tmp); i++)
+ free(var_ctx->val_tmp[i]);
+
+ free(var_ctx->transform_tmp);
+ free(var_ctx->eps);
+}
+
+static int var_ctx_init(NLSolveContext *ctx, unsigned int var_idx)
+{
+ NLSolvePriv *s = ctx->priv;
+ NLVarContext *var_ctx = &s->vars[var_idx];
+ int ret;
+
+ for (int i = 0; i < ARRAY_ELEMS(var_ctx->vars); i++) {
+ ret = posix_memalign((void**)&var_ctx->vars[i], 32,
+ SOLVE_ORDER(ctx, var_idx) * sizeof(*var_ctx->vars[i]));
+ if (ret)
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i < ARRAY_ELEMS(var_ctx->transform_matrix); i++)
+ for (int j = 0; j < ARRAY_ELEMS(var_ctx->transform_matrix[i]); j++) {
+ ret = posix_memalign((void**)&var_ctx->transform_matrix[i][j], 32,
+ SQR(ctx->solve_order[var_idx][i]) * sizeof(*var_ctx->transform_matrix[i][j]));
+ if (ret)
+ return -ENOMEM;
+
+ for (int k = 0; k < ctx->solve_order[var_idx][i]; k++)
+ for (int l = 0; l < ctx->solve_order[var_idx][i]; l++) {
+ int idx = (i == 0) ? ctx->solve_order[var_idx][i] * l + k :
+ ctx->solve_order[var_idx][i] * k + l;
+ var_ctx->transform_matrix[i][j][idx] = tdi_basis_eval(ctx->basis[var_idx][i], j,
+ s->ps_ctx->colloc_grid[var_idx][i][k], l);
+ }
+ }
+
+ for (int i = 0; i < ARRAY_ELEMS(var_ctx->val_tmp); i++) {
+ ret = posix_memalign((void**)&var_ctx->val_tmp[i], 32,
+ SOLVE_ORDER(ctx, var_idx) * sizeof(*var_ctx->val_tmp[i]));
+ if (ret)
+ return -ENOMEM;
+ }
+
+ ret = posix_memalign((void**)&var_ctx->transform_tmp, 32,
+ SOLVE_ORDER(ctx, var_idx) * sizeof(*var_ctx->transform_tmp));
+ if (ret)
+ return -ENOMEM;
+
+ ret = posix_memalign((void**)&var_ctx->eps, 32,
+ SOLVE_ORDER(ctx, var_idx) * sizeof(*var_ctx->eps));
+ if (ret)
+ return -ENOMEM;
+ for (int i = 0; i < SOLVE_ORDER(ctx, var_idx); i++)
+ var_ctx->eps[i] = VAR_EPS;
+
+ /* setup the coeffs offset */
+ if (var_idx > 0)
+ var_ctx->coeffs_offset = s->vars[var_idx - 1].coeffs_offset + SOLVE_ORDER(ctx, var_idx - 1);
+
+ return 0;
+}
+
+int tdi_nlsolve_context_alloc(NLSolveContext **pctx, unsigned int nb_equations)
+{
+ NLSolveContext *ctx;
+ NLSolvePriv *s;
+ int ret;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->priv = calloc(1, sizeof(*ctx->priv));
+ if (!ctx->priv) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ s = ctx->priv;
+
+ ctx->nb_equations = nb_equations;
+ s->nb_equations = nb_equations;
+ s->nb_vars = nb_equations;
+
+ s->eq_coeffs = calloc(s->nb_equations, sizeof(*s->eq_coeffs));
+ s->eqs = calloc(s->nb_equations, sizeof(*s->eqs));
+ s->vars = calloc(s->nb_vars, sizeof(*s->vars));
+ if (!s->eq_coeffs || !s->eqs || !s->vars) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ctx->basis = calloc(s->nb_vars, sizeof(*ctx->basis));
+ if (!ctx->basis) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ctx->solve_order = calloc(s->nb_vars, sizeof(*ctx->solve_order));
+ if (!ctx->solve_order) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ *pctx = ctx;
+ return 0;
+fail:
+ tdi_nlsolve_context_free(&ctx);
+ return ret;
+}
+
+int tdi_nlsolve_context_init(NLSolveContext *ctx)
+{
+ NLSolvePriv *s = ctx->priv;
+ unsigned int solve_order = 0;
+ int ret;
+
+ if (ctx->tp) {
+ s->tp = ctx->tp;
+ } else {
+ ret = tdi_threadpool_init(&s->tp_internal, 1);
+ if (ret < 0)
+ return ret;
+ s->tp = s->tp_internal;
+ }
+
+ for (int i = 0; i < s->nb_vars; i++) {
+ if (ctx->solve_order[i][0] >= UINT_MAX / ctx->solve_order[i][1] ||
+ solve_order >= UINT_MAX - ctx->solve_order[i][0] * ctx->solve_order[i][1]) {
+ return -EDOM;
+ }
+ solve_order += ctx->solve_order[i][0] * ctx->solve_order[i][1];
+ }
+ s->solve_order = solve_order;
+
+ ret = posix_memalign((void**)&s->rhs, 32, sizeof(*s->rhs) * solve_order);
+ ret |= posix_memalign((void**)&s->delta, 32, sizeof(*s->delta) * solve_order);
+ if (ret) {
+ return -ENOMEM;
+ }
+
+ ret = tdi_pssolve_context_alloc(&s->ps_ctx, s->nb_equations);
+ if (ret < 0) {
+ tdi_log(&ctx->logger, 0, "Error allocating the pseudospectral solver");
+ return ret;
+ }
+
+ s->ps_ctx->logger = ctx->logger;
+ s->ps_ctx->tp = s->tp;
+
+#if HAVE_OPENCL
+ s->ps_ctx->ocl_ctx = ctx->ocl_ctx;
+ s->ps_ctx->ocl_queue = ctx->ocl_queue;
+#endif
+
+ memcpy(s->ps_ctx->basis, ctx->basis, s->nb_vars * sizeof(*ctx->basis));
+ memcpy(s->ps_ctx->solve_order, ctx->solve_order, s->nb_vars * sizeof(*ctx->solve_order));
+
+ ret = tdi_pssolve_context_init(s->ps_ctx);
+ if (ret < 0) {
+ tdi_log(&ctx->logger, 0, "Error initializing the pseudospectral solver\n");
+ return ret;
+ }
+
+ /* init the per-equation state */
+ for (int i = 0; i < s->nb_equations; i++) {
+ ret = eq_ctx_init(ctx, i);
+ if (ret < 0)
+ return ret;
+
+ s->eq_coeffs[i] = s->eqs[i].eq_coeffs;
+ }
+
+ /* init the per-variable state */
+ for (int i = 0; i < s->nb_vars; i++) {
+ ret = var_ctx_init(ctx, i);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+void tdi_nlsolve_context_free(NLSolveContext **pctx)
+{
+ NLSolveContext *ctx = *pctx;
+
+ if (!ctx)
+ return;
+
+ if (ctx->priv) {
+ free(ctx->priv->rhs);
+ free(ctx->priv->delta);
+
+ if (ctx->priv->eqs)
+ for (int i = 0; i < ctx->nb_equations; i++)
+ eq_ctx_free(ctx, i);
+ free(ctx->priv->eqs);
+
+ if (ctx->priv->vars)
+ for (int i = 0; i < ctx->nb_equations; i++)
+ var_ctx_free(ctx, i);
+ free(ctx->priv->vars);
+
+ free(ctx->priv->eq_coeffs);
+
+ tdi_pssolve_context_free(&ctx->priv->ps_ctx);
+
+ tdi_threadpool_free(&ctx->priv->tp_internal);
+ }
+
+ free(ctx->priv);
+
+ free(ctx->basis);
+ free(ctx->solve_order);
+
+ free(ctx);
+ *pctx = NULL;
+}
diff --git a/nlsolve.h b/nlsolve.h
new file mode 100644
index 0000000..1d3fae5
--- /dev/null
+++ b/nlsolve.h
@@ -0,0 +1,108 @@
+/*
+ * Newton-Kantorovich iterative solver for nonlinear PDE systems
+ * Copyright (C) 2017 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TEUKOLSKY_DATA_NLSOLVE_H
+#define TEUKOLSKY_DATA_NLSOLVE_H
+
+#include "basis.h"
+#include "log.h"
+#include "pssolve.h"
+#include "threadpool.h"
+
+typedef struct NLSolvePriv NLSolvePriv;
+
+typedef struct NLSolveContext {
+ /**
+ * Solver private data, not to be touched by the caller.
+ */
+ NLSolvePriv *priv;
+
+ /**
+ * The logging context.
+ * Set by the caller before tdi_nlsolve_context_init().
+ */
+ TDLogger logger;
+
+ /**
+ * The thread pool used for multithreaded execution. May be set by the
+ * caller before tdi_nlsolve_context_init(), otherwise a single thread will
+ * be used.
+ */
+ ThreadPoolContext *tp;
+
+ /**
+ * Number of equations/unknown functions in the set.
+ * Set by tdi_nlsolve_context_alloc().
+ */
+ unsigned int nb_equations;
+
+ /**
+ * The basis sets.
+ *
+ * basis[i][j] is the basis set used for i-th variable in j-th direction.
+ *
+ * The array is allocated by tdi_nlsolve_context_alloc(), must be filled
+ * by the caller before tdi_nlsolve_context_init().
+ */
+ const BasisSetContext *(*basis)[2];
+
+ /**
+ * Order of the solver.
+ *
+ * solve_order[i][j] is the order of the solver (i.e. the number of the
+ * basis functions used) for i-th variable in j-th direction.
+ *
+ * Allocated by tdi_nlsolve_context_alloc(), must be filled by the caller
+ * before tdi_nlsolve_context_init().
+ */
+ unsigned int (*solve_order)[2];
+
+#if HAVE_OPENCL
+ cl_context ocl_ctx;
+ cl_command_queue ocl_queue;
+#endif
+
+ // solver parameters
+ unsigned int maxiter;
+ double atol;
+} NLSolveContext;
+
+typedef int (*NLEqCallback)(
+ void *opaque, unsigned int eq_idx,
+ const unsigned int *colloc_grid_order, const double * const *colloc_grid,
+ const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB], double *dst);
+
+typedef int (*NLEqJacobianCallback)(
+ void *opaque, unsigned int eq_idx,
+ unsigned int var_idx, enum PSSolveDiffOrder diff_order,
+ const unsigned int *colloc_grid_order, const double * const *colloc_grid,
+ const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB], double *dst);
+
+int tdi_nlsolve_context_alloc(NLSolveContext **ctx, unsigned int nb_equations);
+
+int tdi_nlsolve_context_init(NLSolveContext *ctx);
+
+void tdi_nlsolve_context_free(NLSolveContext **ctx);
+
+int tdi_nlsolve_solve(NLSolveContext *ctx, NLEqCallback eq_eval,
+ NLEqJacobianCallback eq_jac_eval,
+ void *opaque, double *coeffs);
+
+void tdi_nlsolve_print_stats(NLSolveContext *ctx);
+
+#endif // TEUKOLSKY_DATA_NLSOLVE_H
diff --git a/tests/nlsolve.c b/tests/nlsolve.c
new file mode 100644
index 0000000..6d46c9e
--- /dev/null
+++ b/tests/nlsolve.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2017 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "basis.h"
+#include "common.h"
+#include "log.h"
+#include "nlsolve.h"
+#include "pssolve.h"
+
+#define N_X 8
+#define N_Z 4
+#define FUNC_X 4
+#define FUNC_Z 2
+#define TOL 5e-15
+#define ITER 10
+static int
+scalar_linear_func(void *opaque, unsigned int eq_idx,
+ const unsigned int *colloc_grid_order,
+ const double * const *colloc_grid,
+ const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB],
+ double *dst)
+{
+ BasisSetContext *basis = opaque;
+
+ for (int j = 0; j < colloc_grid_order[1]; j++) {
+ const double z = colloc_grid[1][j];
+
+ for (int i = 0; i < colloc_grid_order[0]; i++) {
+ const double x = colloc_grid[0][i];
+
+ const int idx = j * colloc_grid_order[0] + i;
+
+ const double d2x = vars[0][PSSOLVE_DIFF_ORDER_20][idx];
+ const double d2z = vars[0][PSSOLVE_DIFF_ORDER_02][idx];
+ const double val = vars[0][PSSOLVE_DIFF_ORDER_00][idx];
+
+ const double d2x_ex = tdi_basis_eval(basis, BS_EVAL_TYPE_DIFF2, x, FUNC_X) *
+ tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, z, FUNC_Z);
+ const double d2z_ex = tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, x, FUNC_X) *
+ tdi_basis_eval(basis, BS_EVAL_TYPE_DIFF2, z, FUNC_Z);
+ const double val_ex = tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, x, FUNC_X) *
+ tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, z, FUNC_Z);
+
+ dst[idx] = (d2x + d2z + val) - (d2x_ex + d2z_ex + val_ex);
+ }
+ }
+
+ return 0;
+}
+
+static int scalar_linear(void)
+{
+ BasisSetContext *basis = NULL;
+ NLSolveContext *ctx = NULL;
+
+ double coeffs[N_X * N_Z];
+ int ret;
+
+ ret = tdi_basis_init(&basis, BASIS_FAMILY_SB_EVEN, 1.0);
+ if (ret < 0)
+ goto finish;
+
+ ret = tdi_nlsolve_context_alloc(&ctx, 1);
+ if (ret < 0)
+ return ret;
+
+ ctx->basis[0][0] = basis;
+ ctx->basis[0][1] = basis;
+
+ ctx->solve_order[0][0] = N_X;
+ ctx->solve_order[0][1] = N_Z;
+
+ ctx->maxiter = ITER;
+ ctx->atol = TOL;
+
+ ctx->logger.log = tdi_log_default_callback;
+
+ ret = tdi_nlsolve_context_init(ctx);
+ if (ret < 0)
+ goto finish;
+
+ // seed the initial guess with garbage, but not too large, numbers
+ for (int i = 0; i < N_X; i++)
+ for (int j = 0; j < N_Z; j++)
+ coeffs[j * N_X + i] = 0.2 * j + 0.1 * sin(138.0 * i);
+
+ ret = tdi_nlsolve_solve(ctx, scalar_linear_func, NULL, basis, coeffs);
+ if (ret < 0)
+ goto finish;
+
+ for (int j = 0; j < N_Z; j++)
+ for (int i = 0; i < N_X; i++) {
+ const int idx = j * N_X + i;
+ const double val = (i == FUNC_X && j == FUNC_Z) ? 1.0 : 0.0;
+
+ if (fabs(coeffs[idx] - val) > TOL) {
+ fprintf(stderr, "unexpected value %g at %d/%d\n",
+ coeffs[idx], i, j);
+ ret = -1;
+ goto finish;
+ }
+ }
+
+finish:
+ tdi_nlsolve_context_free(&ctx);
+ tdi_basis_free(&basis);
+
+ return ret;
+}
+#undef N_X
+#undef N_Z
+#undef FUNC_X
+#undef FUNC_Z
+#undef TOL
+#undef ITER
+
+#define N_X 16
+#define N_Z 16
+#define L 0.1
+#define FUNC_X 4
+#define FUNC_Z 2
+#define ATOL 1e-14
+#define TOL 5e-15
+#define ITER 10
+static int
+scalar_quadratic_func(void *opaque, unsigned int eq_idx,
+ const unsigned int *colloc_grid_order,
+ const double * const *colloc_grid,
+ const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB],
+ double *dst)
+{
+ BasisSetContext *basis = opaque;
+
+ for (int j = 0; j < colloc_grid_order[1]; j++) {
+ const double z = colloc_grid[1][j];
+
+ for (int i = 0; i < colloc_grid_order[0]; i++) {
+ const double x = colloc_grid[0][i];
+
+ const int idx = j * colloc_grid_order[0] + i;
+
+ const double d2x = vars[0][PSSOLVE_DIFF_ORDER_20][idx];
+ const double d2z = vars[0][PSSOLVE_DIFF_ORDER_02][idx];
+ const double val = vars[0][PSSOLVE_DIFF_ORDER_00][idx];
+
+ const double d2x_ex = tdi_basis_eval(basis, BS_EVAL_TYPE_DIFF2, x, FUNC_X) *
+ tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, z, FUNC_Z);
+ const double d2z_ex = tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, x, FUNC_X) *
+ tdi_basis_eval(basis, BS_EVAL_TYPE_DIFF2, z, FUNC_Z);
+ const double val_ex = tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, x, FUNC_X) *
+ tdi_basis_eval(basis, BS_EVAL_TYPE_VALUE, z, FUNC_Z);
+
+ dst[idx] = (d2x - d2x_ex) + (d2z - d2z_ex) + (SQR(val) - SQR(val_ex));
+ }
+ }
+
+ return 0;
+}
+
+static int
+scalar_quadratic_jac(void *opaque, unsigned int eq_idx,
+ unsigned int var_idx, enum PSSolveDiffOrder diff_order,
+ const unsigned int *colloc_grid_order,
+ const double * const *colloc_grid,
+ const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB],
+ double *dst)
+{
+ BasisSetContext *basis = opaque;
+
+ for (int j = 0; j < colloc_grid_order[1]; j++) {
+ const double z = colloc_grid[1][j];
+
+ for (int i = 0; i < colloc_grid_order[0]; i++) {
+ const double x = colloc_grid[0][i];
+
+ const int idx = j * colloc_grid_order[0] + i;
+
+ const double val = vars[0][PSSOLVE_DIFF_ORDER_00][idx];
+
+ if (diff_order == PSSOLVE_DIFF_ORDER_20 ||
+ diff_order == PSSOLVE_DIFF_ORDER_02) {
+ dst[idx] = 1.0;
+ } else if (diff_order == PSSOLVE_DIFF_ORDER_00) {
+ dst[idx] = 2.0 * val;
+ } else {
+ dst[idx] = 0.0;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int scalar_quadratic(void)
+{
+ BasisSetContext *basis = NULL;
+ NLSolveContext *ctx = NULL;
+
+ double coeffs[N_X * N_Z];
+ int ret;
+
+ ret = tdi_basis_init(&basis, BASIS_FAMILY_SB_EVEN, L);
+ if (ret < 0)
+ goto finish;
+
+ ret = tdi_nlsolve_context_alloc(&ctx, 1);
+ if (ret < 0)
+ return ret;
+
+ ctx->basis[0][0] = basis;
+ ctx->basis[0][1] = basis;
+
+ ctx->solve_order[0][0] = N_X;
+ ctx->solve_order[0][1] = N_Z;
+
+ ctx->maxiter = ITER;
+ ctx->atol = ATOL;
+
+ ctx->logger.log = tdi_log_default_callback;
+
+ ret = tdi_nlsolve_context_init(ctx);
+ if (ret < 0)
+ goto finish;
+
+ memset(coeffs, 0, sizeof(coeffs));
+
+ ret = tdi_nlsolve_solve(ctx, scalar_quadratic_func, scalar_quadratic_jac, basis, coeffs);
+ if (ret < 0)
+ goto finish;
+
+ for (int j = 0; j < N_Z; j++)
+ for (int i = 0; i < N_X; i++) {
+ const int idx = j * N_X + i;
+ const double val = (i == FUNC_X && j == FUNC_Z) ? 1.0 : 0.0;
+
+ if (fabs(coeffs[idx] - val) > TOL) {
+ fprintf(stderr, "unexpected value %g at %d/%d\n",
+ coeffs[idx], i, j);
+ ret = -1;
+ goto finish;
+ }
+ }
+
+finish:
+ tdi_nlsolve_context_free(&ctx);
+ tdi_basis_free(&basis);
+
+ return ret;
+}
+#undef N_X
+#undef N_Z
+#undef FUNC_X
+#undef FUNC_Z
+#undef TOL
+#undef ITER
+
+static const struct {
+ const char *name;
+ int (*test)(void);
+} tests[] = {
+ { "scalar_linear", scalar_linear },
+ { "scalar_quadratic", scalar_quadratic },
+};
+
+int main(void)
+{
+ int ret = 0;
+
+ for (int i = 0; i < ARRAY_ELEMS(tests); i++) {
+ fprintf(stderr, "executing test '%s'\n", tests[i].name);
+ ret = tests[i].test();
+ if (ret < 0) {
+ fprintf(stderr, "test '%s' failed\n", tests[i].name);
+ return -1;
+ }
+ fprintf(stderr, "test '%s' succeeded\n", tests[i].name);
+ }
+
+ return 0;
+}