aboutsummaryrefslogtreecommitdiff
path: root/pssolve.c
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2017-11-16 13:11:07 +0100
committerAnton Khirnov <anton@khirnov.net>2017-11-19 16:30:14 +0100
commit0007a0b0c11fa7c12b228883453368f105a4324b (patch)
treea5ac8016c58c13668bf87931dd921bea933cf07f /pssolve.c
Initial commit.
The following code is present: * the basis API * the BiCGSTAB solver * the pseudospectral linear system solver * helper APIs: - threadpool - logging - cpuid
Diffstat (limited to 'pssolve.c')
-rw-r--r--pssolve.c521
1 files changed, 521 insertions, 0 deletions
diff --git a/pssolve.c b/pssolve.c
new file mode 100644
index 0000000..f2288c2
--- /dev/null
+++ b/pssolve.c
@@ -0,0 +1,521 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include "bicgstab.h"
+#include "common.h"
+#include "log.h"
+#include "pssolve.h"
+#include "threadpool.h"
+
+#define NB_COEFFS(eq_ctx) ((eq_ctx)->nb_coeffs[0] * (eq_ctx)->nb_coeffs[1])
+#define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1])
+
+typedef struct PSEquationContext {
+ size_t nb_coeffs[2];
+ size_t nb_colloc_points[2];
+ unsigned int colloc_grid_order[2];
+
+ double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB];
+ double *mat;
+} PSEquationContext;
+
+struct PSSolvePriv {
+ BiCGStabContext *bicgstab;
+ int steps_since_inverse;
+
+ size_t nb_coeffs;
+
+ PSEquationContext *eqs;
+
+ int *ipiv;
+ double *mat;
+
+ ThreadPoolContext *tp;
+ ThreadPoolContext *tp_internal;
+};
+
+typedef struct ConstructMatrixThread {
+ const PSEquationContext *eq_ctx;
+ const double **eq_coeffs;
+ double *mat;
+ ptrdiff_t mat_stride;
+ unsigned int var_idx;
+} ConstructMatrixThread;
+
+static void construct_matrix(void *arg,
+ unsigned int job_idx, unsigned int nb_jobs,
+ unsigned int thread_idx, unsigned int nb_threads)
+{
+ ConstructMatrixThread *cmt = arg;
+ const PSEquationContext *eq_ctx = cmt->eq_ctx;
+ const double **eq_coeffs = cmt->eq_coeffs;
+ double *mat = cmt->mat;
+ ptrdiff_t mat_stride = cmt->mat_stride;
+ unsigned int var_idx = cmt->var_idx;
+ unsigned int idx_coeff = job_idx;
+
+ for (int idx_grid = 0; idx_grid < NB_COLLOC_POINTS(eq_ctx); idx_grid++) {
+ const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff;
+ double val = 0.0;
+
+ for (int i = 0; i < PSSOLVE_DIFF_ORDER_NB; i++)
+ val += eq_coeffs[i][idx_grid] * eq_ctx->basis_val[var_idx][i][idx];
+
+ mat[idx_grid + mat_stride * idx_coeff] = val;
+ }
+}
+
+static int lu_invert(TDLogger *logger, const int N, double *mat, double *rhs, int *ipiv)
+{
+ char equed = 'N';
+ double cond, ferr, berr, rpivot;
+
+ double *mat_f, *x;
+ int ret = 0;
+
+#if 0
+ LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+ mat, N, ipiv, rhs, N);
+ LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+#else
+ mat_f = malloc(SQR(N) * sizeof(*mat_f));
+ x = malloc(N * sizeof(*x));
+
+ //{
+ // int i, j;
+ // for (i = 0; i < N; i++) {
+ // for (j = 0; j < N; j++)
+ // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
+ // fprintf(stderr, "\n");
+ // }
+ //}
+ //{
+ // double *mat_copy = malloc(SQR(N) * sizeof(double));
+ // double *svd = malloc(N * sizeof(double));
+ // double *rhs_copy = malloc(N * sizeof(double));
+ // int rank;
+
+ // memcpy(mat_copy, mat, SQR(N) * sizeof(double));
+ // memcpy(rhs_copy, rhs, N * sizeof(double));
+
+ // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
+ // svd, 1e-13, &rank);
+
+ // free(mat_copy);
+ // for (int i = 0; i < N; i++) {
+ // if (i > 5 && i < N - 5)
+ // continue;
+
+ // fprintf(stderr, "%g\t", svd[i]);
+ // }
+ // fprintf(stderr, "\n rank %d\n", rank);
+ // free(svd);
+ // free(rhs_copy);
+
+ // if (rank < N)
+ // ret = 1;
+ //}
+
+ //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+ // mat, N, ipiv, rhs, N);
+ LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+ mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+ rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+ LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+ memcpy(rhs, x, N * sizeof(double));
+ memcpy(mat, mat_f, SQR(N) * sizeof(double));
+
+ tdi_log(logger, 1, "LU factorization solution to a %zdx%zd matrix: "
+ "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+ N, N, cond, ferr, berr);
+
+ free(mat_f);
+ free(x);
+#endif
+
+ return ret;
+}
+
+int tdi_pssolve_solve(PSSolveContext *ctx,
+ const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB],
+ const double *rhs, double *coeffs)
+{
+ PSSolvePriv *s = ctx->priv;
+ double rhs_max;
+ int64_t start;
+
+ int ret = 0;
+
+ /* fill the matrix */
+ start = gettime();
+
+ for (int i = 0; i < ctx->nb_equations; i++) {
+ PSEquationContext *eq_ctx = &s->eqs[i];
+ double *mat = s->eqs[i].mat;
+
+ for (int j = 0; j < ctx->nb_equations; j++) {
+ ConstructMatrixThread thread = {
+ .eq_ctx = eq_ctx,
+ .eq_coeffs = eq_coeffs[i][j],
+ .mat = mat,
+ .mat_stride = s->nb_coeffs,
+ .var_idx = j,
+ };
+ tdi_threadpool_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix,
+ &thread);
+ mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs;
+ }
+ }
+
+ ctx->construct_matrix_time += gettime() - start;
+ ctx->construct_matrix_count++;
+
+#if 0
+ if (rhs_max < EPS) {
+ fprintf(stderr, "zero rhs\n");
+ memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
+ if (ms->cl_queue) {
+ clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
+ ms->coeffs, 0, NULL, NULL);
+ }
+ return 0;
+ }
+#endif
+
+ /* solve for the coeffs */
+ if (s->steps_since_inverse < 1024) {
+ int64_t start;
+
+ start = gettime();
+
+ ret = tdi_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs);
+
+ if (ret >= 0) {
+ ctx->cg_time_total += gettime() - start;
+ ctx->cg_solve_count++;
+ ctx->cg_iter_count += ret + 1;
+ s->steps_since_inverse++;
+
+ }
+ } else
+ ret = -1;
+
+ if (ret < 0) {
+ int64_t start;
+
+ start = gettime();
+
+ memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs));
+
+ ret = lu_invert(&ctx->logger, s->nb_coeffs, s->mat, coeffs, s->ipiv);
+ ctx->lu_solves_time += gettime() - start;
+ ctx->lu_solves_count++;
+
+ ret = tdi_bicgstab_init(s->bicgstab, s->mat, coeffs);
+
+ s->steps_since_inverse = 0;
+ }
+
+ return ret;
+}
+
+static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx)
+{
+ PSSolvePriv *s = ctx->priv;
+ PSEquationContext *eq_ctx = &s->eqs[eq_idx];
+ int ret;
+
+ eq_ctx->basis_val = calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val));
+ if (!eq_ctx->basis_val)
+ return -ENOMEM;
+
+ for (int i = 0; i < ctx->nb_equations; i++) {
+ double *basis_val[2][3] = { { NULL } };
+
+ /* for each direction, compute the corresponding basis values/derivatives */
+ for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) {
+ for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) {
+ ret = posix_memalign((void**)&basis_val[dir][diff_order], 32,
+ sizeof(*basis_val[dir][diff_order]) * s->eqs[i].nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir]);
+ if (ret) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) {
+ double coord = ctx->colloc_grid[eq_idx][dir][k];
+ for (int l = 0; l < s->eqs[i].nb_coeffs[dir]; l++) {
+ basis_val[dir][0][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_VALUE, coord, l);
+ basis_val[dir][1][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF1, coord, l);
+ basis_val[dir][2][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF2, coord, l);
+ }
+ }
+ }
+
+ for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) {
+ ret = posix_memalign((void**)&eq_ctx->basis_val[i][diff], 32,
+ NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(eq_ctx) * sizeof(*eq_ctx->basis_val[i][diff]));
+ if (ret) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) {
+ const double *basis1 = basis_val[1][0] + j * s->eqs[i].nb_coeffs[1];
+ const double *dbasis1 = basis_val[1][1] + j * s->eqs[i].nb_coeffs[1];
+ const double *d2basis1 = basis_val[1][2] + j * s->eqs[i].nb_coeffs[1];
+
+ for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) {
+ const double *basis0 = basis_val[0][0] + k * s->eqs[i].nb_coeffs[0];
+ const double *dbasis0 = basis_val[0][1] + k * s->eqs[i].nb_coeffs[0];
+ const double *d2basis0 = basis_val[0][2] + k * s->eqs[i].nb_coeffs[0];
+
+ const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k;
+
+ for (int l = 0; l < s->eqs[i].nb_coeffs[1]; l++)
+ for (int m = 0; m < s->eqs[i].nb_coeffs[0]; m++) {
+ const int idx_coeff = l * s->eqs[i].nb_coeffs[0] + m;
+ const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff;
+
+ eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] = basis0[m] * basis1[l];
+ eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] = dbasis0[m] * basis1[l];
+ eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] = basis0[m] * dbasis1[l];
+ eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] * basis1[l];
+ eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] = basis0[m] * d2basis1[l];
+ eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] = dbasis0[m] * dbasis1[l];
+ }
+ }
+ }
+
+fail:
+ for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++)
+ for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++)
+ free(basis_val[dir][diff]);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+int tdi_pssolve_context_init(PSSolveContext *ctx)
+{
+ PSSolvePriv *s = ctx->priv;
+ size_t N = 0;
+
+ int ret = 0;
+
+ if (ctx->tp) {
+ s->tp = ctx->tp;
+ } else {
+ ret = tdi_threadpool_init(&s->tp_internal, 1);
+ if (ret < 0)
+ return ret;
+ s->tp = s->tp_internal;
+ }
+
+ /* sanity check the parameters */
+ for (int i = 0; i < ctx->nb_equations; i++) {
+ if (!ctx->basis[i][0] || !ctx->basis[i][1]) {
+ tdi_log(&ctx->logger, 0, "Basis set for variable %d not set\n", i);
+ return -EINVAL;
+ }
+ if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) {
+ tdi_log(&ctx->logger, 0, "Solver order for variable %d not set\n", i);
+ return -EINVAL;
+ }
+
+ N += ctx->solve_order[i][0] * ctx->solve_order[i][1];
+ }
+
+ ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * N);
+ ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * N * N);
+ if (ret)
+ return -ENOMEM;
+
+ s->nb_coeffs = N;
+
+ ctx->colloc_grid = calloc(ctx->nb_equations, sizeof(*ctx->colloc_grid));
+ if (!ctx->colloc_grid)
+ return -ENOMEM;
+
+ /* initialize the per-equation state */
+ for (int i = 0; i < ctx->nb_equations; i++) {
+ PSEquationContext *eq_ctx = &s->eqs[i];
+
+ eq_ctx->nb_coeffs[0] = ctx->solve_order[i][0];
+ eq_ctx->nb_coeffs[1] = ctx->solve_order[i][1];
+ eq_ctx->nb_colloc_points[0] = ctx->solve_order[i][0];
+ eq_ctx->nb_colloc_points[1] = ctx->solve_order[i][1];
+ eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0];
+ eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1];
+
+ if (i == 0)
+ eq_ctx->mat = s->mat;
+ else
+ eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]);
+
+ /* compute the collocation grid */
+ posix_memalign((void**)&ctx->colloc_grid[i][0], 32, eq_ctx->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[i][0]));
+ posix_memalign((void**)&ctx->colloc_grid[i][1], 32, eq_ctx->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[i][1]));
+ if (!ctx->colloc_grid[i][0] || !ctx->colloc_grid[i][1])
+ return -ENOMEM;
+
+ for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++)
+ ctx->colloc_grid[i][0][j] = tdi_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j);
+ for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++)
+ ctx->colloc_grid[i][1][j] = tdi_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j);
+
+ }
+
+ /* precompute the basis values we will need */
+ for (int i = 0; i < ctx->nb_equations; i++) {
+ ret = basis_val_init(ctx, i);
+ if (ret < 0)
+ return ret;
+ }
+
+ s->steps_since_inverse = INT_MAX;
+
+ /* init the BiCGStab solver */
+ ret = tdi_bicgstab_context_alloc(&s->bicgstab, N, ctx->ocl_ctx, ctx->ocl_queue);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int tdi_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations)
+{
+ PSSolveContext *ctx;
+
+ if (!nb_equations)
+ return -EINVAL;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->nb_equations = nb_equations;
+
+ ctx->priv = calloc(1, sizeof(*ctx->priv));
+ if (!ctx->priv)
+ goto fail;
+
+ ctx->priv->eqs = calloc(nb_equations, sizeof(*ctx->priv->eqs));
+ if (!ctx->priv->eqs)
+ goto fail;
+
+ ctx->basis = calloc(nb_equations, sizeof(*ctx->basis));
+ if (!ctx->basis)
+ goto fail;
+
+ ctx->solve_order = calloc(nb_equations, sizeof(*ctx->solve_order));
+ if (!ctx->solve_order)
+ goto fail;
+
+ *pctx = ctx;
+ return 0;
+fail:
+ tdi_pssolve_context_free(&ctx);
+ return -ENOMEM;
+}
+
+void tdi_pssolve_context_free(PSSolveContext **pctx)
+{
+ PSSolveContext *ctx = *pctx;
+
+ if (!ctx)
+ return;
+
+ if (ctx->priv) {
+ if (ctx->priv->eqs) {
+ for (int i = 0; i < ctx->nb_equations; i++) {
+ PSEquationContext *eq_ctx = &ctx->priv->eqs[i];
+
+ if (eq_ctx->basis_val) {
+ for (int j = 0; j < ctx->nb_equations; j++)
+ for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++)
+ free(eq_ctx->basis_val[j][k]);
+ }
+ free(eq_ctx->basis_val);
+ }
+ }
+
+ free(ctx->priv->eqs);
+
+ free(ctx->priv->ipiv);
+ free(ctx->priv->mat);
+
+ tdi_bicgstab_context_free(&ctx->priv->bicgstab);
+ tdi_threadpool_free(&ctx->priv->tp_internal);
+ }
+
+ free(ctx->priv);
+
+ if (ctx->colloc_grid) {
+ for (int i = 0; i < ctx->nb_equations; i++)
+ for (int j = 0; j < ARRAY_ELEMS(ctx->colloc_grid[i]); j++)
+ free(ctx->colloc_grid[i][j]);
+ }
+
+ free(ctx->colloc_grid);
+
+ free(ctx->basis);
+ free(ctx->solve_order);
+
+ free(ctx);
+ *pctx = NULL;
+}
+
+int tdi_pssolve_diff_order(enum PSSolveDiffOrder order, unsigned int dir)
+{
+ if (dir == 0) {
+ switch (order) {
+ case PSSOLVE_DIFF_ORDER_00:
+ case PSSOLVE_DIFF_ORDER_01:
+ case PSSOLVE_DIFF_ORDER_02: return 0;
+ case PSSOLVE_DIFF_ORDER_10:
+ case PSSOLVE_DIFF_ORDER_11: return 1;
+ case PSSOLVE_DIFF_ORDER_20: return 2;
+ }
+ } else if (dir == 1) {
+ switch (order) {
+ case PSSOLVE_DIFF_ORDER_00:
+ case PSSOLVE_DIFF_ORDER_10:
+ case PSSOLVE_DIFF_ORDER_20: return 0;
+ case PSSOLVE_DIFF_ORDER_01:
+ case PSSOLVE_DIFF_ORDER_11: return 1;
+ case PSSOLVE_DIFF_ORDER_02: return 2;
+ }
+ }
+ return -1;
+}