From 0007a0b0c11fa7c12b228883453368f105a4324b Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Thu, 16 Nov 2017 13:11:07 +0100 Subject: Initial commit. The following code is present: * the basis API * the BiCGSTAB solver * the pseudospectral linear system solver * helper APIs: - threadpool - logging - cpuid --- pssolve.c | 521 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 521 insertions(+) create mode 100644 pssolve.c (limited to 'pssolve.c') diff --git a/pssolve.c b/pssolve.c new file mode 100644 index 0000000..f2288c2 --- /dev/null +++ b/pssolve.c @@ -0,0 +1,521 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bicgstab.h" +#include "common.h" +#include "log.h" +#include "pssolve.h" +#include "threadpool.h" + +#define NB_COEFFS(eq_ctx) ((eq_ctx)->nb_coeffs[0] * (eq_ctx)->nb_coeffs[1]) +#define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1]) + +typedef struct PSEquationContext { + size_t nb_coeffs[2]; + size_t nb_colloc_points[2]; + unsigned int colloc_grid_order[2]; + + double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB]; + double *mat; +} PSEquationContext; + +struct PSSolvePriv { + BiCGStabContext *bicgstab; + int steps_since_inverse; + + size_t nb_coeffs; + + PSEquationContext *eqs; + + int *ipiv; + double *mat; + + ThreadPoolContext *tp; + ThreadPoolContext *tp_internal; +}; + +typedef struct ConstructMatrixThread { + const PSEquationContext *eq_ctx; + const double **eq_coeffs; + double *mat; + ptrdiff_t mat_stride; + unsigned int var_idx; +} ConstructMatrixThread; + +static void construct_matrix(void *arg, + unsigned int job_idx, unsigned int nb_jobs, + unsigned int thread_idx, unsigned int nb_threads) +{ + ConstructMatrixThread *cmt = arg; + const PSEquationContext *eq_ctx = cmt->eq_ctx; + const double **eq_coeffs = cmt->eq_coeffs; + double *mat = cmt->mat; + ptrdiff_t mat_stride = cmt->mat_stride; + unsigned int var_idx = cmt->var_idx; + unsigned int idx_coeff = job_idx; + + for (int idx_grid = 0; idx_grid < NB_COLLOC_POINTS(eq_ctx); idx_grid++) { + const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; + double val = 0.0; + + for (int i = 0; i < PSSOLVE_DIFF_ORDER_NB; i++) + val += eq_coeffs[i][idx_grid] * eq_ctx->basis_val[var_idx][i][idx]; + + mat[idx_grid + mat_stride * idx_coeff] = val; + } +} + +static int lu_invert(TDLogger *logger, const int N, double *mat, double *rhs, int *ipiv) +{ + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; + +#if 0 + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + //{ + // double *mat_copy = malloc(SQR(N) * sizeof(double)); + // double *svd = malloc(N * sizeof(double)); + // double *rhs_copy = malloc(N * sizeof(double)); + // int rank; + + // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + // memcpy(rhs_copy, rhs, N * sizeof(double)); + + // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + // svd, 1e-13, &rank); + + // free(mat_copy); + // for (int i = 0; i < N; i++) { + // if (i > 5 && i < N - 5) + // continue; + + // fprintf(stderr, "%g\t", svd[i]); + // } + // fprintf(stderr, "\n rank %d\n", rank); + // free(svd); + // free(rhs_copy); + + // if (rank < N) + // ret = 1; + //} + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + tdi_log(logger, 1, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; +} + +int tdi_pssolve_solve(PSSolveContext *ctx, + const double *(**eq_coeffs)[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs) +{ + PSSolvePriv *s = ctx->priv; + double rhs_max; + int64_t start; + + int ret = 0; + + /* fill the matrix */ + start = gettime(); + + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &s->eqs[i]; + double *mat = s->eqs[i].mat; + + for (int j = 0; j < ctx->nb_equations; j++) { + ConstructMatrixThread thread = { + .eq_ctx = eq_ctx, + .eq_coeffs = eq_coeffs[i][j], + .mat = mat, + .mat_stride = s->nb_coeffs, + .var_idx = j, + }; + tdi_threadpool_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix, + &thread); + mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs; + } + } + + ctx->construct_matrix_time += gettime() - start; + ctx->construct_matrix_count++; + +#if 0 + if (rhs_max < EPS) { + fprintf(stderr, "zero rhs\n"); + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } +#endif + + /* solve for the coeffs */ + if (s->steps_since_inverse < 1024) { + int64_t start; + + start = gettime(); + + ret = tdi_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs); + + if (ret >= 0) { + ctx->cg_time_total += gettime() - start; + ctx->cg_solve_count++; + ctx->cg_iter_count += ret + 1; + s->steps_since_inverse++; + + } + } else + ret = -1; + + if (ret < 0) { + int64_t start; + + start = gettime(); + + memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs)); + + ret = lu_invert(&ctx->logger, s->nb_coeffs, s->mat, coeffs, s->ipiv); + ctx->lu_solves_time += gettime() - start; + ctx->lu_solves_count++; + + ret = tdi_bicgstab_init(s->bicgstab, s->mat, coeffs); + + s->steps_since_inverse = 0; + } + + return ret; +} + +static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx) +{ + PSSolvePriv *s = ctx->priv; + PSEquationContext *eq_ctx = &s->eqs[eq_idx]; + int ret; + + eq_ctx->basis_val = calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val)); + if (!eq_ctx->basis_val) + return -ENOMEM; + + for (int i = 0; i < ctx->nb_equations; i++) { + double *basis_val[2][3] = { { NULL } }; + + /* for each direction, compute the corresponding basis values/derivatives */ + for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) { + for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) { + ret = posix_memalign((void**)&basis_val[dir][diff_order], 32, + sizeof(*basis_val[dir][diff_order]) * s->eqs[i].nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir]); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) { + double coord = ctx->colloc_grid[eq_idx][dir][k]; + for (int l = 0; l < s->eqs[i].nb_coeffs[dir]; l++) { + basis_val[dir][0][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_VALUE, coord, l); + basis_val[dir][1][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF1, coord, l); + basis_val[dir][2][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF2, coord, l); + } + } + } + + for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) { + ret = posix_memalign((void**)&eq_ctx->basis_val[i][diff], 32, + NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(eq_ctx) * sizeof(*eq_ctx->basis_val[i][diff])); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) { + const double *basis1 = basis_val[1][0] + j * s->eqs[i].nb_coeffs[1]; + const double *dbasis1 = basis_val[1][1] + j * s->eqs[i].nb_coeffs[1]; + const double *d2basis1 = basis_val[1][2] + j * s->eqs[i].nb_coeffs[1]; + + for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) { + const double *basis0 = basis_val[0][0] + k * s->eqs[i].nb_coeffs[0]; + const double *dbasis0 = basis_val[0][1] + k * s->eqs[i].nb_coeffs[0]; + const double *d2basis0 = basis_val[0][2] + k * s->eqs[i].nb_coeffs[0]; + + const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k; + + for (int l = 0; l < s->eqs[i].nb_coeffs[1]; l++) + for (int m = 0; m < s->eqs[i].nb_coeffs[0]; m++) { + const int idx_coeff = l * s->eqs[i].nb_coeffs[0] + m; + const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; + + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] = basis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] = dbasis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] = basis0[m] * dbasis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] * basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] = basis0[m] * d2basis1[l]; + eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] = dbasis0[m] * dbasis1[l]; + } + } + } + +fail: + for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) + for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++) + free(basis_val[dir][diff]); + if (ret < 0) + return ret; + } + + return 0; +} + +int tdi_pssolve_context_init(PSSolveContext *ctx) +{ + PSSolvePriv *s = ctx->priv; + size_t N = 0; + + int ret = 0; + + if (ctx->tp) { + s->tp = ctx->tp; + } else { + ret = tdi_threadpool_init(&s->tp_internal, 1); + if (ret < 0) + return ret; + s->tp = s->tp_internal; + } + + /* sanity check the parameters */ + for (int i = 0; i < ctx->nb_equations; i++) { + if (!ctx->basis[i][0] || !ctx->basis[i][1]) { + tdi_log(&ctx->logger, 0, "Basis set for variable %d not set\n", i); + return -EINVAL; + } + if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) { + tdi_log(&ctx->logger, 0, "Solver order for variable %d not set\n", i); + return -EINVAL; + } + + N += ctx->solve_order[i][0] * ctx->solve_order[i][1]; + } + + ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * N); + ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * N * N); + if (ret) + return -ENOMEM; + + s->nb_coeffs = N; + + ctx->colloc_grid = calloc(ctx->nb_equations, sizeof(*ctx->colloc_grid)); + if (!ctx->colloc_grid) + return -ENOMEM; + + /* initialize the per-equation state */ + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &s->eqs[i]; + + eq_ctx->nb_coeffs[0] = ctx->solve_order[i][0]; + eq_ctx->nb_coeffs[1] = ctx->solve_order[i][1]; + eq_ctx->nb_colloc_points[0] = ctx->solve_order[i][0]; + eq_ctx->nb_colloc_points[1] = ctx->solve_order[i][1]; + eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0]; + eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1]; + + if (i == 0) + eq_ctx->mat = s->mat; + else + eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]); + + /* compute the collocation grid */ + posix_memalign((void**)&ctx->colloc_grid[i][0], 32, eq_ctx->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[i][0])); + posix_memalign((void**)&ctx->colloc_grid[i][1], 32, eq_ctx->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[i][1])); + if (!ctx->colloc_grid[i][0] || !ctx->colloc_grid[i][1]) + return -ENOMEM; + + for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++) + ctx->colloc_grid[i][0][j] = tdi_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j); + for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) + ctx->colloc_grid[i][1][j] = tdi_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j); + + } + + /* precompute the basis values we will need */ + for (int i = 0; i < ctx->nb_equations; i++) { + ret = basis_val_init(ctx, i); + if (ret < 0) + return ret; + } + + s->steps_since_inverse = INT_MAX; + + /* init the BiCGStab solver */ + ret = tdi_bicgstab_context_alloc(&s->bicgstab, N, ctx->ocl_ctx, ctx->ocl_queue); + if (ret < 0) + return ret; + + return 0; +} + +int tdi_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations) +{ + PSSolveContext *ctx; + + if (!nb_equations) + return -EINVAL; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->nb_equations = nb_equations; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + + ctx->priv->eqs = calloc(nb_equations, sizeof(*ctx->priv->eqs)); + if (!ctx->priv->eqs) + goto fail; + + ctx->basis = calloc(nb_equations, sizeof(*ctx->basis)); + if (!ctx->basis) + goto fail; + + ctx->solve_order = calloc(nb_equations, sizeof(*ctx->solve_order)); + if (!ctx->solve_order) + goto fail; + + *pctx = ctx; + return 0; +fail: + tdi_pssolve_context_free(&ctx); + return -ENOMEM; +} + +void tdi_pssolve_context_free(PSSolveContext **pctx) +{ + PSSolveContext *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + if (ctx->priv->eqs) { + for (int i = 0; i < ctx->nb_equations; i++) { + PSEquationContext *eq_ctx = &ctx->priv->eqs[i]; + + if (eq_ctx->basis_val) { + for (int j = 0; j < ctx->nb_equations; j++) + for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++) + free(eq_ctx->basis_val[j][k]); + } + free(eq_ctx->basis_val); + } + } + + free(ctx->priv->eqs); + + free(ctx->priv->ipiv); + free(ctx->priv->mat); + + tdi_bicgstab_context_free(&ctx->priv->bicgstab); + tdi_threadpool_free(&ctx->priv->tp_internal); + } + + free(ctx->priv); + + if (ctx->colloc_grid) { + for (int i = 0; i < ctx->nb_equations; i++) + for (int j = 0; j < ARRAY_ELEMS(ctx->colloc_grid[i]); j++) + free(ctx->colloc_grid[i][j]); + } + + free(ctx->colloc_grid); + + free(ctx->basis); + free(ctx->solve_order); + + free(ctx); + *pctx = NULL; +} + +int tdi_pssolve_diff_order(enum PSSolveDiffOrder order, unsigned int dir) +{ + if (dir == 0) { + switch (order) { + case PSSOLVE_DIFF_ORDER_00: + case PSSOLVE_DIFF_ORDER_01: + case PSSOLVE_DIFF_ORDER_02: return 0; + case PSSOLVE_DIFF_ORDER_10: + case PSSOLVE_DIFF_ORDER_11: return 1; + case PSSOLVE_DIFF_ORDER_20: return 2; + } + } else if (dir == 1) { + switch (order) { + case PSSOLVE_DIFF_ORDER_00: + case PSSOLVE_DIFF_ORDER_10: + case PSSOLVE_DIFF_ORDER_20: return 0; + case PSSOLVE_DIFF_ORDER_01: + case PSSOLVE_DIFF_ORDER_11: return 1; + case PSSOLVE_DIFF_ORDER_02: return 2; + } + } + return -1; +} -- cgit v1.2.3