diff options
Diffstat (limited to 'src/pssolve.c')
-rw-r--r-- | src/pssolve.c | 389 |
1 files changed, 389 insertions, 0 deletions
diff --git a/src/pssolve.c b/src/pssolve.c new file mode 100644 index 0000000..9d6124c --- /dev/null +++ b/src/pssolve.c @@ -0,0 +1,389 @@ +/* + * Pseudospectral 2nd order 2D linear PDE solver + * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <cblas.h> +#include <lapacke.h> + +#include "bicgstab.h" +#include "pssolve.h" + +#define NB_COEFFS(priv) (priv->nb_coeffs[0] * priv->nb_coeffs[1]) +#define NB_COLLOC_POINTS(priv) (priv->nb_colloc_points[0] * priv->nb_colloc_points[1]) + +struct PSSolvePriv { + BiCGStabContext *bicgstab; + int steps_since_inverse; + + int nb_coeffs[2]; + int nb_colloc_points[2]; + int colloc_grid_order[2]; + + double *basis_val[PSSOLVE_DIFF_ORDER_NB]; + + int *ipiv; + double *mat; +}; + +static int construct_matrix(PSSolveContext *ctx, + const double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB]) +{ + double *mat = ctx->priv->mat; + int idx_coeff, idx_grid; + +#pragma omp parallel for simd + for (idx_coeff = 0; idx_coeff < NB_COEFFS(ctx->priv); idx_coeff++) + for (idx_grid = 0; idx_grid < NB_COLLOC_POINTS(ctx->priv); idx_grid++) { + const int idx = idx_grid + NB_COLLOC_POINTS(ctx->priv) * idx_coeff; + double val = 0.0; + + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++) + val += eq_coeffs[i][idx_grid] * ctx->priv->basis_val[i][idx]; + + mat[idx] = val; + } + + return 0; +} + +static int lu_invert(const int N, double *mat, double *rhs, int *ipiv) +{ + char equed = 'N'; + double cond, ferr, berr, rpivot; + + double *mat_f, *x; + int ret = 0; +#if 0 + LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + mat, N, ipiv, rhs, N); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); +#else + mat_f = malloc(SQR(N) * sizeof(*mat_f)); + x = malloc(N * sizeof(*x)); + + //{ + // int i, j; + // for (i = 0; i < N; i++) { + // for (j = 0; j < N; j++) + // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); + // fprintf(stderr, "\n"); + // } + //} + //{ + // double *mat_copy = malloc(SQR(N) * sizeof(double)); + // double *svd = malloc(N * sizeof(double)); + // double *rhs_copy = malloc(N * sizeof(double)); + // int rank; + + // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); + // memcpy(rhs_copy, rhs, N * sizeof(double)); + + // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, + // svd, 1e-13, &rank); + + // free(mat_copy); + // for (int i = 0; i < N; i++) { + // if (i > 5 && i < N - 5) + // continue; + + // fprintf(stderr, "%g\t", svd[i]); + // } + // fprintf(stderr, "\n rank %d\n", rank); + // free(svd); + // free(rhs_copy); + + // if (rank < N) + // ret = 1; + //} + + //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, + // mat, N, ipiv, rhs, N); + LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, + mat, N, mat_f, N, ipiv, &equed, NULL, NULL, + rhs, N, x, N, &cond, &ferr, &berr, &rpivot); + LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); + memcpy(rhs, x, N * sizeof(double)); + memcpy(mat, mat_f, SQR(N) * sizeof(double)); + + fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: " + "condition number %16.16g; forward error %16.16g backward error %16.16g\n", + N, N, cond, ferr, berr); + + free(mat_f); + free(x); +#endif + + return ret; +} + +int ms_pssolve_solve(PSSolveContext *ctx, + const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB], + const double *rhs, double *coeffs) +{ + PSSolvePriv *s = ctx->priv; + const int N = NB_COEFFS(s); + double rhs_max; + int64_t start; + + int ret = 0; + + /* fill the matrix */ + CCTK_TimerStart("QuasiMaximalSlicing_construct_matrix"); + start = gettime(); + ret = construct_matrix(ctx, eq_coeffs); + ctx->construct_matrix_time += gettime() - start; + ctx->construct_matrix_count++; + CCTK_TimerStop("QuasiMaximalSlicing_construct_matrix"); + if (ret < 0) + return ret; + +#if 0 + if (rhs_max < EPS) { + fprintf(stderr, "zero rhs\n"); + memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); + if (ms->cl_queue) { + clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), + ms->coeffs, 0, NULL, NULL); + } + return 0; + } +#endif + + /* solve for the coeffs */ + if (s->steps_since_inverse < 1024) { + int64_t start; + + start = gettime(); + + CCTK_TimerStart("QuasiMaximalSlicing_solve_BiCGSTAB"); + ret = ms_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs); + CCTK_TimerStop("QuasiMaximalSlicing_solve_BiCGSTAB"); + + if (ret >= 0) { + ctx->cg_time_total += gettime() - start; + ctx->cg_solve_count++; + ctx->cg_iter_count += ret + 1; + s->steps_since_inverse++; + + } + } else + ret = -1; + + if (ret < 0) { + int64_t start; + + CCTK_TimerStart("QuasiMaximalSlicing_solve_LU"); + start = gettime(); + + memcpy(coeffs, rhs, N * sizeof(*rhs)); + + ret = lu_invert(N, s->mat, coeffs, s->ipiv); + ctx->lu_solves_time += gettime() - start; + ctx->lu_solves_count++; + CCTK_TimerStop("QuasiMaximalSlicing_solve_LU"); + + ret = ms_bicgstab_init(s->bicgstab, s->mat, coeffs); + + s->steps_since_inverse = 0; + } + + return ret; +} + +int ms_pssolve_context_init(PSSolveContext *ctx) +{ + PSSolvePriv *s = ctx->priv; + double *basis_val[2][3] = { { NULL } }; + + int ret = 0; + + if (ctx->solve_order[0] <= 0 || ctx->solve_order[1] <= 0) + return -EINVAL; + s->nb_coeffs[0] = ctx->solve_order[0]; + s->nb_coeffs[1] = ctx->solve_order[1]; + s->nb_colloc_points[0] = ctx->solve_order[0]; + s->nb_colloc_points[1] = ctx->solve_order[1]; + s->colloc_grid_order[0] = ctx->solve_order[0]; + s->colloc_grid_order[1] = ctx->solve_order[1]; + + s->steps_since_inverse = INT_MAX; + + /* init the BiCGStab solver */ + ret = ms_bicgstab_context_alloc(&s->bicgstab, NB_COEFFS(s), ctx->ocl_ctx, + ctx->ocl_queue); + if (ret < 0) + return ret; + + /* compute the collocation grid */ + posix_memalign((void**)&ctx->colloc_grid[0], 32, s->nb_colloc_points[0] * sizeof(*ctx->colloc_grid[0])); + posix_memalign((void**)&ctx->colloc_grid[1], 32, s->nb_colloc_points[1] * sizeof(*ctx->colloc_grid[1])); + if (!ctx->colloc_grid[0] || !ctx->colloc_grid[1]) + return -ENOMEM; + + for (int i = 0; i < s->nb_colloc_points[0]; i++) + ctx->colloc_grid[0][i] = ctx->basis[0]->colloc_point(s->colloc_grid_order[0], i); + for (int i = 0; i < s->nb_colloc_points[1]; i++) + ctx->colloc_grid[1][i] = ctx->basis[1]->colloc_point(s->colloc_grid_order[1], i); + + /* precompute the basis values we will need */ + for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) { + for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) { + int ret = posix_memalign((void**)&basis_val[i][j], 32, + sizeof(*basis_val[i][j]) * s->nb_coeffs[i] * s->nb_colloc_points[i]); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int j = 0; j < s->nb_colloc_points[i]; j++) { + double coord = ctx->colloc_grid[i][j]; + for (int k = 0; k < s->nb_coeffs[i]; k++) { + basis_val[i][0][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval (coord, k); + basis_val[i][1][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff1(coord, k); + basis_val[i][2][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff2(coord, k); + } + } + } + + for (int i = 0; i < ARRAY_ELEMS(s->basis_val); i++) { + ret = posix_memalign((void**)&s->basis_val[i], 32, NB_COLLOC_POINTS(s) * NB_COEFFS(s) * sizeof(*s->basis_val[i])); + if (ret) { + ret = -ENOMEM; + goto fail; + } + } + + for (int i = 0; i < s->nb_colloc_points[1]; i++) { + const double *basis_z = basis_val[1][0] + i * s->nb_coeffs[1]; + const double *dbasis_z = basis_val[1][1] + i * s->nb_coeffs[1]; + const double *d2basis_z = basis_val[1][2] + i * s->nb_coeffs[1]; + + for (int j = 0; j < s->nb_colloc_points[0]; j++) { + const double *basis_x = basis_val[0][0] + j * s->nb_coeffs[0]; + const double *dbasis_x = basis_val[0][1] + j * s->nb_coeffs[0]; + const double *d2basis_x = basis_val[0][2] + j * s->nb_coeffs[0]; + const int idx_grid = i * s->nb_colloc_points[0] + j; + +#if MS_POLAR + double r = ctx->colloc_grid[0][j]; + double theta = ctx->colloc_grid[1][i]; + + double x = r * cos(theta); + double z = r * sin(theta); +#else + double x = ctx->colloc_grid[0][j]; + double z = ctx->colloc_grid[1][i]; +#endif + + for (int k = 0; k < s->nb_coeffs[1]; k++) + for (int l = 0; l < s->nb_coeffs[0]; l++) { + const int idx_coeff = k * s->nb_coeffs[0] + l; + const int idx = idx_grid + NB_COLLOC_POINTS(s) * idx_coeff; + s->basis_val[PSSOLVE_DIFF_ORDER_00][idx] = basis_x[l] * basis_z[k]; +#if MS_POLAR + s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k] + + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k] + + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k] + - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] = ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k] + + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k] + - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k] + + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); + s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] = ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k] + - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k] + - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k] + + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0); +#else + s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] = dbasis_x[l] * basis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] = basis_x[l] * dbasis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = d2basis_x[l] * basis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] = basis_x[l] * d2basis_z[k]; + s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] = dbasis_x[l] * dbasis_z[k]; +#endif + } + } + } + + ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * NB_COEFFS(s)); + ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * NB_COEFFS(s) * NB_COLLOC_POINTS(s)); + if (ret) { + ret = -ENOMEM; + goto fail; + } + +fail: + for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) + for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) + free(basis_val[i][j]); + + return ret; +} + +int ms_pssolve_context_alloc(PSSolveContext **pctx) +{ + PSSolveContext *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) + return -ENOMEM; + + ctx->priv = calloc(1, sizeof(*ctx->priv)); + if (!ctx->priv) + goto fail; + + *pctx = ctx; + return 0; +fail: + ms_pssolve_context_free(&ctx); + return -ENOMEM; +} + +void ms_pssolve_context_free(PSSolveContext **pctx) +{ + PSSolveContext *ctx = *pctx; + + if (!ctx) + return; + + if (ctx->priv) { + for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++) + free(ctx->priv->basis_val[i]); + + free(ctx->priv->ipiv); + free(ctx->priv->mat); + + ms_bicgstab_context_free(&ctx->priv->bicgstab); + } + + free(ctx->priv); + + free(ctx->colloc_grid[0]); + free(ctx->colloc_grid[1]); + + free(ctx); + *pctx = NULL; +} |