/* * Pseudospectral 2nd order 2D linear PDE solver * Copyright (C) 2016 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bicgstab.h" #include "common.h" #include "log.h" #include "pssolve.h" #define NB_COEFFS(eq_ctx) ((eq_ctx)->nb_coeffs[0] * (eq_ctx)->nb_coeffs[1]) #define NB_COLLOC_POINTS(eq_ctx) ((eq_ctx)->nb_colloc_points[0] * (eq_ctx)->nb_colloc_points[1]) typedef struct PSEquationContext { size_t nb_coeffs[2]; size_t nb_colloc_points[2]; unsigned int colloc_grid_order[2]; double *(*basis_val)[PSSOLVE_DIFF_ORDER_NB]; double *mat; } PSEquationContext; struct PSSolvePriv { BiCGStabContext *bicgstab; int steps_since_inverse; size_t nb_coeffs; PSEquationContext *eqs; int *ipiv; double *mat; // same as public colloc_grid, except non-const double *(*colloc_grid)[2]; TPContext *tp; TPContext *tp_internal; }; typedef struct ConstructMatrixThread { const PSEquationContext *eq_ctx; const double * const *eq_coeffs; double *mat; ptrdiff_t mat_stride; unsigned int var_idx; } ConstructMatrixThread; static int construct_matrix(void *arg, unsigned int job_idx, unsigned int thread_idx) { ConstructMatrixThread *cmt = arg; const PSEquationContext *eq_ctx = cmt->eq_ctx; const double * const *eq_coeffs = cmt->eq_coeffs; double *mat = cmt->mat; ptrdiff_t mat_stride = cmt->mat_stride; unsigned int var_idx = cmt->var_idx; unsigned int idx_coeff = job_idx; for (int idx_grid = 0; idx_grid < NB_COLLOC_POINTS(eq_ctx); idx_grid++) { const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; double val = 0.0; for (int i = 0; i < PSSOLVE_DIFF_ORDER_NB; i++) val += eq_coeffs[i][idx_grid] * eq_ctx->basis_val[var_idx][i][idx]; mat[idx_grid + mat_stride * idx_coeff] = val; } return 0; } static int lu_invert(TDLogger *logger, const int N, double *mat, double *rhs, int *ipiv) { char equed = 'N'; double cond, ferr, berr, rpivot; double *mat_f, *x; int ret = 0; #if 0 LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, mat, N, ipiv, rhs, N); LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv); #else mat_f = malloc(SQR(N) * sizeof(*mat_f)); x = malloc(N * sizeof(*x)); //{ // int i, j; // for (i = 0; i < N; i++) { // for (j = 0; j < N; j++) // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]); // fprintf(stderr, "\n"); // } //} //{ // double *mat_copy = malloc(SQR(N) * sizeof(double)); // double *svd = malloc(N * sizeof(double)); // double *rhs_copy = malloc(N * sizeof(double)); // int rank; // memcpy(mat_copy, mat, SQR(N) * sizeof(double)); // memcpy(rhs_copy, rhs, N * sizeof(double)); // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N, // svd, 1e-13, &rank); // free(mat_copy); // for (int i = 0; i < N; i++) { // if (i > 5 && i < N - 5) // continue; // fprintf(stderr, "%g\t", svd[i]); // } // fprintf(stderr, "\n rank %d\n", rank); // free(svd); // free(rhs_copy); // if (rank < N) // ret = 1; //} //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1, // mat, N, ipiv, rhs, N); LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1, mat, N, mat_f, N, ipiv, &equed, NULL, NULL, rhs, N, x, N, &cond, &ferr, &berr, &rpivot); LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv); memcpy(rhs, x, N * sizeof(double)); memcpy(mat, mat_f, SQR(N) * sizeof(double)); tdi_log(logger, 1, "LU factorization solution to a %zdx%zd matrix: " "condition number %16.16g; forward error %16.16g backward error %16.16g\n", N, N, cond, ferr, berr); if (cond < DBL_EPSILON) ret = -EDOM; free(mat_f); free(x); #endif return ret; } int tdi_pssolve_solve(PSSolveContext *ctx, const PSEqCoeffs *eq_coeffs, const double *rhs, double *coeffs) { PSSolvePriv *s = ctx->priv; double rhs_max; int64_t start; int ret = 0; /* fill the matrix */ start = gettime(); for (int i = 0; i < ctx->nb_equations; i++) { PSEquationContext *eq_ctx = &s->eqs[i]; double *mat = s->eqs[i].mat; for (int j = 0; j < ctx->nb_equations; j++) { ConstructMatrixThread thread = { .eq_ctx = eq_ctx, .eq_coeffs = eq_coeffs[i].func_coeffs[j], .mat = mat, .mat_stride = s->nb_coeffs, .var_idx = j, }; tp_execute(s->tp, NB_COEFFS(&s->eqs[j]), construct_matrix, &thread); mat += NB_COEFFS(&s->eqs[j]) * s->nb_coeffs; } } ctx->construct_matrix_time += gettime() - start; ctx->construct_matrix_count++; #if 0 if (rhs_max < EPS) { fprintf(stderr, "zero rhs\n"); memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs); if (ms->cl_queue) { clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double), ms->coeffs, 0, NULL, NULL); } return 0; } #endif /* solve for the coeffs */ if (s->steps_since_inverse < 1024) { int64_t start; start = gettime(); ret = tdi_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs); if (ret >= 0) { ctx->cg_time_total += gettime() - start; ctx->cg_solve_count++; ctx->cg_iter_count += ret + 1; s->steps_since_inverse++; } } else ret = -1; if (ret < 0) { int64_t start; start = gettime(); memcpy(coeffs, rhs, s->nb_coeffs * sizeof(*rhs)); ret = lu_invert(&ctx->logger, s->nb_coeffs, s->mat, coeffs, s->ipiv); ctx->lu_solves_time += gettime() - start; ctx->lu_solves_count++; if (ret < 0) return ret; ret = tdi_bicgstab_init(s->bicgstab, s->mat, coeffs); if (ret < 0) return ret; s->steps_since_inverse = 0; } return ret; } static int basis_val_init(PSSolveContext *ctx, unsigned int eq_idx) { PSSolvePriv *s = ctx->priv; PSEquationContext *eq_ctx = &s->eqs[eq_idx]; int ret; eq_ctx->basis_val = calloc(ctx->nb_equations, sizeof(*eq_ctx->basis_val)); if (!eq_ctx->basis_val) return -ENOMEM; for (int i = 0; i < ctx->nb_equations; i++) { double *basis_val[2][3] = { { NULL } }; /* for each direction, compute the corresponding basis values/derivatives */ for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) { for (int diff_order = 0; diff_order < ARRAY_ELEMS(basis_val[dir]); diff_order++) { ret = posix_memalign((void**)&basis_val[dir][diff_order], 32, sizeof(*basis_val[dir][diff_order]) * s->eqs[i].nb_coeffs[dir] * eq_ctx->nb_colloc_points[dir]); if (ret) { ret = -ENOMEM; goto fail; } } for (int k = 0; k < eq_ctx->nb_colloc_points[dir]; k++) { double coord = ctx->colloc_grid[eq_idx][dir][k]; for (int l = 0; l < s->eqs[i].nb_coeffs[dir]; l++) { basis_val[dir][0][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_VALUE, coord, l); basis_val[dir][1][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF1, coord, l); basis_val[dir][2][k * s->eqs[i].nb_coeffs[dir] + l] = tdi_basis_eval(ctx->basis[i][dir], BS_EVAL_TYPE_DIFF2, coord, l); } } } for (int diff = 0; diff < ARRAY_ELEMS(eq_ctx->basis_val[i]); diff++) { ret = posix_memalign((void**)&eq_ctx->basis_val[i][diff], 32, NB_COLLOC_POINTS(eq_ctx) * NB_COEFFS(eq_ctx) * sizeof(*eq_ctx->basis_val[i][diff])); if (ret) { ret = -ENOMEM; goto fail; } } for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) { const double *basis1 = basis_val[1][0] + j * s->eqs[i].nb_coeffs[1]; const double *dbasis1 = basis_val[1][1] + j * s->eqs[i].nb_coeffs[1]; const double *d2basis1 = basis_val[1][2] + j * s->eqs[i].nb_coeffs[1]; for (int k = 0; k < eq_ctx->nb_colloc_points[0]; k++) { const double *basis0 = basis_val[0][0] + k * s->eqs[i].nb_coeffs[0]; const double *dbasis0 = basis_val[0][1] + k * s->eqs[i].nb_coeffs[0]; const double *d2basis0 = basis_val[0][2] + k * s->eqs[i].nb_coeffs[0]; const int idx_grid = j * eq_ctx->nb_colloc_points[0] + k; for (int l = 0; l < s->eqs[i].nb_coeffs[1]; l++) for (int m = 0; m < s->eqs[i].nb_coeffs[0]; m++) { const int idx_coeff = l * s->eqs[i].nb_coeffs[0] + m; const int idx = idx_grid + NB_COLLOC_POINTS(eq_ctx) * idx_coeff; eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_00][idx] = basis0[m] * basis1[l]; eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_10][idx] = dbasis0[m] * basis1[l]; eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_01][idx] = basis0[m] * dbasis1[l]; eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_20][idx] = d2basis0[m] * basis1[l]; eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_02][idx] = basis0[m] * d2basis1[l]; eq_ctx->basis_val[i][PSSOLVE_DIFF_ORDER_11][idx] = dbasis0[m] * dbasis1[l]; } } } fail: for (int dir = 0; dir < ARRAY_ELEMS(basis_val); dir++) for (int diff = 0; diff < ARRAY_ELEMS(basis_val[dir]); diff++) free(basis_val[dir][diff]); if (ret < 0) return ret; } return 0; } int tdi_pssolve_context_init(PSSolveContext *ctx) { PSSolvePriv *s = ctx->priv; size_t N = 0; int ret = 0; if (ctx->tp) { s->tp = ctx->tp; } else { ret = tp_init(&s->tp_internal, 1); if (ret < 0) return ret; s->tp = s->tp_internal; } /* sanity check the parameters */ for (int i = 0; i < ctx->nb_equations; i++) { if (!ctx->basis[i][0] || !ctx->basis[i][1]) { tdi_log(&ctx->logger, 0, "Basis set for variable %d not set\n", i); return -EINVAL; } if (!ctx->solve_order[i][0] || !ctx->solve_order[i][1]) { tdi_log(&ctx->logger, 0, "Solver order for variable %d not set\n", i); return -EINVAL; } N += ctx->solve_order[i][0] * ctx->solve_order[i][1]; } ret = posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * N); ret |= posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * N * N); if (ret) return -ENOMEM; s->nb_coeffs = N; s->colloc_grid = calloc(ctx->nb_equations, sizeof(*s->colloc_grid)); if (!s->colloc_grid) return -ENOMEM; ctx->colloc_grid = (const double * const (*)[2])s->colloc_grid; /* initialize the per-equation state */ for (int i = 0; i < ctx->nb_equations; i++) { PSEquationContext *eq_ctx = &s->eqs[i]; eq_ctx->nb_coeffs[0] = ctx->solve_order[i][0]; eq_ctx->nb_coeffs[1] = ctx->solve_order[i][1]; eq_ctx->nb_colloc_points[0] = ctx->solve_order[i][0]; eq_ctx->nb_colloc_points[1] = ctx->solve_order[i][1]; eq_ctx->colloc_grid_order[0] = ctx->solve_order[i][0]; eq_ctx->colloc_grid_order[1] = ctx->solve_order[i][1]; if (i == 0) eq_ctx->mat = s->mat; else eq_ctx->mat = s->eqs[i - 1].mat + NB_COLLOC_POINTS(&s->eqs[i - 1]); /* compute the collocation grid */ posix_memalign((void**)&s->colloc_grid[i][0], 32, eq_ctx->nb_colloc_points[0] * sizeof(*s->colloc_grid[i][0])); posix_memalign((void**)&s->colloc_grid[i][1], 32, eq_ctx->nb_colloc_points[1] * sizeof(*s->colloc_grid[i][1])); if (!s->colloc_grid[i][0] || !s->colloc_grid[i][1]) return -ENOMEM; for (int j = 0; j < eq_ctx->nb_colloc_points[0]; j++) s->colloc_grid[i][0][j] = tdi_basis_colloc_point(ctx->basis[i][0], eq_ctx->colloc_grid_order[0], j); for (int j = 0; j < eq_ctx->nb_colloc_points[1]; j++) s->colloc_grid[i][1][j] = tdi_basis_colloc_point(ctx->basis[i][1], eq_ctx->colloc_grid_order[1], j); } /* precompute the basis values we will need */ for (int i = 0; i < ctx->nb_equations; i++) { ret = basis_val_init(ctx, i); if (ret < 0) return ret; } s->steps_since_inverse = INT_MAX; /* init the BiCGStab solver */ ret = tdi_bicgstab_context_alloc(&s->bicgstab, N, 64); if (ret < 0) return ret; return 0; } int tdi_pssolve_context_alloc(PSSolveContext **pctx, unsigned int nb_equations) { PSSolveContext *ctx; if (!nb_equations) return -EINVAL; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return -ENOMEM; ctx->nb_equations = nb_equations; ctx->priv = calloc(1, sizeof(*ctx->priv)); if (!ctx->priv) goto fail; ctx->priv->eqs = calloc(nb_equations, sizeof(*ctx->priv->eqs)); if (!ctx->priv->eqs) goto fail; ctx->basis = calloc(nb_equations, sizeof(*ctx->basis)); if (!ctx->basis) goto fail; ctx->solve_order = calloc(nb_equations, sizeof(*ctx->solve_order)); if (!ctx->solve_order) goto fail; *pctx = ctx; return 0; fail: tdi_pssolve_context_free(&ctx); return -ENOMEM; } void tdi_pssolve_context_free(PSSolveContext **pctx) { PSSolveContext *ctx = *pctx; if (!ctx) return; if (ctx->priv) { if (ctx->priv->eqs) { for (int i = 0; i < ctx->nb_equations; i++) { PSEquationContext *eq_ctx = &ctx->priv->eqs[i]; if (eq_ctx->basis_val) { for (int j = 0; j < ctx->nb_equations; j++) for (int k = 0; k < ARRAY_ELEMS(eq_ctx->basis_val[j]); k++) free(eq_ctx->basis_val[j][k]); } free(eq_ctx->basis_val); } } if (ctx->priv->colloc_grid) { for (int i = 0; i < ctx->nb_equations; i++) for (int j = 0; j < ARRAY_ELEMS(ctx->priv->colloc_grid[i]); j++) free(ctx->priv->colloc_grid[i][j]); } free(ctx->priv->colloc_grid); free(ctx->priv->eqs); free(ctx->priv->ipiv); free(ctx->priv->mat); tdi_bicgstab_context_free(&ctx->priv->bicgstab); tp_free(&ctx->priv->tp_internal); } free(ctx->priv); free(ctx->basis); free(ctx->solve_order); free(ctx); *pctx = NULL; } int tdi_pssolve_diff_order(enum PSSolveDiffOrder order, unsigned int dir) { if (dir == 0) { switch (order) { case PSSOLVE_DIFF_ORDER_00: case PSSOLVE_DIFF_ORDER_01: case PSSOLVE_DIFF_ORDER_02: return 0; case PSSOLVE_DIFF_ORDER_10: case PSSOLVE_DIFF_ORDER_11: return 1; case PSSOLVE_DIFF_ORDER_20: return 2; } } else if (dir == 1) { switch (order) { case PSSOLVE_DIFF_ORDER_00: case PSSOLVE_DIFF_ORDER_10: case PSSOLVE_DIFF_ORDER_20: return 0; case PSSOLVE_DIFF_ORDER_01: case PSSOLVE_DIFF_ORDER_11: return 1; case PSSOLVE_DIFF_ORDER_02: return 2; } } return -1; }