diff options
-rw-r--r-- | ell_grid_solve.c | 299 | ||||
-rw-r--r-- | ell_grid_solve.h | 7 | ||||
-rw-r--r-- | meson.build | 9 | ||||
-rw-r--r-- | mg2d.c | 35 |
4 files changed, 334 insertions, 16 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c index 191fc8f..1adb8b2 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -24,6 +24,8 @@ #include <string.h> #include <threadpool.h> +#include <lapacke.h> + #include "common.h" #include "cpu.h" #include "ell_grid_solve.h" @@ -41,6 +43,18 @@ typedef struct EGSRelaxInternal { double relax_factor; } EGSRelaxInternal; +typedef struct EGSExactInternal { + void (*fill_mat)(double *mat_row, double **diff_coeffs, double *fd_factors, + ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset); + size_t N; + size_t N_ghosts; + + double *mat; + double *rhs; + double *scratch_line; + int *ipiv; +} EGSExactInternal; + struct EGSInternal { ptrdiff_t stride; double *u_base; @@ -62,6 +76,7 @@ struct EGSInternal { union { EGSRelaxInternal r; + EGSExactInternal e; }; TPContext *tp_internal; @@ -421,10 +436,263 @@ static int solve_relax_step(EGSContext *ctx) return 0; } + +static int bnd_id(int x_lower, int x_upper, int y_lower, int y_upper) +{ + if (x_lower) + return MG2D_BOUNDARY_0L; + if (x_upper) + return MG2D_BOUNDARY_0U; + if (y_lower) + return MG2D_BOUNDARY_1L; + if (y_upper) + return MG2D_BOUNDARY_1U; + return -1; +} + +static void fill_mat_s1(double *mat_row, double **diff_coeffs, double *fd_factors, + ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset) +{ + mat_row[row_offset + 0] += diff_coeffs[MG2D_DIFF_COEFF_00][idx_src]; + + mat_row[row_offset + 1] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + mat_row[row_offset - 1] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + + mat_row[row_offset + 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + mat_row[row_offset - 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + + mat_row[row_offset + 1] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[row_offset + 0] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[row_offset - 1] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + + mat_row[row_offset + 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[row_offset + 0 * row_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[row_offset - 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + + + mat_row[row_offset + 1 + 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 1 - 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 1 + 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 1 - 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; +} + +static void fill_mat_s2(double *mat_row, double **diff_coeffs, double *fd_factors, + ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset) +{ + mat_row[row_offset + 0] += diff_coeffs[MG2D_DIFF_COEFF_00][idx_src]; + + mat_row[row_offset + 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + mat_row[row_offset + 1] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + mat_row[row_offset - 1] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + mat_row[row_offset - 2] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; + + mat_row[row_offset + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + mat_row[row_offset + 1 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + mat_row[row_offset - 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + mat_row[row_offset - 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; + + mat_row[row_offset + 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[row_offset + 1] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[row_offset + 0] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[row_offset - 1] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + mat_row[row_offset - 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; + + mat_row[row_offset + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[row_offset + 1 * row_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[row_offset + 0 * row_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[row_offset - 1 * row_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + mat_row[row_offset - 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; + + mat_row[row_offset + 2 + 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 2 + 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 2 - 1 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 2 - 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + + mat_row[row_offset + 1 + 2 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 1 + 1 * row_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 1 - 1 * row_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset + 1 - 2 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + + mat_row[row_offset - 1 + 2 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 1 + 1 * row_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 1 - 1 * row_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 1 - 2 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + + mat_row[row_offset - 2 + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 2 + 1 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 2 - 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; + mat_row[row_offset - 2 - 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; +} + +static int solve_exact(EGSContext *ctx) +{ + EGSInternal *priv = ctx->priv; + EGSExactInternal *e = &priv->e; + int ret; + + memset(e->mat, 0, SQR(e->N) * sizeof(*e->mat)); + + for (ptrdiff_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) { + const int boundary_x_lower = idx1 < ctx->fd_stencil; + const int boundary_x_upper = idx1 >= ctx->domain_size[1] - ctx->fd_stencil; + + for (ptrdiff_t idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { + const int boundary_y_lower = idx0 < ctx->fd_stencil; + const int boundary_y_upper = idx0 >= ctx->domain_size[0] - ctx->fd_stencil; + + const int boundary = boundary_y_lower || boundary_y_upper || boundary_x_lower || boundary_x_upper; + + const ptrdiff_t idx_src = idx1 * priv->stride + idx0; + const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0; + + ptrdiff_t row_stride; + ptrdiff_t row_offset; + + double *mat_row; + + if (boundary) { + memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line)); + row_stride = ctx->domain_size[0] + 2 * ctx->fd_stencil; + mat_row = e->scratch_line + row_stride * ctx->fd_stencil + ctx->fd_stencil; + } else { + mat_row = e->mat + e->N * mat_row_idx; + row_stride = ctx->domain_size[0]; + } + + row_offset = idx1 * row_stride + idx0; + + e->fill_mat(mat_row, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src, row_stride, row_offset); + + e->rhs[mat_row_idx] = ctx->rhs[idx_src]; + + if (boundary) { + const int bnd_fixval = (idx1 == 0 && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) || + (idx0 == 0 && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) || + (idx1 == ctx->domain_size[1] - 1 && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXVAL) || + (idx0 == ctx->domain_size[0] - 1 && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXVAL); + + double *mat_dst = e->mat + e->N * mat_row_idx; + ptrdiff_t bnd_idx = (boundary_x_lower || boundary_x_upper) ? idx0 : idx1; + + if (bnd_fixval) { + const MG2DBoundary *bnd = ctx->boundaries[bnd_id(boundary_x_lower, boundary_x_upper, + boundary_y_lower, boundary_y_upper)]; + + mat_dst[mat_row_idx] = 1.0; + e->rhs[mat_row_idx] = bnd->val[bnd_idx]; + memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line)); + } else { + /* apply the boundary conditions to eliminate the ghostpoint values */ + // fixval + if (boundary_x_lower && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXDIFF) { + for (ptrdiff_t idx1_bnd = 0; idx1_bnd < ctx->fd_stencil; idx1_bnd++) + for (ptrdiff_t idx0_col = -(ptrdiff_t)ctx->fd_stencil; idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil); idx0_col++) { + const ptrdiff_t idx_ghost = -(idx1_bnd + 1) * row_stride + idx0_col; + mat_row[(idx1_bnd + 1) * row_stride + idx0_col] += mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + if (boundary_x_upper && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXDIFF) { + for (ptrdiff_t idx1_bnd = 0; idx1_bnd < ctx->fd_stencil; idx1_bnd++) + for (ptrdiff_t idx0_col = -(ptrdiff_t)ctx->fd_stencil; idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil); idx0_col++) { + const ptrdiff_t idx_ghost = (ctx->domain_size[1] - 1 + idx1_bnd + 1) * row_stride + idx0_col; + mat_row[(ctx->domain_size[1] - 1 - (idx1_bnd + 1)) * row_stride + idx0_col] += mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + if (boundary_y_lower && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXDIFF) { + for (ptrdiff_t idx1_col = -(ptrdiff_t)ctx->fd_stencil; idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil); idx1_col++) + for (ptrdiff_t idx0_bnd = 0; idx0_bnd < ctx->fd_stencil; idx0_bnd++) { + const ptrdiff_t idx_ghost = idx1_col * row_stride - (idx0_bnd + 1); + mat_row[idx1_col * row_stride + idx0_bnd + 1] += mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + if (boundary_y_upper && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXDIFF) { + for (ptrdiff_t idx1_col = -(ptrdiff_t)ctx->fd_stencil; idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil); idx1_col++) + for (ptrdiff_t idx0_bnd = 0; idx0_bnd < ctx->fd_stencil; idx0_bnd++) { + const ptrdiff_t idx_ghost = idx1_col * row_stride + ctx->domain_size[0] - 1 + idx0_bnd + 1; + mat_row[idx1_col * row_stride + ctx->domain_size[0] - 1 - (idx0_bnd + 1)] += mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + + // fixval + if (boundary_x_lower && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) { + const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_0L]; + for (ptrdiff_t idx1_bnd = 1; idx1_bnd < ctx->fd_stencil; idx1_bnd++) + for (ptrdiff_t idx0_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil - 1); idx0_col++) { + const ptrdiff_t idx_ghost = -idx1_bnd * row_stride + idx0_col; + e->rhs[mat_row_idx] -= bnd->val[idx1_bnd * bnd->val_stride + idx0_col] * mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + if (boundary_x_upper && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXVAL) { + const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_0U]; + for (ptrdiff_t idx1_bnd = 1; idx1_bnd < ctx->fd_stencil; idx1_bnd++) + for (ptrdiff_t idx0_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil - 1); idx0_col++) { + const ptrdiff_t idx_ghost = (ctx->domain_size[1] - 1 + idx1_bnd) * row_stride + idx0_col; + e->rhs[mat_row_idx] -= bnd->val[idx1_bnd * bnd->val_stride + idx0_col] * mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + if (boundary_y_lower && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) { + const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_1L]; + for (ptrdiff_t idx1_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil - 1); idx1_col++) + for (ptrdiff_t idx0_bnd = 1; idx0_bnd < ctx->fd_stencil; idx0_bnd++) { + const ptrdiff_t idx_ghost = idx1_col * row_stride - idx0_bnd; + e->rhs[mat_row_idx] -= bnd->val[idx0_bnd * bnd->val_stride + idx1_col] * mat_row[idx_ghost]; + mat_row[idx1_col * row_stride - idx0_bnd] = 0.0; + } + } + if (boundary_y_upper && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXVAL) { + const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_1U]; + for (ptrdiff_t idx1_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil - 1); idx1_col++) + for (ptrdiff_t idx0_bnd = 1; idx0_bnd < ctx->fd_stencil; idx0_bnd++) { + const ptrdiff_t idx_ghost = idx1_col * row_stride + ctx->domain_size[0] - 1 + idx0_bnd; + e->rhs[mat_row_idx] -= bnd->val[idx0_bnd * bnd->val_stride + idx1_col] * mat_row[idx_ghost]; + mat_row[idx_ghost] = 0.0; + } + } + + /* copy the interior values */ + for (ptrdiff_t idx1_col = 0; idx1_col < ctx->domain_size[1]; idx1_col++) + for (ptrdiff_t idx0_col = 0; idx0_col < ctx->domain_size[0]; idx0_col++) { + mat_dst[idx1_col * ctx->domain_size[0] + idx0_col] = mat_row[idx1_col * row_stride + idx0_col]; + mat_row[idx1_col * row_stride + idx0_col] = 0.0; + } + + } + + /* make sure all the values from the scratch line have been accounted for */ + for (ptrdiff_t idx_scratch = 0; idx_scratch < e->N_ghosts; idx_scratch++) + if (e->scratch_line[idx_scratch] != 0.0) + abort(); + } + } + } + + ret = LAPACKE_dgesv(LAPACK_ROW_MAJOR, e->N, 1, e->mat, e->N, e->ipiv, e->rhs, 1); + if (ret != 0) { + mg2di_log(&ctx->logger, MG2D_LOG_ERROR, + "Error solving the linear system: %d\n", ret); + return -EDOM; + } + + for (size_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) + memcpy(ctx->u + idx1 * ctx->u_stride, e->rhs + idx1 * ctx->domain_size[0], ctx->domain_size[0] * sizeof(*e->rhs)); + + boundaries_apply(ctx); + residual_calc(ctx); + + return 0; +} + int mg2di_egs_solve(EGSContext *ctx) { switch (ctx->solver_type) { case EGS_SOLVER_RELAXATION: return solve_relax_step(ctx); + case EGS_SOLVER_EXACT: return solve_exact(ctx); } return -EINVAL; @@ -439,6 +707,9 @@ int mg2di_egs_init(EGSContext *ctx) priv->calc_blocksize = 1; switch (ctx->fd_stencil) { case 1: + if (ctx->solver_type == EGS_SOLVER_EXACT) + priv->e.fill_mat = fill_mat_s1; + priv->residual_calc_line = residual_calc_line_s1_c; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { @@ -448,6 +719,9 @@ int mg2di_egs_init(EGSContext *ctx) #endif break; case 2: + if (ctx->solver_type == EGS_SOLVER_EXACT) + priv->e.fill_mat = fill_mat_s2; + priv->residual_calc_line = residual_calc_line_s2_c; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { @@ -576,6 +850,20 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2]) return -ENOMEM; } + if (ctx->solver_type == EGS_SOLVER_EXACT) { + EGSExactInternal *e = &priv->e; + + e->N = domain_size[0] * domain_size[1]; + e->N_ghosts = (domain_size[0] + 2 * FD_STENCIL_MAX) * (domain_size[1] + 2 * FD_STENCIL_MAX); + + e->scratch_line = calloc(e->N_ghosts, sizeof(*e->scratch_line)); + e->mat = calloc(SQR(e->N), sizeof(*e->mat)); + e->rhs = calloc(e->N, sizeof(*e->rhs)); + e->ipiv = calloc(e->N, sizeof(*e->ipiv)); + if (!e->scratch_line || !e->mat || !e->rhs || !e->ipiv) + return -ENOMEM; + } + return 0; } @@ -600,6 +888,8 @@ EGSContext *mg2di_egs_alloc(enum EGSType type, size_t domain_size[2]) if (!ctx->solver_data) goto fail; break; + case EGS_SOLVER_EXACT: + break; default: goto fail; } ctx->solver_type = type; @@ -630,6 +920,15 @@ void mg2di_egs_free(EGSContext **pctx) free(ctx->solver_data); + if (ctx->solver_type == EGS_SOLVER_EXACT) { + EGSExactInternal *e = &ctx->priv->e; + + free(e->scratch_line); + free(e->mat); + free(e->rhs); + free(e->ipiv); + } + free(ctx->priv->u_base); free(ctx->priv->rhs_base); free(ctx->priv->residual_base); diff --git a/ell_grid_solve.h b/ell_grid_solve.h index 70782c8..a12e9c3 100644 --- a/ell_grid_solve.h +++ b/ell_grid_solve.h @@ -52,6 +52,13 @@ enum EGSType { * mg2di_egs_solve() does a single relaxation step */ EGS_SOLVER_RELAXATION, + /** + * Solve the equation exactly by contructing a linear system and solving it with LAPACK. + * + * solver_data is NULL + * mg2di_egs_solve() solves the discretized system exactly (up to roundoff error) + */ + EGS_SOLVER_EXACT, }; typedef struct EGSInternal EGSInternal; diff --git a/meson.build b/meson.build index fb20156..c672c28 100644 --- a/meson.build +++ b/meson.build @@ -15,9 +15,13 @@ lib_obj = [] verscript = 'libmg2d.v' ver_flag = '-Wl,--version-script,@0@/@1@'.format(meson.current_source_dir(), verscript) +cc = meson.get_compiler('c') +libm = cc.find_library('m', required : false) +liblapacke = cc.find_library('lapacke') + dep_tp = declare_dependency(link_args : '-lthreadpool') -deps = [dep_tp] +deps = [dep_tp, liblapacke] cdata = configuration_data() cdata.set10('ARCH_X86', false) @@ -82,9 +86,6 @@ endif library('mg2d', lib_src, lib_obj, link_args : ver_flag, dependencies : deps) # test programs -cc = meson.get_compiler('c') -libm = cc.find_library('m', required : false) - test_progs = ['relax', 'mg2d'] foreach t : test_progs target = t + '_test' @@ -297,12 +297,9 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* handle coarsest grid */ if (!level->child) { - - for (int j = 0; j < 16; j++) { - ret = mg_relax_step(ctx, level, "coarse-step"); - if (ret < 0) - return ret; - } + ret = mg_relax_step(ctx, level, "coarse-step"); + if (ret < 0) + return ret; level->count_cycles++; goto finish; @@ -553,7 +550,7 @@ static void mg_level_free(MG2DLevel **plevel) *plevel = NULL; } -static MG2DLevel *mg_level_alloc(const size_t domain_size) +static MG2DLevel *mg_level_alloc(enum EGSType type, const size_t domain_size) { MG2DLevel *level; int ret; @@ -567,7 +564,7 @@ static MG2DLevel *mg_level_alloc(const size_t domain_size) goto fail; level->prolong_tmp_stride = domain_size + 1; - level->solver = mg2di_egs_alloc(EGS_SOLVER_RELAXATION, + level->solver = mg2di_egs_alloc(type, (size_t [2]){domain_size, domain_size}); if (!level->solver) goto fail; @@ -591,9 +588,11 @@ static int log2i(int n) static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size) { MG2DInternal *priv = ctx->priv; - MG2DLevel *cur; + MG2DLevel *cur, **last; + size_t last_size; + unsigned int last_depth; - priv->root = mg_level_alloc(domain_size); + priv->root = mg_level_alloc(EGS_SOLVER_RELAXATION, domain_size); if (!priv->root) return -ENOMEM; @@ -601,14 +600,14 @@ static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size) // the children all have sizes 2**n + 1 // but we skip the closest lower one if it is too close if (domain_size <= 2) - return 0; + goto finish; domain_size = (1 << log2i(domain_size - 2)) + 1; if ((double)priv->root->solver->domain_size[0] / domain_size < 1.5) domain_size = (domain_size >> 1) + 1; cur = priv->root; for (int i = 0; i < LEVELS_MAX && domain_size > 4; i++) { - cur->child = mg_level_alloc(domain_size); + cur->child = mg_level_alloc(EGS_SOLVER_RELAXATION, domain_size); if (!cur->child) return -ENOMEM; cur->child->depth = i + 1; @@ -617,6 +616,18 @@ static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size) domain_size = (domain_size >> 1) + 1; } +finish: + last = &priv->root; + while ((*last)->child) + last = &((*last)->child); + last_size = (*last)->solver->domain_size[0]; + last_depth = (*last)->depth; + mg_level_free(last); + *last = mg_level_alloc(EGS_SOLVER_EXACT, last_size); + if (!*last) + return -ENOMEM; + (*last)->depth = last_depth; + return 0; } |