aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ell_grid_solve.c299
-rw-r--r--ell_grid_solve.h7
-rw-r--r--meson.build9
-rw-r--r--mg2d.c35
4 files changed, 334 insertions, 16 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index 191fc8f..1adb8b2 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -24,6 +24,8 @@
#include <string.h>
#include <threadpool.h>
+#include <lapacke.h>
+
#include "common.h"
#include "cpu.h"
#include "ell_grid_solve.h"
@@ -41,6 +43,18 @@ typedef struct EGSRelaxInternal {
double relax_factor;
} EGSRelaxInternal;
+typedef struct EGSExactInternal {
+ void (*fill_mat)(double *mat_row, double **diff_coeffs, double *fd_factors,
+ ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset);
+ size_t N;
+ size_t N_ghosts;
+
+ double *mat;
+ double *rhs;
+ double *scratch_line;
+ int *ipiv;
+} EGSExactInternal;
+
struct EGSInternal {
ptrdiff_t stride;
double *u_base;
@@ -62,6 +76,7 @@ struct EGSInternal {
union {
EGSRelaxInternal r;
+ EGSExactInternal e;
};
TPContext *tp_internal;
@@ -421,10 +436,263 @@ static int solve_relax_step(EGSContext *ctx)
return 0;
}
+
+static int bnd_id(int x_lower, int x_upper, int y_lower, int y_upper)
+{
+ if (x_lower)
+ return MG2D_BOUNDARY_0L;
+ if (x_upper)
+ return MG2D_BOUNDARY_0U;
+ if (y_lower)
+ return MG2D_BOUNDARY_1L;
+ if (y_upper)
+ return MG2D_BOUNDARY_1U;
+ return -1;
+}
+
+static void fill_mat_s1(double *mat_row, double **diff_coeffs, double *fd_factors,
+ ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset)
+{
+ mat_row[row_offset + 0] += diff_coeffs[MG2D_DIFF_COEFF_00][idx_src];
+
+ mat_row[row_offset + 1] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[row_offset - 1] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+
+ mat_row[row_offset + 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[row_offset - 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+
+ mat_row[row_offset + 1] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[row_offset + 0] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[row_offset - 1] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+
+ mat_row[row_offset + 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[row_offset + 0 * row_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[row_offset - 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+
+
+ mat_row[row_offset + 1 + 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 1 - 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 1 + 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 1 - 1 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+}
+
+static void fill_mat_s2(double *mat_row, double **diff_coeffs, double *fd_factors,
+ ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset)
+{
+ mat_row[row_offset + 0] += diff_coeffs[MG2D_DIFF_COEFF_00][idx_src];
+
+ mat_row[row_offset + 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[row_offset + 1] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[row_offset - 1] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+ mat_row[row_offset - 2] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+
+ mat_row[row_offset + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[row_offset + 1 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[row_offset - 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+ mat_row[row_offset - 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+
+ mat_row[row_offset + 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[row_offset + 1] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[row_offset + 0] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[row_offset - 1] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+ mat_row[row_offset - 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+
+ mat_row[row_offset + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[row_offset + 1 * row_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[row_offset + 0 * row_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[row_offset - 1 * row_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+ mat_row[row_offset - 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+
+ mat_row[row_offset + 2 + 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 2 + 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 2 - 1 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 2 - 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[row_offset + 1 + 2 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 1 + 1 * row_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 1 - 1 * row_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset + 1 - 2 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[row_offset - 1 + 2 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 1 + 1 * row_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 1 - 1 * row_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 1 - 2 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+ mat_row[row_offset - 2 + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 2 + 1 * row_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 2 - 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+ mat_row[row_offset - 2 - 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+}
+
+static int solve_exact(EGSContext *ctx)
+{
+ EGSInternal *priv = ctx->priv;
+ EGSExactInternal *e = &priv->e;
+ int ret;
+
+ memset(e->mat, 0, SQR(e->N) * sizeof(*e->mat));
+
+ for (ptrdiff_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) {
+ const int boundary_x_lower = idx1 < ctx->fd_stencil;
+ const int boundary_x_upper = idx1 >= ctx->domain_size[1] - ctx->fd_stencil;
+
+ for (ptrdiff_t idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
+ const int boundary_y_lower = idx0 < ctx->fd_stencil;
+ const int boundary_y_upper = idx0 >= ctx->domain_size[0] - ctx->fd_stencil;
+
+ const int boundary = boundary_y_lower || boundary_y_upper || boundary_x_lower || boundary_x_upper;
+
+ const ptrdiff_t idx_src = idx1 * priv->stride + idx0;
+ const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0;
+
+ ptrdiff_t row_stride;
+ ptrdiff_t row_offset;
+
+ double *mat_row;
+
+ if (boundary) {
+ memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line));
+ row_stride = ctx->domain_size[0] + 2 * ctx->fd_stencil;
+ mat_row = e->scratch_line + row_stride * ctx->fd_stencil + ctx->fd_stencil;
+ } else {
+ mat_row = e->mat + e->N * mat_row_idx;
+ row_stride = ctx->domain_size[0];
+ }
+
+ row_offset = idx1 * row_stride + idx0;
+
+ e->fill_mat(mat_row, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src, row_stride, row_offset);
+
+ e->rhs[mat_row_idx] = ctx->rhs[idx_src];
+
+ if (boundary) {
+ const int bnd_fixval = (idx1 == 0 && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) ||
+ (idx0 == 0 && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) ||
+ (idx1 == ctx->domain_size[1] - 1 && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXVAL) ||
+ (idx0 == ctx->domain_size[0] - 1 && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXVAL);
+
+ double *mat_dst = e->mat + e->N * mat_row_idx;
+ ptrdiff_t bnd_idx = (boundary_x_lower || boundary_x_upper) ? idx0 : idx1;
+
+ if (bnd_fixval) {
+ const MG2DBoundary *bnd = ctx->boundaries[bnd_id(boundary_x_lower, boundary_x_upper,
+ boundary_y_lower, boundary_y_upper)];
+
+ mat_dst[mat_row_idx] = 1.0;
+ e->rhs[mat_row_idx] = bnd->val[bnd_idx];
+ memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line));
+ } else {
+ /* apply the boundary conditions to eliminate the ghostpoint values */
+ // fixval
+ if (boundary_x_lower && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXDIFF) {
+ for (ptrdiff_t idx1_bnd = 0; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+ for (ptrdiff_t idx0_col = -(ptrdiff_t)ctx->fd_stencil; idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil); idx0_col++) {
+ const ptrdiff_t idx_ghost = -(idx1_bnd + 1) * row_stride + idx0_col;
+ mat_row[(idx1_bnd + 1) * row_stride + idx0_col] += mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+ if (boundary_x_upper && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXDIFF) {
+ for (ptrdiff_t idx1_bnd = 0; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+ for (ptrdiff_t idx0_col = -(ptrdiff_t)ctx->fd_stencil; idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil); idx0_col++) {
+ const ptrdiff_t idx_ghost = (ctx->domain_size[1] - 1 + idx1_bnd + 1) * row_stride + idx0_col;
+ mat_row[(ctx->domain_size[1] - 1 - (idx1_bnd + 1)) * row_stride + idx0_col] += mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+ if (boundary_y_lower && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXDIFF) {
+ for (ptrdiff_t idx1_col = -(ptrdiff_t)ctx->fd_stencil; idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil); idx1_col++)
+ for (ptrdiff_t idx0_bnd = 0; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+ const ptrdiff_t idx_ghost = idx1_col * row_stride - (idx0_bnd + 1);
+ mat_row[idx1_col * row_stride + idx0_bnd + 1] += mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+ if (boundary_y_upper && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXDIFF) {
+ for (ptrdiff_t idx1_col = -(ptrdiff_t)ctx->fd_stencil; idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil); idx1_col++)
+ for (ptrdiff_t idx0_bnd = 0; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+ const ptrdiff_t idx_ghost = idx1_col * row_stride + ctx->domain_size[0] - 1 + idx0_bnd + 1;
+ mat_row[idx1_col * row_stride + ctx->domain_size[0] - 1 - (idx0_bnd + 1)] += mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+
+ // fixval
+ if (boundary_x_lower && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) {
+ const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_0L];
+ for (ptrdiff_t idx1_bnd = 1; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+ for (ptrdiff_t idx0_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil - 1); idx0_col++) {
+ const ptrdiff_t idx_ghost = -idx1_bnd * row_stride + idx0_col;
+ e->rhs[mat_row_idx] -= bnd->val[idx1_bnd * bnd->val_stride + idx0_col] * mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+ if (boundary_x_upper && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXVAL) {
+ const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_0U];
+ for (ptrdiff_t idx1_bnd = 1; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+ for (ptrdiff_t idx0_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil - 1); idx0_col++) {
+ const ptrdiff_t idx_ghost = (ctx->domain_size[1] - 1 + idx1_bnd) * row_stride + idx0_col;
+ e->rhs[mat_row_idx] -= bnd->val[idx1_bnd * bnd->val_stride + idx0_col] * mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+ if (boundary_y_lower && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) {
+ const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_1L];
+ for (ptrdiff_t idx1_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil - 1); idx1_col++)
+ for (ptrdiff_t idx0_bnd = 1; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+ const ptrdiff_t idx_ghost = idx1_col * row_stride - idx0_bnd;
+ e->rhs[mat_row_idx] -= bnd->val[idx0_bnd * bnd->val_stride + idx1_col] * mat_row[idx_ghost];
+ mat_row[idx1_col * row_stride - idx0_bnd] = 0.0;
+ }
+ }
+ if (boundary_y_upper && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXVAL) {
+ const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_1U];
+ for (ptrdiff_t idx1_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil - 1); idx1_col++)
+ for (ptrdiff_t idx0_bnd = 1; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+ const ptrdiff_t idx_ghost = idx1_col * row_stride + ctx->domain_size[0] - 1 + idx0_bnd;
+ e->rhs[mat_row_idx] -= bnd->val[idx0_bnd * bnd->val_stride + idx1_col] * mat_row[idx_ghost];
+ mat_row[idx_ghost] = 0.0;
+ }
+ }
+
+ /* copy the interior values */
+ for (ptrdiff_t idx1_col = 0; idx1_col < ctx->domain_size[1]; idx1_col++)
+ for (ptrdiff_t idx0_col = 0; idx0_col < ctx->domain_size[0]; idx0_col++) {
+ mat_dst[idx1_col * ctx->domain_size[0] + idx0_col] = mat_row[idx1_col * row_stride + idx0_col];
+ mat_row[idx1_col * row_stride + idx0_col] = 0.0;
+ }
+
+ }
+
+ /* make sure all the values from the scratch line have been accounted for */
+ for (ptrdiff_t idx_scratch = 0; idx_scratch < e->N_ghosts; idx_scratch++)
+ if (e->scratch_line[idx_scratch] != 0.0)
+ abort();
+ }
+ }
+ }
+
+ ret = LAPACKE_dgesv(LAPACK_ROW_MAJOR, e->N, 1, e->mat, e->N, e->ipiv, e->rhs, 1);
+ if (ret != 0) {
+ mg2di_log(&ctx->logger, MG2D_LOG_ERROR,
+ "Error solving the linear system: %d\n", ret);
+ return -EDOM;
+ }
+
+ for (size_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++)
+ memcpy(ctx->u + idx1 * ctx->u_stride, e->rhs + idx1 * ctx->domain_size[0], ctx->domain_size[0] * sizeof(*e->rhs));
+
+ boundaries_apply(ctx);
+ residual_calc(ctx);
+
+ return 0;
+}
+
int mg2di_egs_solve(EGSContext *ctx)
{
switch (ctx->solver_type) {
case EGS_SOLVER_RELAXATION: return solve_relax_step(ctx);
+ case EGS_SOLVER_EXACT: return solve_exact(ctx);
}
return -EINVAL;
@@ -439,6 +707,9 @@ int mg2di_egs_init(EGSContext *ctx)
priv->calc_blocksize = 1;
switch (ctx->fd_stencil) {
case 1:
+ if (ctx->solver_type == EGS_SOLVER_EXACT)
+ priv->e.fill_mat = fill_mat_s1;
+
priv->residual_calc_line = residual_calc_line_s1_c;
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
@@ -448,6 +719,9 @@ int mg2di_egs_init(EGSContext *ctx)
#endif
break;
case 2:
+ if (ctx->solver_type == EGS_SOLVER_EXACT)
+ priv->e.fill_mat = fill_mat_s2;
+
priv->residual_calc_line = residual_calc_line_s2_c;
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
@@ -576,6 +850,20 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
return -ENOMEM;
}
+ if (ctx->solver_type == EGS_SOLVER_EXACT) {
+ EGSExactInternal *e = &priv->e;
+
+ e->N = domain_size[0] * domain_size[1];
+ e->N_ghosts = (domain_size[0] + 2 * FD_STENCIL_MAX) * (domain_size[1] + 2 * FD_STENCIL_MAX);
+
+ e->scratch_line = calloc(e->N_ghosts, sizeof(*e->scratch_line));
+ e->mat = calloc(SQR(e->N), sizeof(*e->mat));
+ e->rhs = calloc(e->N, sizeof(*e->rhs));
+ e->ipiv = calloc(e->N, sizeof(*e->ipiv));
+ if (!e->scratch_line || !e->mat || !e->rhs || !e->ipiv)
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -600,6 +888,8 @@ EGSContext *mg2di_egs_alloc(enum EGSType type, size_t domain_size[2])
if (!ctx->solver_data)
goto fail;
break;
+ case EGS_SOLVER_EXACT:
+ break;
default: goto fail;
}
ctx->solver_type = type;
@@ -630,6 +920,15 @@ void mg2di_egs_free(EGSContext **pctx)
free(ctx->solver_data);
+ if (ctx->solver_type == EGS_SOLVER_EXACT) {
+ EGSExactInternal *e = &ctx->priv->e;
+
+ free(e->scratch_line);
+ free(e->mat);
+ free(e->rhs);
+ free(e->ipiv);
+ }
+
free(ctx->priv->u_base);
free(ctx->priv->rhs_base);
free(ctx->priv->residual_base);
diff --git a/ell_grid_solve.h b/ell_grid_solve.h
index 70782c8..a12e9c3 100644
--- a/ell_grid_solve.h
+++ b/ell_grid_solve.h
@@ -52,6 +52,13 @@ enum EGSType {
* mg2di_egs_solve() does a single relaxation step
*/
EGS_SOLVER_RELAXATION,
+ /**
+ * Solve the equation exactly by contructing a linear system and solving it with LAPACK.
+ *
+ * solver_data is NULL
+ * mg2di_egs_solve() solves the discretized system exactly (up to roundoff error)
+ */
+ EGS_SOLVER_EXACT,
};
typedef struct EGSInternal EGSInternal;
diff --git a/meson.build b/meson.build
index fb20156..c672c28 100644
--- a/meson.build
+++ b/meson.build
@@ -15,9 +15,13 @@ lib_obj = []
verscript = 'libmg2d.v'
ver_flag = '-Wl,--version-script,@0@/@1@'.format(meson.current_source_dir(), verscript)
+cc = meson.get_compiler('c')
+libm = cc.find_library('m', required : false)
+liblapacke = cc.find_library('lapacke')
+
dep_tp = declare_dependency(link_args : '-lthreadpool')
-deps = [dep_tp]
+deps = [dep_tp, liblapacke]
cdata = configuration_data()
cdata.set10('ARCH_X86', false)
@@ -82,9 +86,6 @@ endif
library('mg2d', lib_src, lib_obj, link_args : ver_flag, dependencies : deps)
# test programs
-cc = meson.get_compiler('c')
-libm = cc.find_library('m', required : false)
-
test_progs = ['relax', 'mg2d']
foreach t : test_progs
target = t + '_test'
diff --git a/mg2d.c b/mg2d.c
index 9216af4..902c3cc 100644
--- a/mg2d.c
+++ b/mg2d.c
@@ -297,12 +297,9 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level)
/* handle coarsest grid */
if (!level->child) {
-
- for (int j = 0; j < 16; j++) {
- ret = mg_relax_step(ctx, level, "coarse-step");
- if (ret < 0)
- return ret;
- }
+ ret = mg_relax_step(ctx, level, "coarse-step");
+ if (ret < 0)
+ return ret;
level->count_cycles++;
goto finish;
@@ -553,7 +550,7 @@ static void mg_level_free(MG2DLevel **plevel)
*plevel = NULL;
}
-static MG2DLevel *mg_level_alloc(const size_t domain_size)
+static MG2DLevel *mg_level_alloc(enum EGSType type, const size_t domain_size)
{
MG2DLevel *level;
int ret;
@@ -567,7 +564,7 @@ static MG2DLevel *mg_level_alloc(const size_t domain_size)
goto fail;
level->prolong_tmp_stride = domain_size + 1;
- level->solver = mg2di_egs_alloc(EGS_SOLVER_RELAXATION,
+ level->solver = mg2di_egs_alloc(type,
(size_t [2]){domain_size, domain_size});
if (!level->solver)
goto fail;
@@ -591,9 +588,11 @@ static int log2i(int n)
static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size)
{
MG2DInternal *priv = ctx->priv;
- MG2DLevel *cur;
+ MG2DLevel *cur, **last;
+ size_t last_size;
+ unsigned int last_depth;
- priv->root = mg_level_alloc(domain_size);
+ priv->root = mg_level_alloc(EGS_SOLVER_RELAXATION, domain_size);
if (!priv->root)
return -ENOMEM;
@@ -601,14 +600,14 @@ static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size)
// the children all have sizes 2**n + 1
// but we skip the closest lower one if it is too close
if (domain_size <= 2)
- return 0;
+ goto finish;
domain_size = (1 << log2i(domain_size - 2)) + 1;
if ((double)priv->root->solver->domain_size[0] / domain_size < 1.5)
domain_size = (domain_size >> 1) + 1;
cur = priv->root;
for (int i = 0; i < LEVELS_MAX && domain_size > 4; i++) {
- cur->child = mg_level_alloc(domain_size);
+ cur->child = mg_level_alloc(EGS_SOLVER_RELAXATION, domain_size);
if (!cur->child)
return -ENOMEM;
cur->child->depth = i + 1;
@@ -617,6 +616,18 @@ static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size)
domain_size = (domain_size >> 1) + 1;
}
+finish:
+ last = &priv->root;
+ while ((*last)->child)
+ last = &((*last)->child);
+ last_size = (*last)->solver->domain_size[0];
+ last_depth = (*last)->depth;
+ mg_level_free(last);
+ *last = mg_level_alloc(EGS_SOLVER_EXACT, last_size);
+ if (!*last)
+ return -ENOMEM;
+ (*last)->depth = last_depth;
+
return 0;
}