4 files changed, 334 insertions, 16 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index 191fc8f..1adb8b2 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -24,6 +24,8 @@
 #include <string.h>
 #include <threadpool.h>
 
+#include <lapacke.h>
+
 #include "common.h"
 #include "cpu.h"
 #include "ell_grid_solve.h"
@@ -41,6 +43,18 @@ typedef struct EGSRelaxInternal {
     double relax_factor;
 } EGSRelaxInternal;
 
+typedef struct EGSExactInternal {
+    void (*fill_mat)(double *mat_row, double **diff_coeffs, double *fd_factors,
+                     ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset);
+    size_t N;
+    size_t N_ghosts;
+
+    double *mat;
+    double *rhs;
+    double *scratch_line;
+    int    *ipiv;
+} EGSExactInternal;
+
 struct EGSInternal {
     ptrdiff_t stride;
     double *u_base;
@@ -62,6 +76,7 @@ struct EGSInternal {
 
     union {
         EGSRelaxInternal r;
+        EGSExactInternal e;
     };
 
     TPContext *tp_internal;
@@ -421,10 +436,263 @@ static int solve_relax_step(EGSContext *ctx)
 
     return 0;
 }
+
+static int bnd_id(int x_lower, int x_upper, int y_lower, int y_upper)
+{
+    if (x_lower)
+        return MG2D_BOUNDARY_0L;
+    if (x_upper)
+        return MG2D_BOUNDARY_0U;
+    if (y_lower)
+        return MG2D_BOUNDARY_1L;
+    if (y_upper)
+        return MG2D_BOUNDARY_1U;
+    return -1;
+}
+
+static void fill_mat_s1(double *mat_row, double **diff_coeffs, double *fd_factors,
+                        ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset)
+{
+    mat_row[row_offset + 0] += diff_coeffs[MG2D_DIFF_COEFF_00][idx_src];
+
+    mat_row[row_offset + 1] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+    mat_row[row_offset - 1] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+
+    mat_row[row_offset + 1 * row_stride] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+    mat_row[row_offset - 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+
+    mat_row[row_offset + 1] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+    mat_row[row_offset + 0] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+    mat_row[row_offset - 1] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+
+    mat_row[row_offset + 1 * row_stride] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+    mat_row[row_offset + 0 * row_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+    mat_row[row_offset - 1 * row_stride] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+
+
+    mat_row[row_offset + 1 + 1 * row_stride] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 1 - 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 1 + 1 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 1 - 1 * row_stride] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+}
+
+static void fill_mat_s2(double *mat_row, double **diff_coeffs, double *fd_factors,
+                        ptrdiff_t idx_src, ptrdiff_t row_stride, ptrdiff_t row_offset)
+{
+    mat_row[row_offset + 0] += diff_coeffs[MG2D_DIFF_COEFF_00][idx_src];
+
+    mat_row[row_offset + 2] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+    mat_row[row_offset + 1] +=  8.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+    mat_row[row_offset - 1] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+    mat_row[row_offset - 2] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_10][idx_src] * fd_factors[MG2D_DIFF_COEFF_10];
+
+    mat_row[row_offset + 2 * row_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+    mat_row[row_offset + 1 * row_stride] +=  8.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+    mat_row[row_offset - 1 * row_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+    mat_row[row_offset - 2 * row_stride] +=  1.0 * diff_coeffs[MG2D_DIFF_COEFF_01][idx_src] * fd_factors[MG2D_DIFF_COEFF_01];
+
+    mat_row[row_offset + 2] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+    mat_row[row_offset + 1] +=  16.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+    mat_row[row_offset + 0] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+    mat_row[row_offset - 1] +=  16.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+    mat_row[row_offset - 2] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20][idx_src] * fd_factors[MG2D_DIFF_COEFF_20];
+
+    mat_row[row_offset + 2 * row_stride] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+    mat_row[row_offset + 1 * row_stride] +=  16.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+    mat_row[row_offset + 0 * row_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+    mat_row[row_offset - 1 * row_stride] +=  16.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+    mat_row[row_offset - 2 * row_stride] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02][idx_src] * fd_factors[MG2D_DIFF_COEFF_02];
+
+    mat_row[row_offset + 2 + 2 * row_stride] +=   1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 2 + 1 * row_stride] +=  -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 2 - 1 * row_stride] +=   8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 2 - 2 * row_stride] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+    mat_row[row_offset + 1 + 2 * row_stride] +=  -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 1 + 1 * row_stride] +=  64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 1 - 1 * row_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset + 1 - 2 * row_stride] +=   8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+    mat_row[row_offset - 1 + 2 * row_stride] +=   8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 1 + 1 * row_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 1 - 1 * row_stride] +=  64.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 1 - 2 * row_stride] +=  -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+
+    mat_row[row_offset - 2 + 2 * row_stride] +=  -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 2 + 1 * row_stride] +=   8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 2 - 1 * row_stride] +=  -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+    mat_row[row_offset - 2 - 2 * row_stride] +=   1.0 * diff_coeffs[MG2D_DIFF_COEFF_11][idx_src] * fd_factors[MG2D_DIFF_COEFF_11];
+}
+
+static int solve_exact(EGSContext *ctx)
+{
+    EGSInternal *priv = ctx->priv;
+    EGSExactInternal *e = &priv->e;
+    int ret;
+
+    memset(e->mat, 0, SQR(e->N) * sizeof(*e->mat));
+
+    for (ptrdiff_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) {
+        const int boundary_x_lower = idx1 < ctx->fd_stencil;
+        const int boundary_x_upper = idx1 >= ctx->domain_size[1] - ctx->fd_stencil;
+
+        for (ptrdiff_t idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
+            const int boundary_y_lower = idx0 < ctx->fd_stencil;
+            const int boundary_y_upper = idx0 >= ctx->domain_size[0] - ctx->fd_stencil;
+
+            const int boundary   = boundary_y_lower || boundary_y_upper || boundary_x_lower || boundary_x_upper;
+
+            const ptrdiff_t idx_src     = idx1 * priv->stride        + idx0;
+            const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0;
+
+            ptrdiff_t row_stride;
+            ptrdiff_t row_offset;
+
+            double *mat_row;
+
+            if (boundary) {
+                memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line));
+                row_stride = ctx->domain_size[0] + 2 * ctx->fd_stencil;
+                mat_row    = e->scratch_line + row_stride * ctx->fd_stencil + ctx->fd_stencil;
+            } else {
+                mat_row    = e->mat + e->N * mat_row_idx;
+                row_stride = ctx->domain_size[0];
+            }
+
+            row_offset = idx1 * row_stride + idx0;
+
+            e->fill_mat(mat_row, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src, row_stride, row_offset);
+
+            e->rhs[mat_row_idx] = ctx->rhs[idx_src];
+
+            if (boundary) {
+                const int bnd_fixval = (idx1 == 0                       && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) ||
+                                       (idx0 == 0                       && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) ||
+                                       (idx1 == ctx->domain_size[1] - 1 && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXVAL) ||
+                                       (idx0 == ctx->domain_size[0] - 1 && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXVAL);
+
+                double *mat_dst = e->mat + e->N * mat_row_idx;
+                ptrdiff_t bnd_idx  = (boundary_x_lower || boundary_x_upper) ? idx0 : idx1;
+
+                if (bnd_fixval) {
+                    const MG2DBoundary *bnd = ctx->boundaries[bnd_id(boundary_x_lower, boundary_x_upper,
+                                                                     boundary_y_lower, boundary_y_upper)];
+
+                    mat_dst[mat_row_idx] = 1.0;
+                    e->rhs[mat_row_idx]     = bnd->val[bnd_idx];
+                    memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line));
+                } else {
+                    /* apply the boundary conditions to eliminate the ghostpoint values */
+                    // fixval
+                    if (boundary_x_lower && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXDIFF) {
+                        for (ptrdiff_t idx1_bnd = 0; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+                            for (ptrdiff_t idx0_col = -(ptrdiff_t)ctx->fd_stencil; idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil); idx0_col++) {
+                                const ptrdiff_t idx_ghost = -(idx1_bnd + 1) * row_stride + idx0_col;
+                                mat_row[(idx1_bnd + 1) * row_stride + idx0_col] += mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+                    if (boundary_x_upper && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXDIFF) {
+                        for (ptrdiff_t idx1_bnd = 0; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+                            for (ptrdiff_t idx0_col = -(ptrdiff_t)ctx->fd_stencil; idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil); idx0_col++) {
+                                const ptrdiff_t idx_ghost = (ctx->domain_size[1] - 1 + idx1_bnd + 1) * row_stride + idx0_col;
+                                mat_row[(ctx->domain_size[1] - 1 - (idx1_bnd + 1)) * row_stride + idx0_col] += mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+                    if (boundary_y_lower && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXDIFF) {
+                        for (ptrdiff_t idx1_col = -(ptrdiff_t)ctx->fd_stencil; idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil); idx1_col++)
+                            for (ptrdiff_t idx0_bnd = 0; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+                                const ptrdiff_t idx_ghost = idx1_col * row_stride - (idx0_bnd + 1);
+                                mat_row[idx1_col * row_stride + idx0_bnd + 1] += mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+                    if (boundary_y_upper && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXDIFF) {
+                        for (ptrdiff_t idx1_col = -(ptrdiff_t)ctx->fd_stencil; idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil); idx1_col++)
+                            for (ptrdiff_t idx0_bnd = 0; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+                                const ptrdiff_t idx_ghost = idx1_col * row_stride + ctx->domain_size[0] - 1 + idx0_bnd + 1;
+                                mat_row[idx1_col * row_stride + ctx->domain_size[0] - 1 - (idx0_bnd + 1)] += mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+
+                    // fixval
+                    if (boundary_x_lower && ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) {
+                        const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_0L];
+                        for (ptrdiff_t idx1_bnd = 1; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+                            for (ptrdiff_t idx0_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil - 1); idx0_col++) {
+                                const ptrdiff_t idx_ghost = -idx1_bnd * row_stride + idx0_col;
+                                e->rhs[mat_row_idx] -= bnd->val[idx1_bnd * bnd->val_stride + idx0_col] * mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+                    if (boundary_x_upper && ctx->boundaries[MG2D_BOUNDARY_0U]->type == MG2D_BC_TYPE_FIXVAL) {
+                        const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_0U];
+                        for (ptrdiff_t idx1_bnd = 1; idx1_bnd < ctx->fd_stencil; idx1_bnd++)
+                            for (ptrdiff_t idx0_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx0_col < (ptrdiff_t)(ctx->domain_size[0] + ctx->fd_stencil - 1); idx0_col++) {
+                                const ptrdiff_t idx_ghost = (ctx->domain_size[1] - 1 + idx1_bnd) * row_stride + idx0_col;
+                                e->rhs[mat_row_idx] -= bnd->val[idx1_bnd * bnd->val_stride + idx0_col] * mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+                    if (boundary_y_lower && ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) {
+                        const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_1L];
+                        for (ptrdiff_t idx1_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil - 1); idx1_col++)
+                            for (ptrdiff_t idx0_bnd = 1; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+                                const ptrdiff_t idx_ghost = idx1_col * row_stride - idx0_bnd;
+                                e->rhs[mat_row_idx] -= bnd->val[idx0_bnd * bnd->val_stride + idx1_col] * mat_row[idx_ghost];
+                                mat_row[idx1_col * row_stride - idx0_bnd] = 0.0;
+                            }
+                    }
+                    if (boundary_y_upper && ctx->boundaries[MG2D_BOUNDARY_1U]->type == MG2D_BC_TYPE_FIXVAL) {
+                        const MG2DBoundary *bnd = ctx->boundaries[MG2D_BOUNDARY_1U];
+                        for (ptrdiff_t idx1_col = -((ptrdiff_t)ctx->fd_stencil - 1); idx1_col < (ptrdiff_t)(ctx->domain_size[1] + ctx->fd_stencil - 1); idx1_col++)
+                            for (ptrdiff_t idx0_bnd = 1; idx0_bnd < ctx->fd_stencil; idx0_bnd++) {
+                                const ptrdiff_t idx_ghost = idx1_col * row_stride + ctx->domain_size[0] - 1 + idx0_bnd;
+                                e->rhs[mat_row_idx] -= bnd->val[idx0_bnd * bnd->val_stride + idx1_col] * mat_row[idx_ghost];
+                                mat_row[idx_ghost] = 0.0;
+                            }
+                    }
+
+                    /* copy the interior values */
+                    for (ptrdiff_t idx1_col = 0; idx1_col < ctx->domain_size[1]; idx1_col++)
+                        for (ptrdiff_t idx0_col = 0; idx0_col < ctx->domain_size[0]; idx0_col++) {
+                            mat_dst[idx1_col * ctx->domain_size[0] + idx0_col] = mat_row[idx1_col * row_stride + idx0_col];
+                            mat_row[idx1_col * row_stride + idx0_col] = 0.0;
+                        }
+
+                }
+
+                /* make sure all the values from the scratch line have been accounted for */
+                for (ptrdiff_t idx_scratch = 0; idx_scratch < e->N_ghosts; idx_scratch++)
+                    if (e->scratch_line[idx_scratch] != 0.0)
+                        abort();
+            }
+        }
+    }
+
+    ret = LAPACKE_dgesv(LAPACK_ROW_MAJOR, e->N, 1, e->mat, e->N, e->ipiv, e->rhs, 1);
+    if (ret != 0) {
+        mg2di_log(&ctx->logger, MG2D_LOG_ERROR,
+                  "Error solving the linear system: %d\n", ret);
+        return -EDOM;
+    }
+
+    for (size_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++)
+        memcpy(ctx->u + idx1 * ctx->u_stride, e->rhs + idx1 * ctx->domain_size[0], ctx->domain_size[0] * sizeof(*e->rhs));
+
+    boundaries_apply(ctx);
+    residual_calc(ctx);
+
+    return 0;
+}
+
 int mg2di_egs_solve(EGSContext *ctx)
 {
     switch (ctx->solver_type) {
     case EGS_SOLVER_RELAXATION: return solve_relax_step(ctx);
+    case EGS_SOLVER_EXACT:      return solve_exact(ctx);
     }
 
     return -EINVAL;
@@ -439,6 +707,9 @@ int mg2di_egs_init(EGSContext *ctx)
     priv->calc_blocksize = 1;
     switch (ctx->fd_stencil) {
     case 1:
+        if (ctx->solver_type == EGS_SOLVER_EXACT)
+            priv->e.fill_mat = fill_mat_s1;
+
         priv->residual_calc_line = residual_calc_line_s1_c;
 #if HAVE_EXTERNAL_ASM
         if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
@@ -448,6 +719,9 @@ int mg2di_egs_init(EGSContext *ctx)
 #endif
         break;
     case 2:
+        if (ctx->solver_type == EGS_SOLVER_EXACT)
+            priv->e.fill_mat = fill_mat_s2;
+
         priv->residual_calc_line = residual_calc_line_s2_c;
 #if HAVE_EXTERNAL_ASM
         if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
@@ -576,6 +850,20 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2])
             return -ENOMEM;
     }
 
+    if (ctx->solver_type == EGS_SOLVER_EXACT) {
+        EGSExactInternal *e = &priv->e;
+
+        e->N = domain_size[0] * domain_size[1];
+        e->N_ghosts = (domain_size[0] + 2 * FD_STENCIL_MAX) * (domain_size[1] + 2 * FD_STENCIL_MAX);
+
+        e->scratch_line = calloc(e->N_ghosts, sizeof(*e->scratch_line));
+        e->mat          = calloc(SQR(e->N),   sizeof(*e->mat));
+        e->rhs          = calloc(e->N,        sizeof(*e->rhs));
+        e->ipiv         = calloc(e->N,        sizeof(*e->ipiv));
+        if (!e->scratch_line || !e->mat || !e->rhs || !e->ipiv)
+            return -ENOMEM;
+    }
+
     return 0;
 }
 
@@ -600,6 +888,8 @@ EGSContext *mg2di_egs_alloc(enum EGSType type, size_t domain_size[2])
         if (!ctx->solver_data)
             goto fail;
         break;
+    case EGS_SOLVER_EXACT:
+        break;
     default: goto fail;
     }
     ctx->solver_type = type;
@@ -630,6 +920,15 @@ void mg2di_egs_free(EGSContext **pctx)
 
     free(ctx->solver_data);
 
+    if (ctx->solver_type == EGS_SOLVER_EXACT) {
+        EGSExactInternal *e = &ctx->priv->e;
+
+        free(e->scratch_line);
+        free(e->mat);
+        free(e->rhs);
+        free(e->ipiv);
+    }
+
     free(ctx->priv->u_base);
     free(ctx->priv->rhs_base);
     free(ctx->priv->residual_base);
diff --git a/ell_grid_solve.h b/ell_grid_solve.h
index 70782c8..a12e9c3 100644
--- a/ell_grid_solve.h
+++ b/ell_grid_solve.h
@@ -52,6 +52,13 @@ enum EGSType {
      * mg2di_egs_solve() does a single relaxation step
      */
     EGS_SOLVER_RELAXATION,
+    /**
+     * Solve the equation exactly by contructing a linear system and solving it with LAPACK.
+     *
+     * solver_data is NULL
+     * mg2di_egs_solve() solves the discretized system exactly (up to roundoff error)
+     */
+    EGS_SOLVER_EXACT,
 };
 
 typedef struct EGSInternal EGSInternal;
diff --git a/meson.build b/meson.build
index fb20156..c672c28 100644
--- a/meson.build
+++ b/meson.build
@@ -15,9 +15,13 @@ lib_obj = []
 verscript = 'libmg2d.v'
 ver_flag  = '-Wl,--version-script,@0@/@1@'.format(meson.current_source_dir(), verscript)
 
+cc = meson.get_compiler('c')
+libm = cc.find_library('m', required : false)
+liblapacke = cc.find_library('lapacke')
+
 dep_tp  = declare_dependency(link_args : '-lthreadpool')
 
-deps = [dep_tp]
+deps = [dep_tp, liblapacke]
 
 cdata = configuration_data()
 cdata.set10('ARCH_X86',          false)
@@ -82,9 +86,6 @@ endif
 library('mg2d', lib_src, lib_obj, link_args : ver_flag, dependencies : deps)
 
 # test programs
-cc = meson.get_compiler('c')
-libm = cc.find_library('m', required : false)
-
 test_progs = ['relax', 'mg2d']
 foreach t : test_progs
     target = t + '_test'
diff --git a/mg2d.c b/mg2d.c
index 9216af4..902c3cc 100644
--- a/mg2d.c
+++ b/mg2d.c
@@ -297,12 +297,9 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level)
 
     /* handle coarsest grid */
     if (!level->child) {
-
-        for (int j = 0; j < 16; j++) {
-            ret = mg_relax_step(ctx, level, "coarse-step");
-            if (ret < 0)
-                return ret;
-        }
+        ret = mg_relax_step(ctx, level, "coarse-step");
+        if (ret < 0)
+            return ret;
         level->count_cycles++;
 
         goto finish;
@@ -553,7 +550,7 @@ static void mg_level_free(MG2DLevel **plevel)
     *plevel = NULL;
 }
 
-static MG2DLevel *mg_level_alloc(const size_t domain_size)
+static MG2DLevel *mg_level_alloc(enum EGSType type, const size_t domain_size)
 {
     MG2DLevel *level;
     int ret;
@@ -567,7 +564,7 @@ static MG2DLevel *mg_level_alloc(const size_t domain_size)
         goto fail;
     level->prolong_tmp_stride = domain_size + 1;
 
-    level->solver = mg2di_egs_alloc(EGS_SOLVER_RELAXATION,
+    level->solver = mg2di_egs_alloc(type,
                                     (size_t [2]){domain_size, domain_size});
     if (!level->solver)
         goto fail;
@@ -591,9 +588,11 @@ static int log2i(int n)
 static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size)
 {
     MG2DInternal *priv = ctx->priv;
-    MG2DLevel *cur;
+    MG2DLevel *cur, **last;
+    size_t last_size;
+    unsigned int last_depth;
 
-    priv->root = mg_level_alloc(domain_size);
+    priv->root = mg_level_alloc(EGS_SOLVER_RELAXATION, domain_size);
     if (!priv->root)
         return -ENOMEM;
 
@@ -601,14 +600,14 @@ static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size)
     // the children all have sizes 2**n + 1
     // but we skip the closest lower one if it is too close
     if (domain_size <= 2)
-        return 0;
+        goto finish;
     domain_size = (1 << log2i(domain_size - 2)) + 1;
     if ((double)priv->root->solver->domain_size[0] / domain_size  < 1.5)
         domain_size = (domain_size >> 1) + 1;
 
     cur = priv->root;
     for (int i = 0; i < LEVELS_MAX && domain_size > 4; i++) {
-        cur->child = mg_level_alloc(domain_size);
+        cur->child = mg_level_alloc(EGS_SOLVER_RELAXATION, domain_size);
         if (!cur->child)
             return -ENOMEM;
         cur->child->depth = i + 1;
@@ -617,6 +616,18 @@ static int mg_levels_alloc(MG2DContext *ctx, size_t domain_size)
         domain_size = (domain_size >> 1) + 1;
     }
 
+finish:
+    last = &priv->root;
+    while ((*last)->child)
+        last = &((*last)->child);
+    last_size = (*last)->solver->domain_size[0];
+    last_depth = (*last)->depth;
+    mg_level_free(last);
+    *last = mg_level_alloc(EGS_SOLVER_EXACT, last_size);
+    if (!*last)
+        return -ENOMEM;
+    (*last)->depth = last_depth;
+
     return 0;
 }