From 5817b21222c84a9aaaa486216933aa068ab0f30b Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 11 Sep 2014 12:03:45 +0200
Subject: Rewrite the matrix inversion code.

---
 param.ccl   |  15 ++++-
 src/brill.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 173 insertions(+), 44 deletions(-)

diff --git a/param.ccl b/param.ccl
index 64f2063..068f4a6 100644
--- a/param.ccl
+++ b/param.ccl
@@ -13,13 +13,22 @@ CCTK_REAL amplitude "amplitude"
     : :: ""
 } 1
 
-RESTRICTED:
-CCTK_INT cheb_order "order"
+CCTK_INT basis_order_r "Number of the basis functions in the radial direction"
+{
+    1: :: ""
+} 40
+
+CCTK_INT basis_order_z "Number of the basis functions in the z direction"
 {
     1: :: ""
-} 20
+} 40
 
 CCTK_REAL scale_factor "Scaling factor L for the SB basis"
 {
     0: :: ""
 } 3.0
+
+CCTK_INT overdet "Solve the system overdetermined by this many equations"
+{
+    0: :: ""
+} 0
diff --git a/src/brill.c b/src/brill.c
index 48912c4..4592081 100644
--- a/src/brill.c
+++ b/src/brill.c
@@ -8,12 +8,15 @@
 #include <string.h>
 #include <gsl/gsl_blas.h>
 #include <gsl/gsl_linalg.h>
-#include <gsl/gsl_spline.h>
+
+#include <lapacke.h>
 
 #include "cctk.h"
 #include "cctk_Arguments.h"
 #include "cctk_Parameters.h"
 
+#define ACC_TEST 1
+
 #define MAX(x, y) ((x) > (y) ? (x) : (y))
 #define MIN(x, y) ((x) > (y) ? (y) : (x))
 #define SQR(x) ((x) * (x))
@@ -94,8 +97,7 @@ static const BasisSet sb_even_basis = {
 
 typedef struct BrillDataContext {
     /* options */
-    int cheb_order;
-    float amplitude;
+    CCTK_REAL amplitude;
 
     double (*q_func)(struct BrillDataContext *bd, double rho, double z);
     double (*laplace_q_func)(struct BrillDataContext *bd, double rho, double z);
@@ -107,45 +109,121 @@ typedef struct BrillDataContext {
     int nb_coeffs_z;
     int nb_coeffs;
 
+    int nb_colloc_points_x;
+    int nb_colloc_points_z;
+    int nb_colloc_points;
+
     gsl_vector *psi_coeffs;
 } BrillDataContext;
 
 static int brill_construct_matrix(BrillDataContext *bd, gsl_matrix *mat,
                                   gsl_vector *rhs)
 {
-    const int nb_gridpoints_z = bd->cheb_order;
-    const int nb_gridpoints_x = bd->cheb_order;
-    const int nb_gridpoints   = nb_gridpoints_x * nb_gridpoints_z;
-
-    CCTK_REAL z_physical, x_physical;
-
+    CCTK_REAL *basis_val, *basis_dval, *basis_d2val;
     int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
-    int idx_coeff, idx_grid;
 
-    /* interior */
-    for (idx_grid_z = 0; idx_grid_z < nb_gridpoints_z; idx_grid_z++) {
-        CCTK_REAL z_physical = bd->basis->colloc_point(nb_gridpoints_z, idx_grid_z);
+    /* precompute the basis values we will need */
+    // FIXME: assumes same number of points in x and z directions
+    basis_val   = malloc(sizeof(*basis_val)   * bd->nb_colloc_points_x * bd->nb_coeffs_x);
+    basis_dval  = malloc(sizeof(*basis_dval)  * bd->nb_colloc_points_x * bd->nb_coeffs_x);
+    basis_d2val = malloc(sizeof(*basis_d2val) * bd->nb_colloc_points_x * bd->nb_coeffs_x);
+    for (int i = 0; i < bd->nb_colloc_points_x; i++) {
+        CCTK_REAL coord = bd->basis->colloc_point(bd->nb_colloc_points_x, i);
+        for (int j = 0; j < bd->nb_coeffs_x; j++) {
+            basis_val  [i * bd->nb_coeffs_x + j] = bd->basis->eval(coord, j);
+            basis_dval [i * bd->nb_coeffs_x + j] = bd->basis->eval_diff1(coord, j);
+            basis_d2val[i * bd->nb_coeffs_x + j] = bd->basis->eval_diff2(coord, j);
+        }
+    }
 
-        fprintf(stdout, "coord %d %g\n", idx_grid_z, z_physical);
+    for (idx_grid_z = 0; idx_grid_z < bd->nb_colloc_points_z; idx_grid_z++) {
+        CCTK_REAL z_physical = bd->basis->colloc_point(bd->nb_colloc_points_z, idx_grid_z);
 
-        for (idx_grid_x = 0; idx_grid_x < nb_gridpoints_x; idx_grid_x++) {
-            CCTK_REAL x_physical = bd->basis->colloc_point(nb_gridpoints_x, idx_grid_x);
+        for (idx_grid_x = 0; idx_grid_x < bd->nb_colloc_points_x; idx_grid_x++) {
+            CCTK_REAL x_physical = bd->basis->colloc_point(bd->nb_colloc_points_x, idx_grid_x);
             CCTK_REAL d2q        = bd->laplace_q_func(bd, x_physical, z_physical);
-            int idx_grid         = idx_grid_z * nb_gridpoints_z + idx_grid_x;
+            int idx_grid         = idx_grid_z * bd->nb_colloc_points_z + idx_grid_x;
 
             for (idx_coeff_z = 0; idx_coeff_z < bd->nb_coeffs_z; idx_coeff_z++)
                 for (idx_coeff_x = 0; idx_coeff_x < bd->nb_coeffs_x; idx_coeff_x++) {
                     int idx_coeff = idx_coeff_z * bd->nb_coeffs_z + idx_coeff_x;
 
-                    mat->data[idx_grid * mat->tda + idx_coeff] = bd->basis->eval_diff2(x_physical, idx_coeff_x) * bd->basis->eval(z_physical, idx_coeff_z) * x_physical +
-                                                                 bd->basis->eval_diff1(x_physical, idx_coeff_x) * bd->basis->eval(z_physical, idx_coeff_z) +
-                                                                 bd->basis->eval_diff2(z_physical, idx_coeff_z) * bd->basis->eval(x_physical, idx_coeff_x) * x_physical +
-                                                                 bd->basis->eval      (x_physical, idx_coeff_x) * bd->basis->eval(z_physical, idx_coeff_z) * 0.25 * d2q * x_physical;
+                    mat->data[idx_grid + mat->tda * idx_coeff] = basis_d2val[idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * basis_val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] * x_physical +
+                                                                 basis_dval [idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * basis_val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] +
+                                                                 basis_d2val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] * basis_val[idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * x_physical +
+                                                                 basis_val  [idx_grid_x * bd->nb_coeffs_x + idx_coeff_x] * basis_val[idx_grid_z * bd->nb_coeffs_z + idx_coeff_z] * 0.25 * d2q * x_physical;
                 }
             rhs->data[idx_grid] = -0.25 * d2q * x_physical;
         }
     }
 
+    free(basis_val);
+    free(basis_dval);
+    free(basis_d2val);
+
+    return 0;
+}
+
+static int solve_linear(gsl_matrix *mat, gsl_vector **px, gsl_vector **prhs)
+{
+    int *ipiv;
+    gsl_matrix *mat_f;
+    double cond, ferr, berr, rpivot;
+
+    gsl_vector   *x = *px;
+    gsl_vector *rhs = *prhs;
+    char      equed = 'N';
+    int         ret = 0;
+
+    ipiv  = malloc(mat->size1 * sizeof(*ipiv));
+    mat_f = gsl_matrix_alloc(mat->size1, mat->size2);
+    if (!ipiv || !mat_f) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', mat->size1, 1,
+                   mat->data, mat->tda, mat_f->data, mat_f->tda, ipiv, &equed,
+                   NULL, NULL, rhs->data, rhs->size, x->data, x->size,
+                   &cond, &ferr, &berr, &rpivot);
+
+    CCTK_VInfo(CCTK_THORNSTRING, "LU factorization solution to a %zdx%zd matrix: "
+               "condition number %16.16g; forward error %16.16g backward error %16.16g",
+               mat->size1, mat->size2, cond, ferr, berr);
+fail:
+    gsl_matrix_free(mat_f);
+    free(ipiv);
+
+    return ret;
+}
+
+static int solve_svd(gsl_matrix *mat, gsl_vector **px, gsl_vector **prhs)
+{
+    double *sv;
+    int rank;
+
+    gsl_vector   *x = *px;
+    gsl_vector *rhs = *prhs;
+
+    sv = malloc(sizeof(*sv) * x->size);
+    if (!sv)
+        return -ENOMEM;
+
+    LAPACKE_dgelsd(LAPACK_COL_MAJOR, rhs->size, x->size, 1, mat->data, mat->tda,
+                   rhs->data, rhs->size, sv, -1, &rank);
+
+    CCTK_VInfo(CCTK_THORNSTRING, "Least squares SVD solution to a %zdx%zd matrix: "
+               "rank %d, condition number %16.16g", mat->size1, mat->size2,
+               rank, sv[x->size - 1] / sv[0]);
+
+    gsl_vector_free(x);
+
+    *px         = rhs;
+    (*px)->size = mat->size1;
+    *prhs       = NULL;
+
+    free(sv);
+
     return 0;
 }
 
@@ -157,40 +235,76 @@ static int brill_construct_matrix(BrillDataContext *bd, gsl_matrix *mat,
  */
 static int brill_solve(BrillDataContext *bd)
 {
-    gsl_matrix *mat;
-    gsl_permutation *perm;
-    gsl_vector *coeffs, *rhs;
+    gsl_matrix *mat = NULL;
+    gsl_vector *coeffs = NULL, *rhs = NULL;
+
+#if ACC_TEST
+    gsl_vector *rhs_bkp = NULL;
+    gsl_matrix *mat_bkp = NULL;
+#endif
 
-    int signum;
     int ret = 0;
 
     /* allocate and fill the pseudospectral matrix */
-    mat    = gsl_matrix_alloc(bd->nb_coeffs, bd->nb_coeffs);
-
-    perm   = gsl_permutation_alloc(bd->nb_coeffs);
-    coeffs = gsl_vector_alloc     (bd->nb_coeffs);
-    rhs    = gsl_vector_calloc    (bd->nb_coeffs);
-    if (!mat || !perm || !coeffs || !rhs) {
+    mat    = gsl_matrix_alloc (bd->nb_coeffs, bd->nb_colloc_points); // inverted order for lapack
+    coeffs = gsl_vector_alloc (bd->nb_coeffs);
+    rhs    = gsl_vector_calloc(bd->nb_colloc_points);
+    if (!mat || !coeffs || !rhs) {
         ret = -ENOMEM;
         goto fail;
     }
 
     /* fill the matrix */
-    brill_construct_matrix(bd, mat, rhs);
+    ret = brill_construct_matrix(bd, mat, rhs);
+    if (ret < 0)
+        goto fail;
+
+#if ACC_TEST
+    /* make backups of the matrix and RHS, since they might be destroyed later */
+    mat_bkp = gsl_matrix_alloc(mat->size2, mat->size1);
+    rhs_bkp = gsl_vector_alloc(rhs->size);
+    if (!mat_bkp || !rhs_bkp) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    gsl_vector_memcpy(rhs_bkp, rhs);
+    gsl_matrix_transpose_memcpy(mat_bkp, mat);
+#endif
 
     /* solve for the coeffs */
-    gsl_linalg_LU_decomp(mat, perm, &signum);
-    gsl_linalg_LU_solve(mat, perm, rhs, coeffs);
+    if (bd->nb_colloc_points > bd->nb_coeffs)
+        ret = solve_svd(mat, &coeffs, &rhs);
+    else
+        ret = solve_linear(mat, &coeffs, &rhs);
 
     bd->psi_coeffs = coeffs;
 
+#if ACC_TEST
+    {
+        double min, max, rmin, rmax;
+
+        gsl_vector_minmax(rhs_bkp, &rmin, &rmax);
+        gsl_blas_dgemv(CblasNoTrans, 1.0, mat_bkp, coeffs, -1.0, rhs_bkp);
+        gsl_vector_minmax(rhs_bkp, &min, &max);
+
+        CCTK_VInfo(CCTK_THORNSTRING, "max(|A·x - rhs|) / max(|rhs|): %16.16g",
+                   MAX(fabs(min), fabs(max)) / MAX(fabs(rmin), fabs(rmax)));
+    }
+#endif
+
 fail:
+
+#if ACC_TEST
+    gsl_matrix_free(mat_bkp);
+    gsl_vector_free(rhs_bkp);
+#endif
+
     if (ret < 0)
         gsl_vector_free(coeffs);
 
     gsl_vector_free(rhs);
     gsl_matrix_free(mat);
-    gsl_permutation_free(perm);
 
     return ret;
 }
@@ -211,11 +325,13 @@ static double laplace_q_gundlach(BrillDataContext *bd, double rho, double z)
     return 2 * bd->amplitude * e * (1.0 + 2 * r2 * (r2 + z2 - 3));
 }
 
-static BrillDataContext *init_bd(cGH *cctkGH, float amplitude, int cheb_order)
+static BrillDataContext *init_bd(cGH *cctkGH, CCTK_REAL amplitude,
+                                 int basis_order_r, int basis_order_z, double sf, int overdet)
 {
     BrillDataContext *bd;
 
-    int i, j;
+    if (basis_order_r != basis_order_z)
+        CCTK_WARN(0, "Different r and z basis orders are not supported.");
 
     bd = malloc(sizeof(*bd));
 
@@ -224,13 +340,17 @@ static BrillDataContext *init_bd(cGH *cctkGH, float amplitude, int cheb_order)
     bd->amplitude      = amplitude;
 
     bd->basis      = &sb_even_basis;
-    bd->cheb_order = cheb_order;
 
-    bd->nb_coeffs_x = cheb_order;
-    bd->nb_coeffs_z = cheb_order;
+    bd->nb_coeffs_x = basis_order_r;
+    bd->nb_coeffs_z = basis_order_z;
 
     bd->nb_coeffs = bd->nb_coeffs_x * bd->nb_coeffs_z;
 
+    bd->nb_colloc_points_x = basis_order_r + overdet;
+    bd->nb_colloc_points_z = basis_order_z + overdet;
+
+    bd->nb_colloc_points = bd->nb_colloc_points_x * bd->nb_colloc_points_z;
+
     scale_factor = sf;
 
     return bd;
@@ -254,7 +374,7 @@ void brill_data(CCTK_ARGUMENTS)
 
     /* on the first run, solve the equation for ψ */
     if (!prev_bd) {
-        bd = init_bd(cctkGH, amplitude, cheb_order);
+        bd = init_bd(cctkGH, amplitude, basis_order_r, basis_order_z, scale_factor, overdet);
 
         brill_solve(bd);
 
-- 
cgit v1.2.3