From 234e722c5abfb15c9850b4224cabbb3e2d0c3657 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Fri, 9 Feb 2018 15:34:25 +0100
Subject: Add the remaining parts for teukolsky waves initial data

---
 init.c | 438 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 438 insertions(+)
 create mode 100644 init.c

(limited to 'init.c')

diff --git a/init.c b/init.c
new file mode 100644
index 0000000..4fdece4
--- /dev/null
+++ b/init.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright 2017 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <cblas.h>
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include "basis.h"
+#include "common.h"
+#include "cpu.h"
+#include "log.h"
+#include "nlsolve.h"
+#include "teukolsky_data.h"
+#include "threadpool.h"
+
+#define NB_EQUATIONS 3
+
+typedef struct TDPriv {
+    unsigned int basis_order[NB_EQUATIONS][2];
+    BasisSetContext *basis[NB_EQUATIONS][2];
+
+    NLSolveContext *solver;
+
+    ThreadPoolContext *tp;
+    TDLogger logger;
+
+    double *coeffs;
+    ptrdiff_t coeffs_stride;
+
+    int cpu_flags;
+
+    double (*scalarproduct_metric)(size_t len1, size_t len2, double *mat,
+                                   double *vec1, double *vec2);
+} TDPriv;
+
+double tdi_scalarproduct_metric_fma3(size_t len1, size_t len2, double *mat,
+                                     double *vec1, double *vec2);
+double tdi_scalarproduct_metric_avx(size_t len1, size_t len2, double *mat,
+                                    double *vec1, double *vec2);
+double tdi_scalarproduct_metric_sse3(size_t len1, size_t len2, double *mat,
+                                     double *vec1, double *vec2);
+double tdi_scalarproduct_metric_c(size_t len1, size_t len2, double *mat,
+                                  double *vec1, double *vec2);
+
+static void init_opencl(TDPriv *s)
+#if HAVE_OPENCL
+{
+    int err, count;
+    cl_platform_id platform;
+    cl_context_properties props[3];
+    cl_device_id ocl_device;
+
+    err = clGetPlatformIDs(1, &platform, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        tdi_log(&s->logger, 0, "Could not get an OpenCL platform ID\n");
+        return;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        tdi_log(&s->logger, 0, "Could not get an OpenCL device ID\n");
+        return;
+    }
+
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties)platform;
+    props[2] = 0;
+
+    s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err);
+    if (err != CL_SUCCESS || !s->ocl_ctx) {
+        tdi_log(&s->logger, 0, "Could not create an OpenCL context\n");
+        return;
+    }
+
+    s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err);
+    if (err != CL_SUCCESS || !s->ocl_queue) {
+        tdi_log(&s->logger, 0, "Could not create an OpenCL command queue: %d\n", err);
+        goto fail;
+    }
+
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        tdi_log(&s->logger, 0, "Error setting up clBLAS\n");
+        goto fail;
+    }
+
+    return;
+fail:
+    if (s->ocl_queue)
+        clReleaseCommandQueue(s->ocl_queue);
+    s->ocl_queue = 0;
+
+    if (s->ocl_ctx)
+        clReleaseContext(s->ocl_ctx);
+    s->ocl_ctx = 0;
+}
+#else
+{
+}
+#endif
+
+static const enum BasisFamily basis_sets[NB_EQUATIONS][2] = {
+    { BASIS_FAMILY_SB_EVEN,  BASIS_FAMILY_COS_EVEN },
+    { BASIS_FAMILY_SB_EVEN,  BASIS_FAMILY_COS_EVEN },
+    { BASIS_FAMILY_SB_EVEN,  BASIS_FAMILY_COS_EVEN },
+};
+
+static int teukolsky_init_check_options(TDContext *td)
+{
+    TDPriv *s = td->priv;
+    int ret;
+
+    s->cpu_flags = tdi_init_cpu_flags();
+
+    if (!td->nb_threads) {
+        td->nb_threads = tdi_cpu_count();
+        if (!td->nb_threads)
+            td->nb_threads = 1;
+    }
+
+    ret = tdi_threadpool_init(&s->tp, td->nb_threads);
+    if (ret < 0)
+        return ret;
+
+    init_opencl(s);
+
+    s->logger.log = tdi_log_default_callback;
+
+    //s->scalarproduct_metric = tdi_scalarproduct_metric_c;
+    //if (EXTERNAL_SSE3(s->cpu_flags))
+    //    s->scalarproduct_metric = tdi_scalarproduct_metric_sse3;
+    //if (EXTERNAL_AVX(s->cpu_flags))
+    //    s->scalarproduct_metric = tdi_scalarproduct_metric_avx;
+    //if (EXTERNAL_FMA3(s->cpu_flags))
+    //    s->scalarproduct_metric = tdi_scalarproduct_metric_fma3;
+//    for (int i = 0; i < ctx->nb_equations; i++)
+//        for (int j = 0; j < 2; j++) {
+//            s->ps_ctx->basis[i][j]       = ctx->basis[i][j];
+//            s->ps_ctx->solve_order[i][j] = basis_order[i][j];
+//            max_order = MAX(max_order, basis_order[i][j]);
+//        }
+//
+    //for (int i = 0; i < max_order; i++) {
+    //    tdi_log(&s->logger, 2, "%d ", i);
+    //    for (int j = 0; j < ctx->nb_equations; j++)
+    //        for (int k = 0; k < 2; k++) {
+    //            if (i < s->ps_ctx->solve_order[j][k])
+    //                tdi_log(&s->logger, 2, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]);
+    //            else
+    //                tdi_log(&s->logger, 2, "        ");
+    //        }
+    //    tdi_log(&s->logger, 2, "\n");
+    //}
+
+    s->basis_order[0][0] = td->nb_coeffs[0];
+    s->basis_order[0][1] = td->nb_coeffs[1];
+    s->basis_order[1][0] = td->nb_coeffs[0];
+    s->basis_order[1][1] = td->nb_coeffs[1];
+    s->basis_order[2][0] = td->nb_coeffs[0];
+    s->basis_order[2][1] = td->nb_coeffs[1];
+
+    ret = posix_memalign((void**)&s->coeffs, 32,
+                         sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]);
+    if (ret)
+        return -ENOMEM;
+    memset(s->coeffs, 0, sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]);
+
+    for (int i = 0; i < NB_EQUATIONS; i++)
+        td->coeffs[i] = s->coeffs + i * td->nb_coeffs[0] * td->nb_coeffs[1];
+
+    for (int i = 0; i < ARRAY_ELEMS(basis_sets); i++)
+        for (int j = 0; j < ARRAY_ELEMS(basis_sets[i]); j++) {
+            double sf;
+
+            ret = tdi_basis_init(&s->basis[i][j], basis_sets[i][j], td->basis_scale_factor[j]);
+            if (ret < 0)
+                return ret;
+        }
+
+
+    ret = tdi_nlsolve_context_alloc(&s->solver, ARRAY_ELEMS(basis_sets));
+    if (ret < 0) {
+        tdi_log(&s->logger, 0, "Error allocating the non-linear solver\n");
+        return ret;
+    }
+
+    s->solver->logger  = s->logger;
+    s->solver->tp      = s->tp;
+    s->solver->maxiter = td->max_iter;
+    s->solver->atol    = td->atol;
+
+    memcpy(s->solver->basis,       s->basis,       sizeof(s->basis));
+    memcpy(s->solver->solve_order, s->basis_order, sizeof(s->basis_order));
+
+#if HAVE_OPENCL
+    s->solver->ocl_ctx   = s->ocl_ctx;
+    s->solver->ocl_queue = s->ocl_queue;
+#endif
+
+    ret = tdi_nlsolve_context_init(s->solver);
+    if (ret < 0) {
+        tdi_log(&s->logger, 0, "Error initializing the non-linear solver\n");
+        return ret;
+    }
+
+    return 0;
+}
+
+static int teukolsky_constraint_eval(void *opaque, unsigned int eq_idx,
+                                     const unsigned int *colloc_grid_order,
+                                     const double * const *colloc_grid,
+                                     const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB],
+                                     double *dst)
+{
+    const double *amplitude = opaque;
+
+    return tdi_constraints_eval(eq_idx, *amplitude, colloc_grid_order, colloc_grid,
+                                vars, dst);
+}
+
+#define AMPLITUDE_DIRECT_SOLVE 1e-3
+
+int td_solve(TDContext *td, double *coeffs_init[3])
+{
+    TDPriv *s = td->priv;
+    double cur_amplitude = td->amplitude;//MIN(td->amplitude, AMPLITUDE_DIRECT_SOLVE);
+    int ret;
+
+    ret = teukolsky_init_check_options(td);
+    if (ret < 0)
+        return ret;
+
+    if (coeffs_init) {
+        for (int i = 0; i < 3; i++) {
+            memcpy(td->coeffs[i], coeffs_init[i], sizeof(*td->coeffs[i]) *
+                   td->nb_coeffs[0] * td->nb_coeffs[1]);
+        }
+    }
+
+    while (1) {
+        double scale_factor;
+
+        ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL,
+                                &cur_amplitude, s->coeffs);
+        if (ret < 0)
+            return ret;
+
+        if (fabs(cur_amplitude - td->amplitude) < DBL_EPSILON)
+            goto finish;
+
+        scale_factor = cur_amplitude;
+        cur_amplitude = MIN(td->amplitude, 2 * cur_amplitude);
+        scale_factor = cur_amplitude / scale_factor;
+
+        cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], SQR(scale_factor), td->coeffs[0], 1);
+        cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], scale_factor, td->coeffs[1], 1);
+        cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], scale_factor, td->coeffs[2], 1);
+
+        tdi_log(&s->logger, 2, "Trying amplitude %g %g\n", cur_amplitude, scale_factor);
+    }
+
+finish:
+    tdi_solver_print_stats(s->solver);
+
+    return 0;
+}
+
+TDContext *td_context_alloc(void)
+{
+    TDContext *td = calloc(1, sizeof(*td));
+
+    if (!td)
+        return NULL;
+
+    td->nb_threads = 1;
+
+    td->amplitude   = 1.0;
+
+    td->nb_coeffs[0] = 32;
+    td->nb_coeffs[1] = 16;
+
+    td->basis_scale_factor[0] = 3.0;
+    td->basis_scale_factor[1] = 3.0;
+
+    td->max_iter = 16;
+    td->atol     = 1e-12;
+
+    td->log_callback = tdi_log_default_callback;
+
+    td->priv = calloc(1, sizeof(TDPriv));
+    if (!td->priv) {
+        free(td);
+        return NULL;
+    }
+
+    return td;
+}
+
+void td_context_free(TDContext **ptd)
+{
+    TDContext *td = *ptd;
+    TDPriv *s;
+
+    if (!td)
+        return;
+
+    s = td->priv;
+
+    tdi_nlsolve_context_free(&s->solver);
+    tdi_threadpool_free(&s->tp);
+
+#if HAVE_OPENCL
+    if (s->ocl_queue)
+        clReleaseCommandQueue(s->ocl_queue);
+    if (s->ocl_ctx)
+        clReleaseContext(s->ocl_ctx);
+#endif
+
+    for (int i = 0; i < ARRAY_ELEMS(s->basis); i++)
+        for (int j = 0; j < ARRAY_ELEMS(s->basis[i]); j++)
+            tdi_basis_free(&s->basis[i][j]);
+
+    free(s->coeffs);
+
+    free(td->priv);
+    free(td);
+    *ptd = NULL;
+}
+
+static double scalarproduct_metric_c(size_t len1, size_t len2, double *mat,
+                                     double *vec1, double *vec2)
+{
+    double val = 0.0;
+    for (int m = 0; m < len2; m++) {
+        double tmp = 0.0;
+        for (int n = 0; n < len1; n++)
+            tmp += mat[m * len1 + n] * vec1[n];
+        val += tmp * vec2[m];
+    }
+    return val;
+}
+
+static int eval_var(const TDContext *td, unsigned int var_idx,
+                    size_t nb_coords, const double *r, const double *theta,
+                    const unsigned int diff_order[2],
+                    double *out)
+{
+    TDPriv *s = td->priv;
+    double *basis_val[2] = { NULL };
+    int ret = 0;
+
+    if (diff_order[0] > 0 || diff_order[1] > 0) {
+        tdi_log(&s->logger, 0, "Derivatives of higher order than 2 are not supported.\n");
+        return -ENOSYS;
+    }
+
+    posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]);
+    posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]);
+    if (!basis_val[0] || !basis_val[1]) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+    memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]);
+    memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]);
+
+    for (int i = 0; i < nb_coords; i++) {
+        double theta_val = theta[i];
+        double r_val     = r[i];
+
+        double val = (var_idx == 0) ? 1.0 : 0.0;
+
+        for (int k = 0; k < td->nb_coeffs[0]; k++)
+            basis_val[0][k] = tdi_basis_eval(s->basis[var_idx][0], BS_EVAL_TYPE_VALUE, r_val, k);
+        for (int k = 0; k < td->nb_coeffs[1]; k++)
+            basis_val[1][k] = tdi_basis_eval(s->basis[var_idx][1], BS_EVAL_TYPE_VALUE, theta_val, k);
+
+        val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], td->coeffs[var_idx],
+                                      basis_val[0], basis_val[1]);
+
+        out[i] = val;
+    }
+
+
+fail:
+    free(basis_val[0]);
+    free(basis_val[1]);
+
+    return ret;
+}
+
+int td_eval_psi(const TDContext *td,
+                size_t nb_coords, const double *r, const double *theta,
+                const unsigned int diff_order[2],
+                double *out)
+{
+    return eval_var(td, 0, nb_coords, r, theta, diff_order, out);
+}
+
+int td_eval_krr(const TDContext *td,
+                size_t nb_coords, const double *r, const double *theta,
+                const unsigned int diff_order[2],
+                double *out)
+{
+    return eval_var(td, 1, nb_coords, r, theta, diff_order, out);
+}
+int td_eval_kpp(const TDContext *td,
+                size_t nb_coords, const double *r, const double *theta,
+                const unsigned int diff_order[2],
+                double *out)
+{
+    return eval_var(td, 2, nb_coords, r, theta, diff_order, out);
+}
-- 
cgit v1.2.3