summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2016-05-12 13:15:02 +0200
committerAnton Khirnov <anton@khirnov.net>2016-05-12 13:15:02 +0200
commit7d8d6b69891738d6837323b973e636ee86c5cdcd (patch)
tree8f1cb552a69f5cdbde1db83b5fb8f864ebe6679d
parenta4915453f05ebdfeccd8f954026096e8f145fd06 (diff)
A major rewrite.
The code is now mostly functional; this commit splits it into several independent subsystems to make it more readable/maintainable.
-rw-r--r--param.ccl8
-rw-r--r--schedule.ccl1
-rw-r--r--src/basis.c32
-rw-r--r--src/basis.h49
-rw-r--r--src/bicgstab.c410
-rw-r--r--src/bicgstab.h60
-rw-r--r--src/common.h30
-rw-r--r--src/make.code.defn2
-rw-r--r--src/pssolve.c389
-rw-r--r--src/pssolve.h116
-rw-r--r--src/qms.c899
-rw-r--r--src/qms.h272
-rw-r--r--src/qms_solve.c880
-rw-r--r--src/qms_solve.h52
-rw-r--r--src/solve.c636
-rw-r--r--src/solve.cl28
-rw-r--r--src/solve_cl.c28
17 files changed, 2202 insertions, 1690 deletions
diff --git a/param.ccl b/param.ccl
index e80c42b..4e2053a 100644
--- a/param.ccl
+++ b/param.ccl
@@ -11,17 +11,17 @@ CCTK_INT basis_order_z "Number of the basis functions in the z direction" STEERA
1: :: ""
} 40
-CCTK_REAL scale_factor "Scaling factor L for the SB basis" STEERABLE=recover
+CCTK_REAL filter_power "" STEERABLE=recover
{
0: :: ""
-} 3.0
+} 64.0
-CCTK_REAL filter_power "" STEERABLE=recover
+CCTK_REAL scale_factor "" STEERABLE=recover
{
0: :: ""
} 64.0
-CCTK_REAL input_filter_power "" STEERABLE=recover
+CCTK_REAL scale_power "" STEERABLE=recover
{
0: :: ""
} 64.0
diff --git a/schedule.ccl b/schedule.ccl
index 07120fc..3923b92 100644
--- a/schedule.ccl
+++ b/schedule.ccl
@@ -5,6 +5,7 @@ SCHEDULE quasimaximal_slicing_axi_eval IN ML_BSSN_evolCalcGroup BEFORE ML_BSSN_l
} "Quasimaximal slicing eval W"
#SCHEDULE quasimaximal_slicing_axi_solve IN ML_BSSN_evolCalcGroup BEFORE quasimaximal_slicing_axi_eval {
+#SCHEDULE quasimaximal_slicing_axi_solve IN MoL_PostStep AFTER ML_BSSN_ApplyBCs {
SCHEDULE quasimaximal_slicing_axi_solve IN MoL_PreStep {
LANG: C
} "Quasimaximal slicing solve W"
diff --git a/src/basis.c b/src/basis.c
index 1ca61d8..4a5a504 100644
--- a/src/basis.c
+++ b/src/basis.c
@@ -1,7 +1,25 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
#include <math.h>
-#include "qms.h"
+#include "basis.h"
+#include "common.h"
/*
* The basis of even (n = 2 * idx) SB functions (Boyd 2000, Ch 17.9)
@@ -10,7 +28,7 @@
*/
static double sb_even_eval(double coord, int idx)
{
- double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+ double val = atan2(SCALE_FACTOR, coord);
idx *= 2; // even only
@@ -19,20 +37,20 @@ static double sb_even_eval(double coord, int idx)
static double sb_even_eval_diff1(double coord, int idx)
{
- double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+ double val = atan2(SCALE_FACTOR, coord);
idx *= 2; // even only
- return - SCALE_FACTOR * (idx + 1) * SGN(coord) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
+ return - SCALE_FACTOR * (idx + 1) * cos((idx + 1) * val) / (SQR(SCALE_FACTOR) + SQR(coord));
}
static double sb_even_eval_diff2(double coord, int idx)
{
- double val = (coord == 0.0) ? M_PI_2 : atan(SCALE_FACTOR / fabs(coord));
+ double val = atan2(SCALE_FACTOR, coord);
idx *= 2; // even only
- return SCALE_FACTOR * (idx + 1) * SGN(coord) * (2 * fabs(coord) * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
+ return SCALE_FACTOR * (idx + 1) * (2 * coord * cos((idx + 1) * val) - SCALE_FACTOR * (idx + 1) * sin((idx + 1) * val)) / SQR(SQR(SCALE_FACTOR) + SQR(coord));
}
static double sb_even_colloc_point(int order, int idx)
@@ -43,7 +61,7 @@ static double sb_even_colloc_point(int order, int idx)
//order *= 2;
//t = (idx + 2) * M_PI / (order + 4);
-#if POLAR
+#if QMS_POLAR
t = (idx + 2) * M_PI / (2 * order + 3);
#else
t = (idx + 2) * M_PI / (2 * order + 2);
diff --git a/src/basis.h b/src/basis.h
new file mode 100644
index 0000000..f076096
--- /dev/null
+++ b/src/basis.h
@@ -0,0 +1,49 @@
+/*
+ * Basis sets for pseudospectral methods
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QMS_BASIS_H
+#define QMS_BASIS_H
+
+/*
+ * A set of basis functions of one variable, described by callbacks that
+ * evaluate the functions, their first two derivatives and the collocation
+ * points associated with the set.
+ */
+typedef struct BasisSet {
+ /* evaluate the idx-th basis function at the specified point */
+ double (*eval) (double coord, int idx);
+ /* evaluate the first derivative of the idx-th basis function at the specified point */
+ double (*eval_diff1)(double coord, int idx);
+ /* evaluate the second derivative of the idx-th basis function at the specified point */
+ double (*eval_diff2)(double coord, int idx);
+ /**
+ * Get the idx-th collocation point for the specified order.
+ * idx runs from 0 to order - 1 (inclusive).
+ */
+ double (*colloc_point)(int order, int idx);
+} BasisSet;
+
+extern const BasisSet qms_cheb_basis;
+extern const BasisSet qms_cheb_even_basis;
+extern const BasisSet qms_full_basis;
+extern const BasisSet qms_tb_even_basis;
+extern const BasisSet qms_sb_even_basis;
+extern const BasisSet qms_sb_odd_basis;
+extern const BasisSet qms_tl_basis;
+extern const BasisSet qms_cos_even_basis;
+
+#define SCALE_FACTOR scale_factor
+extern double scale_factor;
+
+#endif /* QMS_BASIS_H */
diff --git a/src/bicgstab.c b/src/bicgstab.c
new file mode 100644
index 0000000..a81bd74
--- /dev/null
+++ b/src/bicgstab.c
@@ -0,0 +1,410 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include <cblas.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bicgstab.h"
+
+#define BICGSTAB_MAXITER 16
+#define BICGSTAB_TOL (1e-15)
+
+/*
+ * Internal state of the BiCGStab solver. The host-side arrays are used by the
+ * software path, the cl_* buffers by the OpenCL path.
+ */
+struct BiCGStabContext {
+ /* dimension of the NxN system */
+ int N;
+
+ /* current solution estimate: seeded by qms_bicgstab_init() and updated in
+ * place by every solve, so it serves as the initial guess of the next one */
+ double *x;
+ /* N-element work vectors of the iteration */
+ double *p, *v, *y, *z, *t;
+ /* current residual and the residual the iteration started from */
+ double *res, *res0;
+ /* NxN preconditioner matrix, copied in by qms_bicgstab_init() */
+ double *k;
+
+#if HAVE_OPENCL
+ /* OpenCL context and queue supplied by the caller; not owned by the solver */
+ cl_context ocl_ctx;
+ cl_command_queue ocl_queue;
+
+ /* OpenCL buffers corresponding to the host arrays above */
+ cl_mem cl_x;
+ cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+ cl_mem cl_res, cl_res0;
+ cl_mem cl_k, cl_mat;
+ /* buffers receiving the results of the clBLAS dot products */
+ cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
+ /* scratch space passed to the clBLAS reduction routines */
+ cl_mem cl_tmp, cl_tmp1;
+#endif
+};
+
+#if HAVE_OPENCL
+/**
+ * Solve mat · x = rhs with preconditioned BiCGStab, running the BLAS
+ * operations through clBLAS on the context's OpenCL command queue.
+ *
+ * The iteration starts from the solution estimate currently held in
+ * ctx->cl_x and uses the preconditioner held in ctx->cl_k (both are uploaded
+ * by qms_bicgstab_init()).
+ *
+ * @param ctx solver context with its OpenCL buffers allocated
+ * @param mat the NxN system matrix in host memory, column-major
+ * @param rhs the right-hand side in host memory, N elements
+ * @param x   destination for the solution in host memory, N elements
+ *
+ * @return the index of the iteration on which the relative residual dropped
+ *         below BICGSTAB_TOL, or -1 if BICGSTAB_MAXITER iterations were not
+ *         enough (x is not written in that case).
+ */
+static int solve_cl(BiCGStabContext *ctx,
+                    const double *mat, const double *rhs, double *x)
+{
+    cl_command_queue ocl_q = ctx->ocl_queue;
+    const int N = ctx->N;
+    const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+    double rho, rho_prev = 1.0;
+    double omega[2] = { 1.0 };
+    double alpha = 1.0;
+
+    double err;
+    int i;
+
+    cl_event events[8];
+
+    // upload the matrix and RHS
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]);
+    clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
+
+    // initialize the residual
+    clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+                ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+                1, &ocl_q, 2, events, &events[2]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[3]);
+    clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+                        1, &events[2], &events[4]);
+
+    clWaitForEvents(5, events);
+    // BARRIER
+
+    /* the bound must be the same constant that is tested after the loop */
+    for (i = 0; i < BICGSTAB_MAXITER; i++) {
+        clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        if (i) {
+            double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+            clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 0, NULL, &events[0]);
+            clblasDscal(N, beta, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[0], &events[1]);
+            clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+                        1, &ocl_q, 1, &events[1], &events[0]);
+            clWaitForEvents(1, &events[0]);
+            // BARRIER
+        }
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+                            1, &events[0], NULL);
+        // BARRIER
+
+        alpha = rho / alpha;
+
+        clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+                    ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+                    1, &ocl_q, 1, &events[1], &events[0]);
+
+        clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                   ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]);
+        clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+                   ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]);
+
+        clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+                            2, &events[1], NULL);
+        // BARRIER
+
+        omega[0] /= omega[1];
+
+        clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1,
+                    1, &ocl_q, 1, &events[0], &events[1]);
+
+        clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+                    1, &ocl_q, 0, NULL, &events[0]);
+        clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+                    1, &ocl_q, 1, &events[0], &events[2]);
+        clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+                            1, &events[2], NULL);
+        clWaitForEvents(1, &events[1]);
+        // BARRIER
+
+        /* test the residual relative to the RHS, as the software path does */
+        err /= rhs_norm;
+        if (err < BICGSTAB_TOL)
+            break;
+
+        rho_prev = rho;
+    }
+    if (i == BICGSTAB_MAXITER)
+        return -1;
+
+    clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N,
+                        x, 0, NULL, NULL);
+    return i;
+}
+#endif
+
+// based on the wikipedia article
+// and http://www.netlib.org/templates/matlab/bicgstab.m
+/*
+ * Solve mat · x = rhs with preconditioned BiCGStab on the CPU through CBLAS.
+ *
+ * The iteration starts from the estimate held in ctx->x and uses ctx->k as
+ * the preconditioner; ctx->x is updated in place on every iteration, whether
+ * or not the solve converges.
+ *
+ * Returns the index of the iteration on which the residual norm relative to
+ * the norm of rhs dropped below BICGSTAB_TOL, or -1 if BICGSTAB_MAXITER
+ * iterations were not enough. The output array x is written only on success.
+ */
+static int solve_sw(BiCGStabContext *ctx,
+ const double *mat, const double *rhs, double *x)
+{
+ const int N = ctx->N;
+ const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+ double rho, rho_prev = 1.0;
+ double omega = 1.0;
+ double alpha = 1.0;
+
+ double err;
+ int i;
+
+ double *k = ctx->k;
+ double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+ double *res = ctx->res, *res0 = ctx->res0;
+
+ // initialize the residual
+ // res = rhs - mat · x
+ memcpy(res, rhs, N * sizeof(*res));
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+ mat, N, ctx->x, 1, 1.0, res, 1);
+
+ memcpy(res0, res, N * sizeof(*res0));
+ memcpy(p, res, N * sizeof(*p));
+
+ for (i = 0; i < BICGSTAB_MAXITER; i++) {
+ rho = cblas_ddot(N, res, 1, res0, 1);
+
+ if (i) {
+ double beta = (rho / rho_prev) * (alpha / omega);
+
+ // p = res + beta * (p - omega * v)
+ cblas_daxpy(N, -omega, v, 1, p, 1);
+ cblas_dscal(N, beta, p, 1);
+ cblas_daxpy(N, 1, res, 1, p, 1);
+ }
+
+ // y = k · p
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ k, N, p, 1, 0.0, y, 1);
+
+ // v = mat · y
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ mat, N, y, 1, 0.0, v, 1);
+
+ alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+ // res -= alpha * v
+ cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+ // z = k · res, t = mat · z
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ k, N, res, 1, 0.0, z, 1);
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ mat, N, z, 1, 0.0, t, 1);
+
+ omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+ // x += alpha * y + omega * z
+ cblas_daxpy(N, alpha, y, 1, ctx->x, 1);
+ cblas_daxpy(N, omega, z, 1, ctx->x, 1);
+
+ // res -= omega * t
+ cblas_daxpy(N, -omega, t, 1, res, 1);
+
+ err = cblas_dnrm2(N, res, 1) / rhs_norm;
+ if (err < BICGSTAB_TOL)
+ break;
+
+ rho_prev = rho;
+ }
+ if (i == BICGSTAB_MAXITER)
+ return -1;
+
+ memcpy(x, ctx->x, sizeof(*x) * ctx->N);
+
+ return i;
+}
+
+/*
+ * Solve mat · x = rhs, dispatching to the OpenCL implementation when the
+ * context was allocated with an OpenCL context and to the CBLAS one otherwise.
+ *
+ * Returns the non-negative value reported by the underlying solver on success
+ * and a negative value on failure. When QMS_VERIFY is enabled the residual
+ * rhs - mat · x is recomputed on the host and the process is aborted if its
+ * largest component exceeds 1e-11 in magnitude.
+ */
+int qms_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x)
+{
+    int ret;
+
+#if HAVE_OPENCL
+    if (ctx->ocl_ctx)
+        ret = solve_cl(ctx, mat, rhs, x);
+    else
+#endif
+        ret = solve_sw(ctx, mat, rhs, x);
+    if (ret < 0)
+        return ret;
+
+#if QMS_VERIFY
+    {
+        double max_res;
+        double *y = malloc(sizeof(*y) * ctx->N);
+
+        if (!y)
+            return -ENOMEM;
+
+        /* y = rhs - mat · x */
+        memcpy(y, rhs, sizeof(*y) * ctx->N);
+        cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0,
+                    mat, ctx->N, x, 1, 1.0, y, 1);
+        max_res = y[cblas_idamax(ctx->N, y, 1)];
+
+        /* release the scratch vector before deciding, so it is never leaked */
+        free(y);
+
+        /* magnitude test written out by hand: this file does not include <math.h> */
+        if (max_res > 1e-11 || max_res < -1e-11)
+            abort();
+    }
+#endif
+
+    return ret;
+}
+
+/*
+ * Load the preconditioner matrix k (N*N elements) and the initial solution
+ * estimate x0 (N elements) into the solver. With an OpenCL context both are
+ * uploaded to the device and the call waits for the transfers to finish;
+ * otherwise they are copied into the context's host arrays.
+ *
+ * Always returns 0.
+ */
+int qms_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0)
+{
+#if HAVE_OPENCL
+ if (ctx->ocl_ctx) {
+ cl_event events[2];
+ clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double),
+ k, 0, NULL, &events[0]);
+ clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double),
+ x0, 0, NULL, &events[1]);
+ clWaitForEvents(2, events);
+ } else
+#endif
+ {
+ memcpy(ctx->x, x0, ctx->N * sizeof(*x0));
+ memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k));
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a solver context for an NxN system and store it in *pctx.
+ *
+ * When ocl_ctx is non-NULL (and OpenCL support is compiled in) the work
+ * buffers are created on the OpenCL device; otherwise they are 32-byte
+ * aligned host arrays.
+ *
+ * Returns 0 on success. On any allocation failure everything allocated so far
+ * is released, *pctx is left untouched and -ENOMEM is returned.
+ */
+int qms_bicgstab_context_alloc(BiCGStabContext **pctx, int N,
+ cl_context ocl_ctx, cl_command_queue ocl_q)
+{
+ BiCGStabContext *ctx;
+ int ret = 0;
+
+ /* zero-initialised, so the free function is safe after a partial allocation */
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->N = N;
+
+#if HAVE_OPENCL
+ if (ocl_ctx) {
+ ctx->ocl_ctx = ocl_ctx;
+ ctx->ocl_queue = ocl_q;
+
+/* create one device buffer of the given size, bailing out on the first error */
+#define ALLOC(dst, size) \
+do { \
+ ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \
+ if (ret != CL_SUCCESS) \
+ goto fail; \
+} while (0)
+
+ ALLOC(cl_x, N * sizeof(double));
+ ALLOC(cl_p, N * sizeof(double));
+ ALLOC(cl_v, N * sizeof(double));
+ ALLOC(cl_y, N * sizeof(double));
+ ALLOC(cl_z, N * sizeof(double));
+ ALLOC(cl_t, N * sizeof(double));
+ ALLOC(cl_res, N * sizeof(double));
+ ALLOC(cl_res0, N * sizeof(double));
+ ALLOC(cl_tmp, N * sizeof(double));
+ ALLOC(cl_tmp1, N * 2 * sizeof(double));
+
+ ALLOC(cl_k, N * N * sizeof(double));
+ ALLOC(cl_mat, N * N * sizeof(double));
+
+ ALLOC(cl_rho, sizeof(double));
+ ALLOC(cl_alpha, sizeof(double));
+ ALLOC(cl_beta, sizeof(double));
+ ALLOC(cl_omega, 2 * sizeof(double));
+ ALLOC(cl_omega1, sizeof(double));
+ } else
+#endif
+ {
+ /* the results are OR-ed together: any non-zero value means a failure */
+ ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N);
+ }
+
+fail:
+ /* also reached on success, in which case ret is 0 */
+ if (ret) {
+ qms_bicgstab_context_free(&ctx);
+ return -ENOMEM;
+ }
+
+ *pctx = ctx;
+ return 0;
+}
+
+/*
+ * Release the solver context *pctx together with all of its host arrays and
+ * OpenCL buffers, then set *pctx to NULL. Does nothing if *pctx is already
+ * NULL. The OpenCL context and command queue themselves are not released.
+ */
+void qms_bicgstab_context_free(BiCGStabContext **pctx)
+{
+ BiCGStabContext *ctx = *pctx;
+
+ if (!ctx)
+ return;
+
+ free(ctx->x);
+ free(ctx->p);
+ free(ctx->v);
+ free(ctx->y);
+ free(ctx->z);
+ free(ctx->t);
+ free(ctx->res);
+ free(ctx->res0);
+ free(ctx->k);
+
+#if HAVE_OPENCL
+ if (ctx->ocl_ctx) {
+ clReleaseMemObject(ctx->cl_x);
+ clReleaseMemObject(ctx->cl_p);
+ clReleaseMemObject(ctx->cl_v);
+ clReleaseMemObject(ctx->cl_y);
+ clReleaseMemObject(ctx->cl_z);
+ clReleaseMemObject(ctx->cl_t);
+ clReleaseMemObject(ctx->cl_res);
+ clReleaseMemObject(ctx->cl_res0);
+ clReleaseMemObject(ctx->cl_tmp);
+ clReleaseMemObject(ctx->cl_tmp1);
+
+ clReleaseMemObject(ctx->cl_k);
+ clReleaseMemObject(ctx->cl_mat);
+
+ clReleaseMemObject(ctx->cl_rho);
+ clReleaseMemObject(ctx->cl_alpha);
+ clReleaseMemObject(ctx->cl_beta);
+ clReleaseMemObject(ctx->cl_omega);
+ clReleaseMemObject(ctx->cl_omega1);
+ }
+#endif
+
+ free(ctx);
+ *pctx = NULL;
+}
diff --git a/src/bicgstab.h b/src/bicgstab.h
new file mode 100644
index 0000000..ee3cff1
--- /dev/null
+++ b/src/bicgstab.h
@@ -0,0 +1,60 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QMS_BICGSTAB_H
+#define QMS_BICGSTAB_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+typedef struct BiCGStabContext BiCGStabContext;
+
+/**
+ * Allocate and initialize the solver for the NxN system.
+ *
+ * If the OpenCL context and command queue are provided (non-NULL), the solver
+ * will run using clBLAS.
+ */
+int qms_bicgstab_context_alloc(BiCGStabContext **ctx, int N,
+ cl_context ocl_ctx, cl_command_queue ocl_q);
+
+/**
+ * Free the solver and all its internal state.
+ */
+void qms_bicgstab_context_free(BiCGStabContext **ctx);
+
+/**
+ * Initialise the solver with the given preconditioner matrix. This function
+ * may be called any number of times on a given solver context.
+ */
+int qms_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0);
+
+/**
+ * Solve the linear system
+ * mat · x = rhs
+ * The result is written into x.
+ */
+int qms_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x);
+
+#endif /* QMS_BICGSTAB_H */
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..44b0674
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,30 @@
+#ifndef QMS_COMMON_H
+#define QMS_COMMON_H
+
+#define HAVE_OPENCL 0
+#define QMS_VERIFY 0
+#define QMS_POLAR 0
+#define QMS_CCZ4 0
+
+/* arithmetic helpers; note that each of them evaluates its arguments more than once */
+#define SQR(x) ((x) * (x))
+#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) > (y) ? (y) : (x))
+/* number of elements of a true array (not of a pointer) */
+#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*(arr)))
+
+/*
+ * small number to avoid r=0 singularities
+ */
+#define EPS 1E-08
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/time.h>
+/* Return the current time reported by gettimeofday(), in microseconds. */
+static inline int64_t gettime(void)
+{
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ /* widen the seconds before scaling so the product is computed in 64 bits */
+ return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+}
+
+#endif /* QMS_COMMON_H */
diff --git a/src/make.code.defn b/src/make.code.defn
index f31ed36..2863409 100644
--- a/src/make.code.defn
+++ b/src/make.code.defn
@@ -1,7 +1,7 @@
# Main make.code.defn file for thorn MaximalSlicingAxi
# Source files in this directory
-SRCS = basis.c qms.c register.c solve.c
+SRCS = basis.c bicgstab.c qms.c qms_solve.c pssolve.c register.c
# Subdirectories containing source files
SUBDIRS =
diff --git a/src/pssolve.c b/src/pssolve.c
new file mode 100644
index 0000000..5f2d941
--- /dev/null
+++ b/src/pssolve.c
@@ -0,0 +1,389 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cblas.h>
+#include <lapacke.h>
+
+#include "bicgstab.h"
+#include "pssolve.h"
+
+#define NB_COEFFS(priv) (priv->nb_coeffs[0] * priv->nb_coeffs[1])
+#define NB_COLLOC_POINTS(priv) (priv->nb_colloc_points[0] * priv->nb_colloc_points[1])
+
+/* Private state of the pseudospectral solver. */
+struct PSSolvePriv {
+ /* iterative solver tried before falling back to an LU inversion */
+ BiCGStabContext *bicgstab;
+ /* number of successful BiCGStab solves since the last LU inversion;
+ * starts at INT_MAX so that the first solve goes through LU */
+ int steps_since_inverse;
+
+ /* per-direction sizes, all taken from PSSolveContext.solve_order */
+ int nb_coeffs[2];
+ int nb_colloc_points[2];
+ int colloc_grid_order[2];
+
+ /* for each derivative order, the basis function products evaluated on the
+ * collocation grid, indexed as [idx_grid + NB_COLLOC_POINTS * idx_coeff] */
+ double *basis_val[PSSOLVE_DIFF_ORDER_NB];
+
+ /* pivot indices handed to LAPACK */
+ int *ipiv;
+ /* the system matrix built for each solve; replaced by its inverse after
+ * an LU solve */
+ double *mat;
+};
+
+/*
+ * Build the pseudospectral matrix in ctx->priv->mat: the entry for a given
+ * (collocation point, coefficient) pair is the sum over all derivative orders
+ * of the equation coefficient at that point times the matching precomputed
+ * basis value.
+ *
+ * eq_coeffs holds one array per PSSolveDiffOrder, each with one value per
+ * collocation point. The pointer array is const-qualified to match what
+ * qms_pssolve_solve() passes in.
+ *
+ * Always returns 0.
+ */
+static int construct_matrix(PSSolveContext *ctx,
+                            const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB])
+{
+    double *mat = ctx->priv->mat;
+    int idx_coeff, idx_grid;
+
+//#pragma omp parallel for
+    for (idx_coeff = 0; idx_coeff < NB_COEFFS(ctx->priv); idx_coeff++)
+        for (idx_grid = 0; idx_grid < NB_COLLOC_POINTS(ctx->priv); idx_grid++) {
+            const int idx = idx_grid + NB_COLLOC_POINTS(ctx->priv) * idx_coeff;
+            double val = 0.0;
+
+            for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++)
+                val += eq_coeffs[i][idx_grid] * ctx->priv->basis_val[i][idx];
+
+            mat[idx] = val;
+        }
+
+    return 0;
+}
+
+/*
+ * Solve the NxN column-major system mat · x = rhs through LAPACK's LU
+ * factorization and compute the inverse of mat.
+ *
+ * On return rhs holds the solution and mat holds the inverse. The condition
+ * number estimate and the error bounds reported by LAPACK are printed to
+ * stderr.
+ *
+ * Returns 0 on success and -ENOMEM if the temporary buffers cannot be
+ * allocated (mat and rhs are left untouched in that case).
+ */
+static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
+{
+    char equed = 'N';
+    double cond, ferr, berr, rpivot;
+
+    double *mat_f, *x;
+    int ret = 0;
+#if 0
+    LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
+                  mat, N, ipiv, rhs, N);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
+#else
+    /* do the size arithmetic in size_t so that N * N cannot overflow an int */
+    mat_f = malloc((size_t)N * N * sizeof(*mat_f));
+    x     = malloc((size_t)N * sizeof(*x));
+    if (!mat_f || !x) {
+        free(mat_f);
+        free(x);
+        return -ENOMEM;
+    }
+
+    /* dgesvx leaves the LU factors in mat_f, which dgetri then inverts in place */
+    LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
+                   mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
+                   rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
+    LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
+    memcpy(rhs, x, (size_t)N * sizeof(double));
+    memcpy(mat, mat_f, (size_t)N * N * sizeof(double));
+
+    /* N is an int, so it has to be printed with %d */
+    fprintf(stderr, "LU factorization solution to a %dx%d matrix: "
+            "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
+            N, N, cond, ferr, berr);
+
+    free(mat_f);
+    free(x);
+#endif
+
+    return ret;
+}
+
+/*
+ * Solve the linear PDE described by eq_coeffs and rhs for the spectral
+ * coefficients, which are written into coeffs.
+ *
+ * The matrix is rebuilt from eq_coeffs on every call. BiCGStab is tried first,
+ * as long as fewer than 1024 iterative solves were done since the last LU
+ * inversion. If it is skipped or fails, the system is solved by LU instead and
+ * the resulting inverse and solution are handed to BiCGStab as the
+ * preconditioner and initial guess for subsequent calls.
+ *
+ * The timing and count statistics in ctx are updated along the way.
+ *
+ * Returns a non-negative value on success and a negative error code on
+ * failure.
+ */
+int qms_pssolve_solve(PSSolveContext *ctx,
+                      const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB],
+                      const double *rhs, double *coeffs)
+{
+    PSSolvePriv *s = ctx->priv;
+    const int N = NB_COEFFS(s);
+    int64_t start;
+    int ret;
+
+    /* fill the matrix */
+    CCTK_TimerStart("QuasiMaximalSlicing_construct_matrix");
+    start = gettime();
+    ret = construct_matrix(ctx, eq_coeffs);
+    ctx->construct_matrix_time += gettime() - start;
+    ctx->construct_matrix_count++;
+    CCTK_TimerStop("QuasiMaximalSlicing_construct_matrix");
+    if (ret < 0)
+        return ret;
+
+    /* solve for the coeffs */
+    if (s->steps_since_inverse < 1024) {
+        start = gettime();
+
+        CCTK_TimerStart("QuasiMaximalSlicing_solve_BiCGSTAB");
+        ret = qms_bicgstab_solve(s->bicgstab, s->mat, rhs, coeffs);
+        CCTK_TimerStop("QuasiMaximalSlicing_solve_BiCGSTAB");
+
+        if (ret >= 0) {
+            ctx->cg_time_total += gettime() - start;
+            ctx->cg_solve_count++;
+            /* the solver reports a zero-based iteration index */
+            ctx->cg_iter_count += ret + 1;
+            s->steps_since_inverse++;
+        }
+    } else
+        ret = -1;
+
+    if (ret < 0) {
+        CCTK_TimerStart("QuasiMaximalSlicing_solve_LU");
+        start = gettime();
+
+        /* lu_invert() solves in place, so seed the output with the RHS */
+        memcpy(coeffs, rhs, N * sizeof(*rhs));
+
+        ret = lu_invert(N, s->mat, coeffs, s->ipiv);
+        ctx->lu_solves_time += gettime() - start;
+        ctx->lu_solves_count++;
+        CCTK_TimerStop("QuasiMaximalSlicing_solve_LU");
+        /* report an LU failure instead of overwriting it with the init result */
+        if (ret < 0)
+            return ret;
+
+        /* s->mat now holds the inverse, which becomes the preconditioner */
+        ret = qms_bicgstab_init(s->bicgstab, s->mat, coeffs);
+
+        s->steps_since_inverse = 0;
+    }
+
+    return ret;
+}
+
+/*
+ * Finish setting up a solver context after the caller has filled in
+ * ctx->basis and ctx->solve_order: allocate the BiCGStab solver, compute the
+ * collocation grid in both directions and precompute the values of the basis
+ * function products (and their derivatives) on that grid.
+ *
+ * Returns 0 on success, -EINVAL if a solve order is not positive and a
+ * negative error code (-ENOMEM for the allocations done here) on failure.
+ * Whatever was stored in ctx before a failure is released by
+ * qms_pssolve_context_free().
+ */
+int qms_pssolve_context_init(PSSolveContext *ctx)
+{
+    PSSolvePriv *s = ctx->priv;
+    /* per-direction values of the 1D basis functions and their derivatives */
+    double *basis_val[2][3] = { { NULL } };
+
+    int ret = 0;
+
+    if (ctx->solve_order[0] <= 0 || ctx->solve_order[1] <= 0)
+        return -EINVAL;
+    s->nb_coeffs[0] = ctx->solve_order[0];
+    s->nb_coeffs[1] = ctx->solve_order[1];
+    s->nb_colloc_points[0] = ctx->solve_order[0];
+    s->nb_colloc_points[1] = ctx->solve_order[1];
+    s->colloc_grid_order[0] = ctx->solve_order[0];
+    s->colloc_grid_order[1] = ctx->solve_order[1];
+
+    /* forces the first solve to go through the LU path */
+    s->steps_since_inverse = INT_MAX;
+
+    /* init the BiCGStab solver */
+    ret = qms_bicgstab_context_alloc(&s->bicgstab, NB_COEFFS(s), ctx->ocl_ctx,
+                                     ctx->ocl_queue);
+    if (ret < 0)
+        return ret;
+
+    /* compute the collocation grid */
+    for (int i = 0; i < 2; i++) {
+        if (posix_memalign((void**)&ctx->colloc_grid[i], 32,
+                           s->nb_colloc_points[i] * sizeof(*ctx->colloc_grid[i]))) {
+            /* the pointer is not guaranteed to be usable after a failure */
+            ctx->colloc_grid[i] = NULL;
+            return -ENOMEM;
+        }
+    }
+
+    for (int i = 0; i < s->nb_colloc_points[0]; i++)
+        ctx->colloc_grid[0][i] = ctx->basis[0]->colloc_point(s->colloc_grid_order[0], i);
+    for (int i = 0; i < s->nb_colloc_points[1]; i++)
+        ctx->colloc_grid[1][i] = ctx->basis[1]->colloc_point(s->colloc_grid_order[1], i);
+
+    /* precompute the basis values we will need */
+    for (int i = 0; i < ARRAY_ELEMS(basis_val); i++) {
+        for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++) {
+            /* no local 'ret' here: the error must reach the value returned below */
+            if (posix_memalign((void**)&basis_val[i][j], 32,
+                               sizeof(*basis_val[i][j]) * s->nb_coeffs[i] * s->nb_colloc_points[i])) {
+                basis_val[i][j] = NULL;
+                ret = -ENOMEM;
+                goto fail;
+            }
+        }
+
+        for (int j = 0; j < s->nb_colloc_points[i]; j++) {
+            double coord = ctx->colloc_grid[i][j];
+            for (int k = 0; k < s->nb_coeffs[i]; k++) {
+                basis_val[i][0][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval      (coord, k);
+                basis_val[i][1][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff1(coord, k);
+                basis_val[i][2][j * s->nb_coeffs[i] + k] = ctx->basis[i]->eval_diff2(coord, k);
+            }
+        }
+    }
+
+    for (int i = 0; i < ARRAY_ELEMS(s->basis_val); i++) {
+        if (posix_memalign((void**)&s->basis_val[i], 32,
+                           NB_COLLOC_POINTS(s) * NB_COEFFS(s) * sizeof(*s->basis_val[i]))) {
+            s->basis_val[i] = NULL;
+            ret = -ENOMEM;
+            goto fail;
+        }
+    }
+
+    for (int i = 0; i < s->nb_colloc_points[1]; i++) {
+        const double *basis_z   = basis_val[1][0] + i * s->nb_coeffs[1];
+        const double *dbasis_z  = basis_val[1][1] + i * s->nb_coeffs[1];
+        const double *d2basis_z = basis_val[1][2] + i * s->nb_coeffs[1];
+
+        for (int j = 0; j < s->nb_colloc_points[0]; j++) {
+            const double *basis_x   = basis_val[0][0] + j * s->nb_coeffs[0];
+            const double *dbasis_x  = basis_val[0][1] + j * s->nb_coeffs[0];
+            const double *d2basis_x = basis_val[0][2] + j * s->nb_coeffs[0];
+            const int idx_grid = i * s->nb_colloc_points[0] + j;
+
+#if QMS_POLAR
+            double r = ctx->colloc_grid[0][j];
+            double theta = ctx->colloc_grid[1][i];
+
+            double x = r * cos(theta);
+            double z = r * sin(theta);
+#else
+            double x = ctx->colloc_grid[0][j];
+            double z = ctx->colloc_grid[1][i];
+#endif
+
+            for (int k = 0; k < s->nb_coeffs[1]; k++)
+                for (int l = 0; l < s->nb_coeffs[0]; l++) {
+                    const int idx_coeff = k * s->nb_coeffs[0] + l;
+                    const int idx = idx_grid + NB_COLLOC_POINTS(s) * idx_coeff;
+                    s->basis_val[PSSOLVE_DIFF_ORDER_00][idx] = basis_x[l] * basis_z[k];
+#if QMS_POLAR
+                    s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                                + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k]
+                                                                + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
+                                                                - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] = ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                                + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k]
+                                                                - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
+                                                                + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+                    s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] = ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k]
+                                                                - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k]
+                                                                - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k]
+                                                                + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
+#else
+                    s->basis_val[PSSOLVE_DIFF_ORDER_10][idx] = dbasis_x[l] * basis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_01][idx] = basis_x[l] * dbasis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_20][idx] = d2basis_x[l] * basis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_02][idx] = basis_x[l] * d2basis_z[k];
+                    s->basis_val[PSSOLVE_DIFF_ORDER_11][idx] = dbasis_x[l] * dbasis_z[k];
+#endif
+                }
+        }
+    }
+
+    if (posix_memalign((void**)&s->ipiv, 32, sizeof(*s->ipiv) * NB_COEFFS(s))) {
+        s->ipiv = NULL;
+        ret = -ENOMEM;
+        goto fail;
+    }
+    if (posix_memalign((void**)&s->mat, 32, sizeof(*s->mat) * NB_COEFFS(s) * NB_COLLOC_POINTS(s))) {
+        s->mat = NULL;
+        ret = -ENOMEM;
+        goto fail;
+    }
+
+    ret = 0;
+
+fail:
+    /* the per-direction tables are only needed while building s->basis_val */
+    for (int i = 0; i < ARRAY_ELEMS(basis_val); i++)
+        for (int j = 0; j < ARRAY_ELEMS(basis_val[i]); j++)
+            free(basis_val[i][j]);
+
+    return ret;
+}
+
+/*
+ * Allocate a zero-initialised solver context together with its private data
+ * and store it in *pctx.
+ *
+ * Returns 0 on success, or -ENOMEM if either allocation fails, in which case
+ * *pctx is left untouched.
+ */
+int qms_pssolve_context_alloc(PSSolveContext **pctx)
+{
+ PSSolveContext *ctx = calloc(1, sizeof(*ctx));
+
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->priv = calloc(1, sizeof(*ctx->priv));
+ if (!ctx->priv)
+ goto fail;
+
+ *pctx = ctx;
+ return 0;
+fail:
+ /* releases the partially constructed context */
+ qms_pssolve_context_free(&ctx);
+ return -ENOMEM;
+}
+
+/*
+ * Release the solver context *pctx with everything it owns (precomputed basis
+ * values, matrix, pivot array, BiCGStab solver and collocation grids), then
+ * set *pctx to NULL. Does nothing if *pctx is already NULL.
+ */
+void qms_pssolve_context_free(PSSolveContext **pctx)
+{
+ PSSolveContext *ctx = *pctx;
+
+ if (!ctx)
+ return;
+
+ /* priv may be NULL when called from a failed qms_pssolve_context_alloc() */
+ if (ctx->priv) {
+ for (int i = 0; i < ARRAY_ELEMS(ctx->priv->basis_val); i++)
+ free(ctx->priv->basis_val[i]);
+
+ free(ctx->priv->ipiv);
+ free(ctx->priv->mat);
+
+ qms_bicgstab_context_free(&ctx->priv->bicgstab);
+ }
+
+ free(ctx->priv);
+
+ free(ctx->colloc_grid[0]);
+ free(ctx->colloc_grid[1]);
+
+ free(ctx);
+ *pctx = NULL;
+}
diff --git a/src/pssolve.h b/src/pssolve.h
new file mode 100644
index 0000000..156bdcd
--- /dev/null
+++ b/src/pssolve.h
@@ -0,0 +1,116 @@
+/*
+ * Pseudospectral 2nd order 2D linear PDE solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QMS_PSSOLVE_H
+#define QMS_PSSOLVE_H
+
+#include "common.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#else
+typedef void* cl_context;
+typedef void* cl_command_queue;
+#endif
+
+#include <stdint.h>
+
+#include "basis.h"
+
+enum PSSolveDiffOrder {
+ PSSOLVE_DIFF_ORDER_00, /* presumably the undifferentiated term; its basis values are filled in outside the visible code -- confirm */
+ PSSOLVE_DIFF_ORDER_10, /* first derivative in direction 0 (x) */
+ PSSOLVE_DIFF_ORDER_01, /* first derivative in direction 1 (z) */
+ PSSOLVE_DIFF_ORDER_11, /* mixed second derivative, once in each direction */
+ PSSOLVE_DIFF_ORDER_20, /* second derivative in direction 0 (x) */
+ PSSOLVE_DIFF_ORDER_02, /* second derivative in direction 1 (z) */
+ PSSOLVE_DIFF_ORDER_NB, /* number of entries above; length of the eq_coeffs array taken by qms_pssolve_solve() */
+};
+
+typedef struct PSSolvePriv PSSolvePriv;
+
+typedef struct PSSolveContext {
+ /**
+ * Solver private data, not to be touched by the caller.
+ * Allocated by qms_pssolve_context_alloc() and released by
+ * qms_pssolve_context_free().
+ */
+ PSSolvePriv *priv;
+
+ /**
+ * The basis sets to be used in each direction.
+ * Set by the caller before qms_pssolve_context_init().
+ */
+ const BasisSet *basis[2];
+
+ /**
+ * Order of the solver in each direction.
+ * Set by the caller before qms_pssolve_context_init().
+ */
+ int solve_order[2];
+
+ /**
+ * Locations of the collocation points in each direction. The equation
+ * coefficients passed to qms_pssolve_solve() should be evaluated at those
+ * grid positions.
+ *
+ * Set by the solver after qms_pssolve_context_init(). Each array is
+ * solve_order[i]-sized.
+ *
+ * Both arrays are released by qms_pssolve_context_free(), so the caller
+ * must not free them.
+ */
+ double *colloc_grid[2];
+
+ cl_context ocl_ctx; /* OpenCL context handle (a plain void* when built without HAVE_OPENCL). NOTE(review): who sets it is not visible here -- confirm */
+ cl_command_queue ocl_queue; /* OpenCL command queue handle (a plain void* when built without HAVE_OPENCL). NOTE(review): who sets it is not visible here -- confirm */
+
+ uint64_t lu_solves_count; /* NOTE(review): presumably the number of LU solves performed so far -- confirm in pssolve.c */
+ uint64_t lu_solves_time; /* NOTE(review): presumably the accumulated time spent in LU solves, units unknown -- confirm */
+
+ uint64_t cg_solve_count; /* NOTE(review): presumably the number of iterative (BiCGSTAB) solves -- confirm */
+ uint64_t cg_iter_count; /* NOTE(review): presumably the total iteration count over those solves -- confirm */
+ uint64_t cg_time_total; /* NOTE(review): presumably the accumulated time spent in iterative solves, units unknown -- confirm */
+
+ uint64_t construct_matrix_count; /* NOTE(review): presumably the number of times the collocation matrix was built -- confirm */
+ uint64_t construct_matrix_time; /* NOTE(review): presumably the accumulated time spent building it, units unknown -- confirm */
+} PSSolveContext;
+
+/**
+ * Allocate a new solver.
+ *
+ * The context and its private data are zero-initialized.
+ *
+ * @param ctx on success the newly allocated context is written here; it is
+ * left untouched on failure.
+ * @return 0 on success, -ENOMEM if an allocation failed.
+ */
+int qms_pssolve_context_alloc(PSSolveContext **ctx);
+
+/**
+ * Initialize the solver for use after all the context options have been set.
+ *
+ * PSSolveContext.basis and PSSolveContext.solve_order must be set by the
+ * caller beforehand; PSSolveContext.colloc_grid is set by the solver.
+ *
+ * NOTE(review): the return convention is not visible from this header --
+ * presumably 0 on success and a negative error code on failure; confirm in
+ * pssolve.c.
+ */
+int qms_pssolve_context_init(PSSolveContext *ctx);
+
+/**
+ * Free the solver and all its internal state.
+ *
+ * This includes the collocation grids stored in the context.
+ *
+ * @param ctx pointer to the context pointer. *ctx is set to NULL once the
+ * context has been freed; nothing is done if *ctx is already NULL.
+ */
+void qms_pssolve_context_free(PSSolveContext **ctx);
+
+/**
+ * Solve a second order linear PDE in 2D with a pseudospectral method.
+ *
+ * @param ctx the solver context.
+ * @param eq_coeffs the coefficients of each derivative term at the collocation
+ * points.
+ * The array has PSSOLVE_DIFF_ORDER_NB entries, one per
+ * enum PSSolveDiffOrder value.
+ * @param rhs the right-hand side of the equation at the collocation points.
+ * @param coeffs the spectral coefficients of the solution will be written here.
+ *
+ * NOTE(review): the return convention is not visible from this header --
+ * presumably 0 on success and a negative error code on failure; confirm in
+ * pssolve.c.
+ */
+int qms_pssolve_solve(PSSolveContext *ctx,
+ const double * const eq_coeffs[PSSOLVE_DIFF_ORDER_NB],
+ const double *rhs, double *coeffs);
+
+#endif /* QMS_PSSOLVE_H */
diff --git a/src/qms.c b/src/qms.c
index 7332017..95c645c 100644
--- a/src/qms.c
+++ b/src/qms.c
@@ -1,3 +1,5 @@
+#include "common.h"
+
#include <ctype.h>
#include <errno.h>
#include <float.h>
@@ -9,10 +11,6 @@
#include <string.h>
#include <cblas.h>
-#include <lapacke.h>
-
-#include <cl.h>
-#include <clBLAS.h>
#include "cctk.h"
#include "cctk_Arguments.h"
@@ -21,588 +19,19 @@
#include "util_Table.h"
#include "qms.h"
-
-#define ACC_TEST 0
+#include "qms_solve.h"
double scale_factor;
-/* mapping between our indices and thorn names */
-static const char *metric_vars[] = {
-#if CCZ4
- [GTXX] = "ML_CCZ4::gt11",
- [GTYY] = "ML_CCZ4::gt22",
- [GTZZ] = "ML_CCZ4::gt33",
- [GTXY] = "ML_CCZ4::gt12",
- [GTXZ] = "ML_CCZ4::gt13",
- [GTYZ] = "ML_CCZ4::gt23",
- [ATXX] = "ML_CCZ4::At11",
- [ATYY] = "ML_CCZ4::At22",
- [ATZZ] = "ML_CCZ4::At33",
- [ATXY] = "ML_CCZ4::At12",
- [ATXZ] = "ML_CCZ4::At13",
- [ATYZ] = "ML_CCZ4::At23",
- [PHI] = "ML_CCZ4::phi",
- [K] = "ML_CCZ4::trK",
- [XTX] = "ML_CCZ4::Xt1",
- [XTY] = "ML_CCZ4::Xt2",
- [XTZ] = "ML_CCZ4::Xt3",
- [BETAX] = "ML_CCZ4::beta1",
- [BETAY] = "ML_CCZ4::beta2",
- [BETAZ] = "ML_CCZ4::beta3",
- [ALPHA] = "ML_CCZ4::alpha",
- [KDOT_XX] = "ML_CCZ4::Kdot11",
- [KDOT_YY] = "ML_CCZ4::Kdot22",
- [KDOT_ZZ] = "ML_CCZ4::Kdot33",
- [KDOT_XY] = "ML_CCZ4::Kdot12",
- [KDOT_XZ] = "ML_CCZ4::Kdot13",
- [KDOT_YZ] = "ML_CCZ4::Kdot23",
- [XTDOT_X] = "ML_CCZ4::Xtdot1",
- [XTDOT_Y] = "ML_CCZ4::Xtdot2",
- [XTDOT_Z] = "ML_CCZ4::Xtdot3",
- [PHIDOT] = "ML_CCZ4::phidot",
-#else
- [GTXX] = "ML_BSSN::gt11",
- [GTYY] = "ML_BSSN::gt22",
- [GTZZ] = "ML_BSSN::gt33",
- [GTXY] = "ML_BSSN::gt12",
- [GTXZ] = "ML_BSSN::gt13",
- [GTYZ] = "ML_BSSN::gt23",
- [ATXX] = "ML_BSSN::At11",
- [ATYY] = "ML_BSSN::At22",
- [ATZZ] = "ML_BSSN::At33",
- [ATXY] = "ML_BSSN::At12",
- [ATXZ] = "ML_BSSN::At13",
- [ATYZ] = "ML_BSSN::At23",
- [PHI] = "ML_BSSN::phi",
- [K] = "ML_BSSN::trK",
- [XTX] = "ML_BSSN::Xt1",
- [XTY] = "ML_BSSN::Xt2",
- [XTZ] = "ML_BSSN::Xt3",
- [BETAX] = "ML_BSSN::beta1",
- [BETAY] = "ML_BSSN::beta2",
- [BETAZ] = "ML_BSSN::beta3",
- [ALPHA] = "ML_BSSN::alpha",
- //[ALPHA] = "ADMBase::alp",
- [KDOT_XX] = "ML_BSSN::Kdot11",
- [KDOT_YY] = "ML_BSSN::Kdot22",
- [KDOT_ZZ] = "ML_BSSN::Kdot33",
- [KDOT_XY] = "ML_BSSN::Kdot12",
- [KDOT_XZ] = "ML_BSSN::Kdot13",
- [KDOT_YZ] = "ML_BSSN::Kdot23",
- [XTDOT_X] = "ML_BSSN::Xtdot1",
- [XTDOT_Y] = "ML_BSSN::Xtdot2",
- [XTDOT_Z] = "ML_BSSN::Xtdot3",
- [PHIDOT] = "ML_BSSN::phidot",
-#endif
-};
-
-/* mapping between the cactus grid values and interpolated values */
-static const CCTK_INT interp_operation_indices[] = {
- [I_GTXX] = GTXX,
- [I_GTYY] = GTYY,
- [I_GTZZ] = GTZZ,
- [I_GTXY] = GTXY,
- [I_GTXZ] = GTXZ,
- [I_GTYZ] = GTYZ,
- [I_PHI] = PHI,
- [I_PHI_DX] = PHI,
- [I_PHI_DY] = PHI,
- [I_PHI_DZ] = PHI,
- [I_ATXX] = ATXX,
- [I_ATYY] = ATYY,
- [I_ATZZ] = ATZZ,
- [I_ATXY] = ATXY,
- [I_ATXZ] = ATXZ,
- [I_ATYZ] = ATYZ,
- [I_K] = K,
- [I_K_DX] = K,
- [I_K_DY] = K,
- [I_K_DZ] = K,
- [I_XTX] = XTX,
- [I_XTY] = XTY,
- [I_XTZ] = XTZ,
- [I_BETAX] = BETAX,
- [I_BETAY] = BETAY,
- [I_BETAZ] = BETAZ,
- [I_ALPHA] = ALPHA,
- [I_ALPHA_DX] = ALPHA,
- [I_ALPHA_DY] = ALPHA,
- [I_ALPHA_DZ] = ALPHA,
- [I_ALPHA_DXX] = ALPHA,
- [I_ALPHA_DYY] = ALPHA,
- [I_ALPHA_DZZ] = ALPHA,
- [I_ALPHA_DXY] = ALPHA,
- [I_ALPHA_DXZ] = ALPHA,
- [I_ALPHA_DYZ] = ALPHA,
- [I_KDOT_XX] = KDOT_XX,
- [I_KDOT_YY] = KDOT_YY,
- [I_KDOT_ZZ] = KDOT_ZZ,
- [I_KDOT_XY] = KDOT_XY,
- [I_KDOT_XZ] = KDOT_XZ,
- [I_KDOT_YZ] = KDOT_YZ,
- [I_XTDOT_X] = XTDOT_X,
- [I_XTDOT_Y] = XTDOT_Y,
- [I_XTDOT_Z] = XTDOT_Z,
- [I_PHIDOT] = PHIDOT,
- [I_PHIDOT_DX] = PHIDOT,
- [I_PHIDOT_DY] = PHIDOT,
- [I_PHIDOT_DZ] = PHIDOT,
-};
-
-/* the operation (plain value or x/y/z-derivative) to apply during interpolation */
-static const CCTK_INT interp_operation_codes[] = {
- [I_GTXX] = 0,
- [I_GTYY] = 0,
- [I_GTZZ] = 0,
- [I_GTXY] = 0,
- [I_GTXZ] = 0,
- [I_GTYZ] = 0,
- [I_PHI] = 0,
- [I_PHI_DX] = 1,
- [I_PHI_DY] = 2,
- [I_PHI_DZ] = 3,
- [I_ATXX] = 0,
- [I_ATYY] = 0,
- [I_ATZZ] = 0,
- [I_ATXY] = 0,
- [I_ATXZ] = 0,
- [I_ATYZ] = 0,
- [I_K] = 0,
- [I_K_DX] = 1,
- [I_K_DY] = 2,
- [I_K_DZ] = 3,
- [I_XTX] = 0,
- [I_XTY] = 0,
- [I_XTZ] = 0,
- [I_BETAX] = 0,
- [I_BETAY] = 0,
- [I_BETAZ] = 0,
- [I_ALPHA] = 0,
- [I_ALPHA_DX] = 1,
- [I_ALPHA_DY] = 2,
- [I_ALPHA_DZ] = 3,
- [I_ALPHA_DXX] = 11,
- [I_ALPHA_DYY] = 22,
- [I_ALPHA_DZZ] = 33,
- [I_ALPHA_DXY] = 12,
- [I_ALPHA_DXZ] = 13,
- [I_ALPHA_DYZ] = 23,
- [I_KDOT_XX] = 0,
- [I_KDOT_YY] = 0,
- [I_KDOT_ZZ] = 0,
- [I_KDOT_XY] = 0,
- [I_KDOT_XZ] = 0,
- [I_KDOT_YZ] = 0,
- [I_XTDOT_X] = 0,
- [I_XTDOT_Y] = 0,
- [I_XTDOT_Z] = 0,
- [I_PHIDOT] = 0,
- [I_PHIDOT_DX] = 1,
- [I_PHIDOT_DY] = 2,
- [I_PHIDOT_DZ] = 3,
-};
-
-static void init_opencl(MaximalSlicingContext *ms)
-{
- int err, count;
- cl_platform_id platform;
- cl_context_properties props[3];
-
- err = clGetPlatformIDs(1, &platform, &count);
- if (err != CL_SUCCESS || count < 1) {
- fprintf(stderr, "Could not get an OpenCL platform ID\n");
- return;
- }
-
- err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ms->ocl_device, &count);
- if (err != CL_SUCCESS || count < 1) {
- fprintf(stderr, "Could not get an OpenCL device ID\n");
- return;
- }
-
- props[0] = CL_CONTEXT_PLATFORM;
- props[1] = (cl_context_properties)platform;
- props[2] = 0;
-
- ms->cl_ctx = clCreateContext(props, 1, &ms->ocl_device, NULL, NULL, &err);
- if (err != CL_SUCCESS || !ms->cl_ctx) {
- fprintf(stderr, "Could not create an OpenCL context\n");
- return;
- }
-
- ms->cl_queue = clCreateCommandQueue(ms->cl_ctx, ms->ocl_device, 0, &err);
- if (err != CL_SUCCESS || !ms->cl_queue) {
- fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
- goto fail;
- }
-
- err = clblasSetup();
- if (err != CL_SUCCESS) {
- fprintf(stderr, "Error setting up clBLAS\n");
- goto fail;
- }
-
- ms->ocl_coeffs = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
-
- ms->bicgstab.cl_p = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_v = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_y = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_z = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_t = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_res = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_res0 = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_tmp = clCreateBuffer(ms->cl_ctx, 0, ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_tmp1 = clCreateBuffer(ms->cl_ctx, 0, 2 * ms->nb_coeffs * sizeof(double), NULL, &err);
-
- ms->bicgstab.cl_k = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
- ms->bicgstab.cl_mat = clCreateBuffer(ms->cl_ctx, 0, ms->nb_colloc_points * ms->nb_coeffs * sizeof(double), NULL, &err);
-
- ms->bicgstab.cl_rho = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
- ms->bicgstab.cl_alpha = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
- ms->bicgstab.cl_beta = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
- ms->bicgstab.cl_omega = clCreateBuffer(ms->cl_ctx, 0, 2 * sizeof(double), NULL, &err);
- ms->bicgstab.cl_omega1 = clCreateBuffer(ms->cl_ctx, 0, sizeof(double), NULL, &err);
-
- return;
-fail:
- if (ms->cl_queue)
- clReleaseCommandQueue(ms->cl_queue);
- ms->cl_queue = 0;
-
- if (ms->cl_ctx)
- clReleaseContext(ms->cl_ctx);
- ms->cl_ctx = 0;
-}
-
-static void construct_filter_matrix(MaximalSlicingContext *ms, double filter_power)
+/* get an approximate "main" frequency component in a basis function */
+static double calc_basis_freq(const BasisSet *b, int order)
{
- char equed = 'N';
- double cond, ferr, berr, rpivot;
-
- double *m, *minv, *scale, *tmp;
- int *ipiv;
- int idx_coeff_x, idx_coeff_z, idx_grid_x, idx_grid_z;
- int N = ms->nb_coeffs;
-
- ms->input_filter = malloc(sizeof(*m) * N * N);
-
- m = malloc(sizeof(*m) * N * N);
- minv = malloc(sizeof(*m) * N * N);
- scale = malloc(sizeof(*m) * N * N);
- tmp = malloc(sizeof(*m) * N * N);
- ipiv = malloc(sizeof(*ipiv) * N);
-
-#define BASIS_X (ms->basis_x_val [idx_grid_x * ms->nb_coeffs_x + idx_coeff_x])
-#define BASIS_Z (ms->basis_z_val [idx_grid_z * ms->nb_coeffs_z + idx_coeff_z])
- for (idx_grid_z = 0; idx_grid_z < ms->nb_colloc_points_z - 0; idx_grid_z++) {
- for (idx_grid_x = 0; idx_grid_x < ms->nb_colloc_points_x - 0; idx_grid_x++) {
- int idx_grid = idx_grid_z * ms->nb_colloc_points_x + idx_grid_x;
-
- for (idx_coeff_z = 0; idx_coeff_z < ms->nb_coeffs_z; idx_coeff_z++)
- for (idx_coeff_x = 0; idx_coeff_x < ms->nb_coeffs_x; idx_coeff_x++) {
- const int idx_coeff = idx_coeff_z * ms->nb_coeffs_x + idx_coeff_x;
-
- minv[idx_grid + ms->nb_colloc_points * idx_coeff] = BASIS_X * BASIS_Z;
- scale[idx_grid + ms->nb_colloc_points * idx_coeff] = (idx_grid == idx_coeff) ?
- exp(-36.0 * pow((double)idx_grid_x / ms->nb_coeffs_x, filter_power)) *
- exp(-36.0 * pow((double)idx_grid_z / ms->nb_coeffs_z, filter_power)) : 0.0;
-
- scale[idx_grid + ms->nb_colloc_points * idx_coeff] = (idx_grid == idx_coeff) ? 1.0 : 0.0;
- //if (idx_coeff_z == idx_grid_z && idx_coeff_z == 0 && idx_grid_x == idx_coeff_x)
- // fprintf(stderr, "%d %g\n", idx_coeff_x, scale[idx_grid + ms->nb_colloc_points * idx_coeff]);
- }
- }
- }
-
- memcpy(m, minv, sizeof(*m) * N * N);
- LAPACKE_dgetrf(LAPACK_COL_MAJOR, N, N, m, N, ipiv);
- LAPACKE_dgetri(LAPACK_COL_MAJOR, N, m, N, ipiv);
-
- cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
- N, N, N, 1.0, scale, N, m, N, 0.0, tmp, N);
- cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
- N, N, N, 1.0, minv, N, tmp, N, 0.0, ms->input_filter, N);
-
- free(m);
- free(minv);
- free(scale);
- free(tmp);
- free(ipiv);
-}
-
-static MaximalSlicingContext *init_ms(cGH *cctkGH,
- int basis_order_r, int basis_order_z,
- double sf, double filter_power, double input_filter_power,
- CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
- const int grid_size[3])
-{
- MaximalSlicingContext *ms;
- int ret;
-
- ms = calloc(1, sizeof(*ms));
-
- ms->gh = cctkGH;
-
- ms->basis = &qms_sb_even_basis;
- //ms->basis = &qms_cheb_basis;
- //ms->basis = &qms_cheb_even_basis;
- //ms->basis = &qms_tl_basis;
-#if POLAR
- ms->basis1 = &qms_cos_even_basis;
-#else
- ms->basis1 = &qms_sb_even_basis;
-#endif
-
- ms->nb_coeffs_x = basis_order_r;
- ms->nb_coeffs_z = basis_order_z;
-
- ms->nb_coeffs = ms->nb_coeffs_x * ms->nb_coeffs_z;
-
- ms->nb_colloc_points_x = basis_order_r;
- ms->nb_colloc_points_z = basis_order_z;
-
- ms->nb_colloc_points = ms->nb_colloc_points_x * ms->nb_colloc_points_z;
-
- if (ms->nb_colloc_points != ms->nb_coeffs)
- CCTK_WARN(0, "Non-square collocation matrix");
-
- ms->colloc_grid_order_x = ms->nb_colloc_points_x;
- ms->colloc_grid_order_z = ms->nb_colloc_points_z;
-
- ms->mat = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
- ms->coeffs = malloc(sizeof(double) * ms->nb_coeffs);
- ms->rhs = malloc(sizeof(double) * ms->nb_colloc_points);
-
- ms->coeffs_eval = malloc(sizeof(double) * ms->nb_coeffs);
- for (int i = 0; i < ARRAY_ELEMS(ms->solution_cache); i++) {
- ms->solution_cache[i].coeffs = malloc(sizeof(double) * ms->nb_coeffs);
- if (!ms->solution_cache[i].coeffs)
- CCTK_WARN(0, "malloc failure");
- }
-
- ms->mat_f = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
- ms->ipiv = malloc(sizeof(*ms->ipiv) * ms->nb_coeffs);
-
-#if 1
- scale_factor = 1.0;
-
- //scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] * 0.75) / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1);
- scale_factor = (64.0 / ms->basis->colloc_point(ms->colloc_grid_order_x, ms->nb_colloc_points_x - 1));
- //scale_factor = (x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)]);
- //scale_factor = x[CCTK_GFINDEX3D(cctkGH, grid_size[0] - 1, 0, 0)] - 0.5;
- fprintf(stderr, "scale factor %16.16g\n", scale_factor);
-
-#else
- scale_factor = sf;
-#endif
-
- /* initialize the collocation grid */
- posix_memalign((void**)&ms->colloc_grid[0], 32, ms->nb_colloc_points_x * sizeof(*ms->colloc_grid[0]));
- posix_memalign((void**)&ms->colloc_grid[1], 32, ms->nb_colloc_points_z * sizeof(*ms->colloc_grid[1]));
-
- for (int i = 0; i < ms->nb_colloc_points_x; i++) {
- ms->colloc_grid[0][i] = ms->basis->colloc_point(ms->colloc_grid_order_x, i);
- fprintf(stderr, "%d %g\n", i, ms->colloc_grid[0][i]);
- }
- for (int i = 0; i < ms->nb_colloc_points_z; i++) {
- ms->colloc_grid[1][i] = ms->basis1->colloc_point(ms->colloc_grid_order_z, i);
- fprintf(stderr, "%d %g\n", i, ms->colloc_grid[1][i] / (POLAR ? M_PI : 1.0));
- }
-
- /* precompute the basis values we will need */
- ms->basis_x_val = malloc(sizeof(*ms->basis_x_val) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
- ms->basis_x_dval = malloc(sizeof(*ms->basis_x_dval) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
- ms->basis_x_d2val = malloc(sizeof(*ms->basis_x_d2val) * ms->nb_colloc_points_x * ms->nb_coeffs_x);
- for (int i = 0; i < ms->nb_colloc_points_x; i++) {
- CCTK_REAL coord = ms->colloc_grid[0][i];
- for (int j = 0; j < ms->nb_coeffs_x; j++) {
- ms->basis_x_val [i * ms->nb_coeffs_x + j] = ms->basis->eval(coord, j);
- ms->basis_x_dval [i * ms->nb_coeffs_x + j] = ms->basis->eval_diff1(coord, j);
- ms->basis_x_d2val[i * ms->nb_coeffs_x + j] = ms->basis->eval_diff2(coord, j);
- }
- }
-
- ms->basis_z_val = malloc(sizeof(*ms->basis_z_val) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
- ms->basis_z_dval = malloc(sizeof(*ms->basis_z_dval) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
- ms->basis_z_d2val = malloc(sizeof(*ms->basis_z_d2val) * ms->nb_colloc_points_z * ms->nb_coeffs_z);
- for (int i = 0; i < ms->nb_colloc_points_z; i++) {
- CCTK_REAL coord = ms->colloc_grid[1][i];
- for (int j = 0; j < ms->nb_coeffs_z; j++) {
- ms->basis_z_val [i * ms->nb_coeffs_z + j] = ms->basis1->eval(coord, j);
- ms->basis_z_dval [i * ms->nb_coeffs_z + j] = ms->basis1->eval_diff1(coord, j);
- ms->basis_z_d2val[i * ms->nb_coeffs_z + j] = ms->basis1->eval_diff2(coord, j);
- }
- }
-
- posix_memalign((void**)&ms->basis_val_00, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
- posix_memalign((void**)&ms->basis_val_11, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
- posix_memalign((void**)&ms->basis_val_10, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
- posix_memalign((void**)&ms->basis_val_01, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
- posix_memalign((void**)&ms->basis_val_02, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
- posix_memalign((void**)&ms->basis_val_20, 32, ms->nb_colloc_points * ms->nb_coeffs * sizeof(*ms->basis_val_00));
- for (int i = 0; i < ms->nb_colloc_points_z; i++) {
- const double *basis_z = ms->basis_z_val + i * ms->nb_coeffs_z;
- const double *dbasis_z = ms->basis_z_dval + i * ms->nb_coeffs_z;
- const double *d2basis_z = ms->basis_z_d2val + i * ms->nb_coeffs_z;
-
- for (int j = 0; j < ms->nb_colloc_points_x; j++) {
- const double *basis_x = ms->basis_x_val + j * ms->nb_coeffs_x;
- const double *dbasis_x = ms->basis_x_dval + j * ms->nb_coeffs_x;
- const double *d2basis_x = ms->basis_x_d2val + j * ms->nb_coeffs_x;
- const int idx_grid = i * ms->nb_colloc_points_x + j;
-
-#if POLAR
- double r = ms->colloc_grid[0][j];
- double theta = ms->colloc_grid[1][i];
-
- double x = r * cos(theta);
- double z = r * sin(theta);
-#else
- double x = ms->colloc_grid[0][j];
- double z = ms->colloc_grid[1][i];
-#endif
-
- for (int k = 0; k < ms->nb_coeffs_z; k++)
- for (int l = 0; l < ms->nb_coeffs_x; l++) {
- const int idx_coeff = k * ms->nb_coeffs_x + l;
- const int idx = idx_grid + ms->nb_colloc_points * idx_coeff;
- ms->basis_val_00[idx] = basis_x[l] * basis_z[k];
-#if POLAR
- ms->basis_val_10[idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * x / r - basis_x[l] * dbasis_z[k] * z / SQR(r)) : 0.0);
- ms->basis_val_01[idx] = ((r > EPS) ? (dbasis_x[l] * basis_z[k] * z / r + basis_x[l] * dbasis_z[k] * x / SQR(r)) : 0.0);
- ms->basis_val_20[idx] = ((r > EPS) ? (SQR(x / r) * d2basis_x[l] * basis_z[k] + SQR(z / SQR(r)) * basis_x[l] * d2basis_z[k]
- + (1.0 - SQR(x / r)) / r * dbasis_x[l] * basis_z[k]
- + 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
- - 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
- ms->basis_val_02[idx] = ((r > EPS) ? (SQR(z / r) * d2basis_x[l] * basis_z[k] + SQR(x / SQR(r)) * basis_x[l] * d2basis_z[k]
- + (1.0 - SQR(z / r)) / r * dbasis_x[l] * basis_z[k]
- - 2 * x * z / SQR(SQR(r)) * basis_x[l] * dbasis_z[k]
- + 2 * z * x / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
- ms->basis_val_11[idx] = ((r > EPS) ? (x * z / SQR(r) * d2basis_x[l] * basis_z[k] - x * z / SQR(SQR(r)) * basis_x[l] * d2basis_z[k]
- - x * z / (r * SQR(r)) * dbasis_x[l] * basis_z[k]
- - (1.0 - SQR(z / r)) / SQR(r) * basis_x[l] * dbasis_z[k]
- + (SQR(x) - SQR(z)) / (r * SQR(r)) * dbasis_x[l] * dbasis_z[k]) : 0.0);
-#else
- ms->basis_val_10[idx] = dbasis_x[l] * basis_z[k];
- ms->basis_val_01[idx] = basis_x[l] * dbasis_z[k];
- ms->basis_val_20[idx] = d2basis_x[l] * basis_z[k];
- ms->basis_val_02[idx] = basis_x[l] * d2basis_z[k];
- ms->basis_val_11[idx] = dbasis_x[l] * dbasis_z[k];
-#endif
- }
- }
- }
-
- posix_memalign((void**)&ms->eq_coeff_00, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
- posix_memalign((void**)&ms->eq_coeff_11, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
- posix_memalign((void**)&ms->eq_coeff_10, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
- posix_memalign((void**)&ms->eq_coeff_01, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
- posix_memalign((void**)&ms->eq_coeff_02, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
- posix_memalign((void**)&ms->eq_coeff_20, 32, ms->nb_colloc_points * sizeof(*ms->eq_coeff_00));
-
- ms->interp_coords[0] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[0]));
- ms->interp_coords[1] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[1]));
- ms->interp_coords[2] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_coords[2]));
- for (int i = 0; i < ms->nb_colloc_points_z; i++) {
- for (int j = 0; j < ms->nb_colloc_points_x; j++) {
-#if POLAR
- double phi = ms->colloc_grid[1][i];
- double r = ms->colloc_grid[0][j];
-
- double x = r * cos(phi);
- double z = r * sin(phi);
-#else
- double x = ms->colloc_grid[0][j];
- double z = ms->colloc_grid[1][i];
-#endif
-
- ms->interp_coords[0][i * ms->nb_colloc_points_x + j] = x;
- ms->interp_coords[1][i * ms->nb_colloc_points_x + j] = 0;
- ms->interp_coords[2][i * ms->nb_colloc_points_x + j] = z;
- }
- }
-
- for (int i = 0; i < ARRAY_ELEMS(ms->metric_u); i++)
- ms->metric_u[i] = malloc(ms->nb_colloc_points * sizeof(*ms->interp_values[i]));
-
- ms->kij_kij = malloc(ms->nb_colloc_points * sizeof(*ms->kij_kij));
-
- ms->coeff_scale = malloc(ms->nb_coeffs * sizeof(double));
- for (int j = 0; j < ms->nb_coeffs_z; j++)
- for (int i = 0; i < ms->nb_coeffs_x; i++) {
- //ms->coeff_scale[j * ms->nb_coeffs_x + i] = 1.0;
- ms->coeff_scale[j * ms->nb_coeffs_x + i] = exp(-36.0 * pow((double)i / ms->nb_coeffs_x, filter_power)) *
- exp(-36.0 * pow((double)j / ms->nb_coeffs_z, filter_power));
- //ms->coeff_scale[j * ms->nb_coeffs_x + i] = ((i < (2.0 / 3.0) * ms->nb_coeffs_x) ? 1.0 : SQR(cos((((double)i / ms->nb_coeffs_x) - (2.0 / 3.0)) * 3.0 * M_PI / 2.0))) *
- // ((j < (2.0 / 3.0) * ms->nb_coeffs_z) ? 1.0 : SQR(cos((((double)j / ms->nb_coeffs_z) - (2.0 / 3.0)) * 3.0 * M_PI / 2.0)));
- }
-
- for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) {
- ms->interp_values[i] = malloc(sizeof(*ms->interp_values[i]) * ms->nb_colloc_points);
- ms->interp_values_prefilter[i] = malloc(sizeof(*ms->interp_values[i]) * ms->nb_colloc_points);
- if (!ms->interp_values[i] || !ms->interp_values_prefilter[i])
- CCTK_WARN(0, "Malloc failure");
- ms->interp_value_codes[i] = CCTK_VARIABLE_REAL;
- }
-
- for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) {
- ms->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
- if (ms->interp_vars_indices[i] < 0)
- CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]);
- }
-
- ms->coord_system = CCTK_CoordSystemHandle("cart3d");
- if (ms->coord_system < 0)
- CCTK_WARN(0, "Error getting the coordinate system");
-
- ms->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)");
- if (ms->interp_operator < 0)
- CCTK_WARN(0, "Error getting the interpolation operator");
-
- ms->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1");
- if (ms->interp_params < 0)
- CCTK_WARN(0, "Error creating interpolation parameters table");
-
- ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
- interp_operation_codes, "operation_codes");
- if (ret < 0)
- CCTK_WARN(0, "Error setting operation codes");
-
- ret = Util_TableSetIntArray(ms->interp_params, NB_INTERP_VARS,
- interp_operation_indices, "operand_indices");
- if (ret < 0)
- CCTK_WARN(0, "Error setting operand indices");
-
- ms->bicgstab.p = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.v = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.y = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.z = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.t = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.res = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.res0 = malloc(sizeof(double) * ms->nb_coeffs);
- ms->bicgstab.k = malloc(sizeof(double) * ms->nb_coeffs * ms->nb_colloc_points);
-
- ms->steps_since_inverse = INT_MAX;
-
- init_opencl(ms);
-
- construct_filter_matrix(ms, input_filter_power);
-
- CCTK_TimerCreate("MaximalSlicingAxi_Solve");
- CCTK_TimerCreate("MaximalSlicingAxi_Expand");
- CCTK_TimerCreate("MaximalSlicingAxi_interp_geometry");
- CCTK_TimerCreate("MaximalSlicingAxi_calc_eq_coeffs");
- CCTK_TimerCreate("MaximalSlicingAxi_construct_matrix");
- CCTK_TimerCreate("MaximalSlicingAxi_filter_input");
- CCTK_TimerCreate("MaximalSlicingAxi_solve_LU");
- CCTK_TimerCreate("MaximalSlicingAxi_solve_BiCGSTAB");
- CCTK_TimerCreate("MaximalSlicingAxi_Polish");
-
- return ms;
+ return b->colloc_point(order, 1);
}
-static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
- CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z)
+static CoordPatch *get_coord_patch(QMSContext *ms,
+ CCTK_REAL *x, CCTK_REAL *y, CCTK_REAL *z,
+ double scale_factor, double scale_power)
{
cGH *cctkGH = ms->gh;
@@ -645,48 +74,9 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
if (i == cp->size[1])
CCTK_WARN(0, "The grid does not include y==0");
-#if 0
- posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]);
- for (int j = 0; j < ms->gh->cctk_lsh[1]; j++)
- for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
- CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
- CCTK_REAL yy = y[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
- CCTK_REAL r = sqrt(SQR(xx) + SQR(yy));
-
- for (int k = 0; k < ms->nb_coeffs_x; k++)
- //cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) * ms->nb_coeffs_x + k] = ms->basis->eval(r, k);
- cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) + ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0] * k] = ms->basis->eval(r, k);
- }
-
- posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]);
- for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
- CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
- for (int j = 0; j < ms->nb_coeffs_z; j++)
- cp->basis_val_z [i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j);
- //cp->basis_val_z [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j);
- }
- posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->nb_coeffs_x);
- posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size);
- for (int i = 0; i < grid_size; i++)
- cp->one[i] = 1.0;
-#else
- posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->nb_coeffs_x * ms->gh->cctk_lsh[0]);
- for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
- CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)];
-
- for (int k = 0; k < ms->nb_coeffs_x; k++)
- cp->basis_val_r[i * ms->nb_coeffs_x + k] = ms->basis->eval(fabs(xx), k);
- }
-
- posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->nb_coeffs_z * ms->gh->cctk_lsh[2]);
- for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
- CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
- for (int j = 0; j < ms->nb_coeffs_z; j++)
- cp->basis_val_z[i * ms->nb_coeffs_z + j] = ms->basis->eval(fabs(zz), j);
- }
-
- posix_memalign((void**)&cp->transform_matrix, 32, sizeof(*cp->transform_matrix) * ms->nb_coeffs_x * cp->size[0] * cp->size[2]);
- posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * ms->nb_coeffs_z * cp->size[0] * cp->size[2]);
+#if QMS_POLAR || 1
+ posix_memalign((void**)&cp->transform_matrix, 32, sizeof(*cp->transform_matrix) * ms->solver->nb_coeffs[0] * cp->size[0] * cp->size[2]);
+ posix_memalign((void**)&cp->transform_matrix1, 32, sizeof(*cp->transform_matrix1) * ms->solver->nb_coeffs[1] * cp->size[0] * cp->size[2]);
#pragma omp parallel for
for (int j = 0; j < cp->size[2]; j++) {
CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, j)];
@@ -694,10 +84,11 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
for (int i = 0; i < cp->size[0]; i++) {
const int idx_grid = j * cp->size[0] + i;
- CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)];
+ double xx = x[CCTK_GFINDEX3D(ms->gh, i, 0, 0)];
+ double rr = sqrt(SQR(xx) + SQR(zz));
-#if POLAR
- double coord0 = sqrt(SQR(xx) + SQR(zz));
+#if QMS_POLAR
+ double coord0 = rr;
double coord1 = atan2(zz, xx);
#else
double coord0 = xx;
@@ -709,51 +100,88 @@ static CoordPatch *get_coord_patch(MaximalSlicingContext *ms,
// const int idx_coeff = k * ms->nb_coeffs_x + l;
// cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * idx_coeff] = ms->basis->eval(r, l) * ms->basis1->eval(phi, k);
// }
- for (int k = 0; k < ms->nb_coeffs_x; k++)
- cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = ms->basis->eval(coord0, k);
- for (int k = 0; k < ms->nb_coeffs_z; k++)
- cp->transform_matrix1[idx_grid * ms->nb_coeffs_z + k] = ms->basis1->eval(coord1, k);
+ for (int k = 0; k < ms->solver->nb_coeffs[0]; k++) {
+ double dx = calc_basis_freq(ms->solver->basis[0], k);
+ double r0 = dx * scale_factor;
+ double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+ cp->transform_matrix[idx_grid + cp->size[0] * cp->size[2] * k] = ms->solver->basis[0]->eval(coord0, k) * fact;
+ }
+ for (int k = 0; k < ms->solver->nb_coeffs[1]; k++) {
+ double dx = calc_basis_freq(ms->solver->basis[1], k);
+ double r0 = dx * scale_factor;
+ double fact = exp(-36.0 * pow(rr / r0, scale_power));
+
+ cp->transform_matrix1[idx_grid * ms->solver->nb_coeffs[1] + k] = ms->solver->basis[1]->eval(coord1, k) * fact;
+ }
}
}
- posix_memalign((void**)&cp->transform_tmp, 32, sizeof(*cp->transform_tmp) * cp->size[0] * cp->size[2] * ms->nb_coeffs_z);
+ posix_memalign((void**)&cp->transform_tmp, 32, sizeof(*cp->transform_tmp) * cp->size[0] * cp->size[2] * ms->solver->nb_coeffs[1]);
+#else
+ posix_memalign((void**)&cp->basis_val_r, 32, sizeof(*cp->basis_val_r) * ms->solver->nb_coeffs[0] * ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0]);
+ for (int j = 0; j < ms->gh->cctk_lsh[1]; j++)
+ for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
+ CCTK_REAL xx = x[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
+ CCTK_REAL yy = y[CCTK_GFINDEX3D(ms->gh, i, j, 0)];
+ CCTK_REAL r = sqrt(SQR(xx) + SQR(yy));
+
+ for (int k = 0; k < ms->solver->nb_coeffs[0]; k++)
+ //cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) * ms->nb_coeffs_x + k] = ms->basis->eval(r, k);
+ cp->basis_val_r [(j * ms->gh->cctk_lsh[0] + i) + ms->gh->cctk_lsh[1] * ms->gh->cctk_lsh[0] * k] = ms->solver->basis[0]->eval(r, k);
+ }
+
+ posix_memalign((void**)&cp->basis_val_z, 32, sizeof(*cp->basis_val_z) * ms->solver->nb_coeffs[1] * ms->gh->cctk_lsh[2]);
+ for (int i = 0; i < ms->gh->cctk_lsh[2]; i++) {
+ CCTK_REAL zz = z[CCTK_GFINDEX3D(ms->gh, 0, 0, i)];
+ for (int j = 0; j < ms->solver->nb_coeffs[1]; j++)
+ cp->basis_val_z [i * ms->solver->nb_coeffs[1] + j] = ms->solver->basis[0]->eval(fabs(zz), j);
+ //cp->basis_val_z [i + ms->gh->cctk_lsh[2] * j] = ms->basis->eval(zz, j);
+ }
+ posix_memalign((void**)&cp->transform_z, 32, sizeof(*cp->transform_z) * cctkGH->cctk_lsh[2] * ms->solver->nb_coeffs[0]);
+ posix_memalign((void**)&cp->one, 32, sizeof(*cp->one) * grid_size);
+ for (int i = 0; i < grid_size; i++)
+ cp->one[i] = 1.0;
+
+ posix_memalign((void**)&cp->w_scale, 32, sizeof(*cp->w_scale) * grid_size);
+ for (int k = 0; k < ms->gh->cctk_lsh[2]; k++)
+ for (int j = 0; j < ms->gh->cctk_lsh[1]; j++)
+ for (int i = 0; i < ms->gh->cctk_lsh[0]; i++) {
+ int idx = CCTK_GFINDEX3D(ms->gh, i, j, k);
+ double r = sqrt(SQR(x[idx]) + SQR(y[idx]) + SQR(z[idx]));
+ const double R = 32.0;
+ const double width = 4.0;
+ cp->w_scale[idx] = (r > R) ? exp(-pow((r - R) / width, 4.0)) : 1.0;
+ }
#endif
ms->nb_patches++;
return cp;
}
-static MaximalSlicingContext *ms_context;
+static QMSContext *qms_context;
void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS)
{
- MaximalSlicingContext *ms;
+ QMSContext *ms;
DECLARE_CCTK_ARGUMENTS;
DECLARE_CCTK_PARAMETERS;
double time;
- if (!ms_context) {
- ms_context = init_ms(cctkGH, basis_order_r, basis_order_z,
- scale_factor, filter_power, input_filter_power, x, y, z, cctk_lsh);
- }
- ms = ms_context;
+ ms = qms_context;
time = cctkGH->cctk_time / ms->gh->cctk_delta_time;
- if (ms->gh->cctk_levfac[0] != 1 ||
- fabs(time - ceilf(time)) > 1e-8)
+ if (ms->gh->cctk_levfac[0] != 1 || fabs(time - ceilf(time)) > 1e-8 ||
+ (ms->nb_solutions && ms->solution_cache[ms->nb_solutions - 1].time == cctkGH->cctk_time))
return;
- fprintf(stderr, "qms solve: time %g %g\n", ms->gh->cctk_time, time);
-
- CCTK_TimerStart("MaximalSlicingAxi_Solve");
- qms_maximal_solve(ms);
- CCTK_TimerStop("MaximalSlicingAxi_Solve");
-
- if (export_coeffs)
- memcpy(w_coeffs, ms->coeffs, sizeof(*w_coeffs) * ms->nb_coeffs);
+ CCTK_TimerStart("QuasiMaximalSlicing_Solve");
+ qms_solver_solve(ms->solver);
+ CCTK_TimerStop("QuasiMaximalSlicing_Solve");
+ fprintf(stderr, "%d qms solve: time %g %g %g\n", ms->gh->cctk_levfac[0], ms->gh->cctk_time, time, ms->solver->coeffs[0]);
if (1) {
double *tmp;
if (ms->nb_solutions == ARRAY_ELEMS(ms->solution_cache)) {
@@ -763,45 +191,39 @@ void quasimaximal_slicing_axi_solve(CCTK_ARGUMENTS)
ms->nb_solutions++;
tmp = ms->solution_cache[ms->nb_solutions - 1].coeffs;
}
- ms->solution_cache[ms->nb_solutions - 1].coeffs = ms->coeffs;
+ ms->solution_cache[ms->nb_solutions - 1].coeffs = ms->solver->coeffs;
ms->solution_cache[ms->nb_solutions - 1].time = ms->gh->cctk_time;
- ms->coeffs = tmp;
+ ms->solver->coeffs = tmp;
}
}
void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS)
{
- MaximalSlicingContext *ms;
+ QMSContext *ms;
CoordPatch *cp;
DECLARE_CCTK_ARGUMENTS;
DECLARE_CCTK_PARAMETERS;
- int64_t expand_start, totaltime_start;
+ int64_t expand_start;
double *coeffs = NULL;
int i, ret;
- totaltime_start = gettime();
-
- /* on the first run, init the solver */
- if (!ms_context) {
- ms_context = init_ms(cctkGH, basis_order_r, basis_order_z,
- scale_factor, filter_power, input_filter_power, x, y, z, cctk_lsh);
- }
- ms = ms_context;
+ ms = qms_context;
- cp = get_coord_patch(ms, x, y, z);
+ cp = get_coord_patch(ms, x, y, z, scale_factor, scale_power);
#if 0
- coeffs = ms->coeffs;
+ //coeffs = ms->coeffs;
+ coeffs = ms->solution_cache[ms->nb_solutions - 1].coeffs;
#else
coeffs = ms->coeffs_eval;
- if (ms->nb_solutions < 2) {
- memset(coeffs, 0, sizeof(*coeffs) * ms->nb_coeffs);
+ if (cctkGH->cctk_levfac[0] < 1 || ms->nb_solutions < 2) {
+ memset(coeffs, 0, sizeof(*coeffs) * ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]);
//fprintf(stderr, "qms eval: time %g zero\n", ms->gh->cctk_time);
} else {
double *coeffs0 = ms->solution_cache[ms->nb_solutions - 2].coeffs;
@@ -810,16 +232,28 @@ void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS)
double time1 = ms->solution_cache[ms->nb_solutions - 1].time;
double time = ms->gh->cctk_time;
- //fprintf(stderr, "qms eval: time %g interp from %g %g\n", ms->gh->cctk_time, time0, time1);
+ //double fact;
- for (int i = 0; i < ms->nb_coeffs; i++)
+ //if (time > 2.0)
+ // fact = 1.0;
+ //else if (time < 0.1)
+ // fact = 0.0;
+ //else
+ // fact = (1.0 - exp(-pow((time - 0.0) / 0.25, 4.0)));
+ //fact = 1.0;
+
+ //fprintf(stderr, "qms eval: time %g interp from %g %g %g\n", ms->gh->cctk_time, time0, time1, fact);
+
+ for (int i = 0; i < ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]; i++)
coeffs[i] = coeffs1[i] * (time - time0) / (time1 - time0) + coeffs0[i] * (time - time1) / (time0 - time1);
}
#endif
+ if (export_coeffs)
+ memcpy(w_coeffs, coeffs, sizeof(*w_coeffs) * ms->solver->nb_coeffs[0] * ms->solver->nb_coeffs[1]);
- CCTK_TimerStart("MaximalSlicingAxi_Expand");
+ CCTK_TimerStart("QuasiMaximalSlicing_Expand");
expand_start = gettime();
#if 0
#pragma omp parallel for
@@ -831,106 +265,115 @@ void quasimaximal_slicing_axi_eval(CCTK_ARGUMENTS)
double r = sqrt(SQR(xx) + SQR(zz));
double phi = atan2(zz, xx);
- double val = 1.0;
+ double val = 0.0;
for (int l = 0; l < ms->nb_coeffs_z; l++) {
double tmp = 0.0;
for (int m = 0; m < ms->nb_coeffs_x; m++) {
const int idx_coeff = l * ms->nb_coeffs_x + m;
- tmp += ms->coeffs[idx_coeff] * ms->basis->eval(r, m);
+ tmp += coeffs[idx_coeff] * ms->basis->eval(r, m);
}
val += tmp * ms->basis1->eval(phi, l);
}
- alp[idx] = val;
+ W[idx] = val;
}
}
-#else
+#elif QMS_POLAR || 1
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
- cctk_lsh[0] * cctk_lsh[2], ms->nb_coeffs_z, ms->nb_coeffs_x,
+ cctk_lsh[0] * cctk_lsh[2], ms->solver->nb_coeffs[1], ms->solver->nb_coeffs[0],
1.0, cp->transform_matrix, cctk_lsh[0] * cctk_lsh[2],
- coeffs, ms->nb_coeffs_x, 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
+ coeffs, ms->solver->nb_coeffs[0], 0.0, cp->transform_tmp, cctk_lsh[0] * cctk_lsh[2]);
#pragma omp parallel for
for (int j = 0; j < cctk_lsh[2]; j++)
for (int i = 0; i < cctk_lsh[0]; i++) {
const int idx_grid = j * cctk_lsh[0] + i;
- const double val = cblas_ddot(ms->nb_coeffs_z, cp->transform_matrix1 + idx_grid * ms->nb_coeffs_z, 1,
+ const double val = cblas_ddot(ms->solver->nb_coeffs[1], cp->transform_matrix1 + idx_grid * ms->solver->nb_coeffs[1], 1,
cp->transform_tmp + idx_grid, cctk_lsh[0] * cctk_lsh[2]);
W[CCTK_GFINDEX3D(cctkGH, i, cp->y_idx, j)] = val;
}
-#endif
+#else
//memcpy(alp, cp->one, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
- //memset(alp, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*alp));
- //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
- // ms->nb_coeffs_x, cctk_lsh[2], ms->nb_coeffs_z, 1.0,
- // coeffs, ms->nb_coeffs_x, cp->basis_val_z, ms->nb_coeffs_z,
- // 0.0, cp->transform_z, ms->nb_coeffs_x);
- //cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
- // cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->nb_coeffs_x, 1.0,
- // cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->nb_coeffs_x,
- // 1.0, alp, cctk_lsh[0] * cctk_lsh[1]);
+ memset(W, 0, cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2] * sizeof(*W));
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+ ms->solver->nb_coeffs[0], cctk_lsh[2], ms->solver->nb_coeffs[1], 1.0,
+ coeffs, ms->solver->nb_coeffs[0], cp->basis_val_z, ms->solver->nb_coeffs[1],
+ 0.0, cp->transform_z, ms->solver->nb_coeffs[0]);
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+ cctk_lsh[1] * cctk_lsh[0], cctk_lsh[2], ms->solver->nb_coeffs[0], 1.0,
+ cp->basis_val_r, cctk_lsh[0] * cctk_lsh[1], cp->transform_z, ms->solver->nb_coeffs[0],
+ 1.0, W, cctk_lsh[0] * cctk_lsh[1]);
+
+// {
+// const int grid_size = cctk_lsh[0] * cctk_lsh[1] * cctk_lsh[2];
+//#pragma omp parallel for
+// for (int i = 0; i < grid_size; i++)
+// W[i] *= cp->w_scale[i];
+// }
+#endif
ms->grid_expand_time += gettime() - expand_start;
ms->grid_expand_count++;
- CCTK_TimerStop("MaximalSlicingAxi_Expand");
-
- ms->solve_time += gettime() - totaltime_start;
- ms->solve_count++;
+ CCTK_TimerStop("QuasiMaximalSlicing_Expand");
/* print stats */
- if (!(ms->solve_count & 255)) {
- fprintf(stderr,
- "maximal slicing solves: %lu, "
- "total time %g s, avg time per call %g ms\n",
- ms->solve_count, (double)ms->solve_time / 1e6,
- (double)ms->solve_time / ms->solve_count / 1e3);
- fprintf(stderr,
- "%g%% interpolate geometry: %lu, "
- "total time %g s, avg time per call %g ms\n",
- (double)ms->interp_geometry_time * 100 / ms->solve_time,
- ms->interp_geometry_count, (double)ms->interp_geometry_time / 1e6,
- (double)ms->interp_geometry_time / ms->interp_geometry_count / 1e3);
- fprintf(stderr,
- "%g%% calc equation coefficients: %lu, "
- "total time %g s, avg time per call %g ms\n",
- (double)ms->calc_eq_coeffs_time * 100 / ms->solve_time,
- ms->calc_eq_coeffs_count, (double)ms->calc_eq_coeffs_time / 1e6,
- (double)ms->calc_eq_coeffs_time / ms->calc_eq_coeffs_count / 1e3);
- fprintf(stderr,
- "%g%% pseudospectral matrix construction: %lu, "
- "total time %g s, avg time per call %g ms\n",
- (double)ms->construct_matrix_time * 100 / ms->solve_time,
- ms->construct_matrix_count, (double)ms->construct_matrix_time / 1e6,
- (double)ms->construct_matrix_time / ms->construct_matrix_count / 1e3);
- fprintf(stderr,
- "%g%% BiCGSTAB %lu solves, "
- "%lu iterations, total time %g s, "
- "avg iterations per solve %g, avg time per solve %g ms, "
- "avg time per iteration %g ms\n",
- (double)ms->cg_time_total * 100 / ms->solve_time,
- ms->cg_solve_count, ms->cg_iter_count, (double)ms->cg_time_total / 1e6,
- (double)ms->cg_iter_count / ms->cg_solve_count,
- (double)ms->cg_time_total / ms->cg_solve_count / 1e3,
- (double)ms->cg_time_total / ms->cg_iter_count / 1e3);
- fprintf(stderr,
- "%g%% LU %lu solves, total time %g s, avg time per solve %g ms\n",
- (double)ms->lu_solves_time * 100 / ms->solve_time,
- ms->lu_solves_count, (double)ms->lu_solves_time / 1e6,
- (double)ms->lu_solves_time / ms->lu_solves_count / 1e3);
+ if (!(ms->grid_expand_count & 255)) {
+ fprintf(stderr, "Quasi-maximal slicing stats:\n");
+
+ qms_solver_print_stats(ms->solver);
+
fprintf(stderr,
- "%g%% grid expansion: %lu, total time %g s, avg time per call %g ms\n",
- (double)ms->grid_expand_time * 100 / ms->solve_time,
+ "%lu evals: total time %g s, avg time per call %g ms\n",
ms->grid_expand_count, (double)ms->grid_expand_time / 1e6,
(double)ms->grid_expand_time / ms->grid_expand_count / 1e3);
}
}
+/*
+ * Allocate the QMS context, initialise its solver and allocate the
+ * coefficient buffers (coeffs_eval plus one buffer per solution_cache slot,
+ * each basis_order_r * basis_order_z elements, 32-byte aligned).
+ *
+ * On success the new context is stored in the file-level qms_context and 0 is
+ * returned. On failure a negative error code is returned, qms_context is left
+ * untouched and everything allocated here is released again.
+ */
+static int context_init(cGH *cctkGH)
+{
+    QMSContext *qms;
+    void *coeffs_buf;
+    size_t coeffs_count;
+    int ret;
+
+    DECLARE_CCTK_ARGUMENTS;
+    DECLARE_CCTK_PARAMETERS;
+
+    /* zero-initialised, so every buffer pointer starts out as NULL and the
+     * cleanup path below can free() them unconditionally */
+    qms = calloc(1, sizeof(*qms));
+    if (!qms)
+        return -ENOMEM;
+
+    qms->gh = cctkGH;
+
+    ret = qms_solver_init(&qms->solver, cctkGH, basis_order_r, basis_order_z,
+                          scale_factor, filter_power, 0.0);
+    if (ret < 0)
+        goto fail;
+
+    coeffs_count = (size_t)basis_order_r * basis_order_z;
+
+    /* posix_memalign() writes into a temporary so that the struct fields only
+     * ever hold NULL or a valid pointer */
+    if (posix_memalign(&coeffs_buf, 32, coeffs_count * sizeof(*qms->coeffs_eval))) {
+        ret = -ENOMEM;
+        goto fail;
+    }
+    qms->coeffs_eval = coeffs_buf;
+
+    for (size_t i = 0; i < ARRAY_ELEMS(qms->solution_cache); i++) {
+        if (posix_memalign(&coeffs_buf, 32,
+                           coeffs_count * sizeof(*qms->solution_cache[i].coeffs))) {
+            ret = -ENOMEM;
+            goto fail;
+        }
+        qms->solution_cache[i].coeffs = coeffs_buf;
+    }
+
+    qms_context = qms;
+
+    return 0;
+
+fail:
+    /* NOTE(review): qms->solver is not released here; if the solver API has a
+     * matching free function it should be called before freeing qms */
+    for (size_t i = 0; i < ARRAY_ELEMS(qms->solution_cache); i++)
+        free(qms->solution_cache[i].coeffs);
+    free(qms->coeffs_eval);
+    free(qms);
+    return ret;
+}
+
void qms_init(CCTK_ARGUMENTS)
{
DECLARE_CCTK_ARGUMENTS;
DECLARE_CCTK_PARAMETERS;
+ if (!qms_context)
+ context_init(cctkGH);
+
double *Kdot11 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot11");
double *Kdot22 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot22");
double *Kdot33 = CCTK_VarDataPtr(cctkGH, 0, "ML_BSSN::Kdot33");
diff --git a/src/qms.h b/src/qms.h
index f88c0a8..d8a69e6 100644
--- a/src/qms.h
+++ b/src/qms.h
@@ -1,145 +1,17 @@
+#ifndef QMS_QMS_H
+#define QMS_QMS_H
-#include <inttypes.h>
+#include "common.h"
+#if HAVE_OPENCL
#include <cl.h>
+#endif
-#include "cctk.h"
-
-#define POLAR (1)
-
-#define CCZ4 (0)
-
-#define SQR(x) ((x) * (x))
-#define SGN(x) ((x) >= 0.0 ? 1.0 : -1.0)
-#define MAX(x, y) ((x) > (y) ? (x) : (y))
-#define MIN(x, y) ((x) > (y) ? (y) : (x))
-#define ARRAY_ELEMS(arr) (sizeof(arr) / sizeof(*arr))
-
-/*
- * small number to avoid r=0 singularities
- */
-#define EPS 1E-08
-
-#define SCALE_FACTOR scale_factor
-
-/* indices (in our code, not cactus structs) of the grid functions which we'll need to
- * interpolate on the pseudospectral grid */
-enum MetricVars {
- GTXX = 0,
- GTYY,
- GTZZ,
- GTXY,
- GTXZ,
- GTYZ,
- PHI,
- ATXX,
- ATYY,
- ATZZ,
- ATXY,
- ATXZ,
- ATYZ,
- K,
- XTX,
- XTY,
- XTZ,
- BETAX,
- BETAY,
- BETAZ,
- ALPHA,
- KDOT_XX,
- KDOT_YY,
- KDOT_ZZ,
- KDOT_XY,
- KDOT_XZ,
- KDOT_YZ,
- XTDOT_X,
- XTDOT_Y,
- XTDOT_Z,
- PHIDOT,
- NB_METRIC_VARS,
-};
-
-/* indices of the interpolated values of the above grid functions and their derivatives */
-enum InterpMetricVars {
- I_GTXX = 0,
- I_GTYY,
- I_GTZZ,
- I_GTXY,
- I_GTXZ,
- I_GTYZ,
- I_PHI,
- I_PHI_DX,
- I_PHI_DY,
- I_PHI_DZ,
- I_ATXX,
- I_ATYY,
- I_ATZZ,
- I_ATXY,
- I_ATXZ,
- I_ATYZ,
- I_K,
- I_K_DX,
- I_K_DY,
- I_K_DZ,
- I_XTX,
- I_XTY,
- I_XTZ,
- I_BETAX,
- I_BETAY,
- I_BETAZ,
- I_ALPHA,
- I_ALPHA_DX,
- I_ALPHA_DY,
- I_ALPHA_DZ,
- I_ALPHA_DXX,
- I_ALPHA_DYY,
- I_ALPHA_DZZ,
- I_ALPHA_DXY,
- I_ALPHA_DXZ,
- I_ALPHA_DYZ,
- I_KDOT_XX,
- I_KDOT_YY,
- I_KDOT_ZZ,
- I_KDOT_XY,
- I_KDOT_XZ,
- I_KDOT_YZ,
- I_XTDOT_X,
- I_XTDOT_Y,
- I_XTDOT_Z,
- I_PHIDOT,
- I_PHIDOT_DX,
- I_PHIDOT_DY,
- I_PHIDOT_DZ,
- NB_INTERP_VARS,
-};
-
-/* a set of basis functions */
-typedef struct BasisSet {
- /* evaluate the idx-th basis function at the specified point*/
- double (*eval) (double coord, int idx);
- /* evaluate the first derivative of the idx-th basis function at the specified point*/
- double (*eval_diff1)(double coord, int idx);
- /* evaluate the second derivative of the idx-th basis function at the specified point*/
- double (*eval_diff2)(double coord, int idx);
- /**
- * Get the idx-th collocation point for the specified order.
- * idx runs from 0 to order - 1 (inclusive)
- */
- double (*colloc_point)(int order, int idx);
-} BasisSet;
-
-extern const BasisSet qms_cheb_basis;
-extern const BasisSet qms_cheb_even_basis;
-extern const BasisSet qms_full_basis;
-extern const BasisSet qms_tb_even_basis;
-extern const BasisSet qms_sb_even_basis;
-extern const BasisSet qms_tl_basis;
-extern const BasisSet qms_cos_even_basis;
+#include <inttypes.h>
-extern double scale_factor;
+#include "cctk.h"
-typedef struct MGContext MGContext;
-typedef struct SORContext SORContext;
+#include "qms_solve.h"
/* precomputed values for a given refined grid */
typedef struct CoordPatch {
@@ -156,31 +28,14 @@ typedef struct CoordPatch {
double *transform_matrix1;
double *transform_tmp;
double *one;
+ double *w_scale;
int y_idx;
-
- MGContext *mg;
- SORContext *sor;
} CoordPatch;
-/* state and scratch storage for the BiCGSTAB solver */
-typedef struct BiCGSTABContext {
- double *p, *v, *y, *z, *t;
- double *res, *res0;
- double *k;
-
- cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
- cl_mem cl_res, cl_res0;
- cl_mem cl_k, cl_mat;
- cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
- cl_mem cl_tmp, cl_tmp1;
-
-} BiCGSTABContext;
-
-typedef struct MaximalSlicingContext {
+typedef struct QMSContext {
+ QMSSolver *solver;
cGH *gh;
- const BasisSet *basis;
- const BasisSet *basis1;
struct {
double time;
@@ -190,112 +45,13 @@ typedef struct MaximalSlicingContext {
double *coeffs_eval;
- BiCGSTABContext bicgstab;
- int steps_since_inverse;
-
- uint64_t solve_count;
- uint64_t solve_time;
-
- uint64_t lu_solves_count;
- uint64_t lu_solves_time;
-
- uint64_t cg_solve_count;
- uint64_t cg_iter_count;
- uint64_t cg_time_total;
-
- uint64_t interp_geometry_count;
- uint64_t interp_geometry_time;
-
- uint64_t calc_eq_coeffs_count;
- uint64_t calc_eq_coeffs_time;
-
- uint64_t construct_matrix_count;
- uint64_t construct_matrix_time;
-
uint64_t grid_expand_count;
uint64_t grid_expand_time;
- // the grid of collocation points
- double *colloc_grid[2];
-
- // interpolation parameters
- int coord_system;
- int interp_operator;
- int interp_params;
-
- CCTK_REAL *interp_coords[3];
-
- int interp_vars_indices[NB_METRIC_VARS];
-
- CCTK_REAL *interp_values[NB_INTERP_VARS];
- CCTK_REAL *interp_values_prefilter[NB_INTERP_VARS];
- CCTK_INT interp_value_codes[NB_INTERP_VARS];
-
- CCTK_REAL *metric_u[6];
-
- CCTK_REAL *kij_kij;
- CCTK_REAL *trk;
-
- int nb_coeffs_x;
- int nb_coeffs_z;
- int nb_coeffs;
-
- int nb_colloc_points_x;
- int nb_colloc_points_z;
- int nb_colloc_points;
-
- int colloc_grid_order_x;
- int colloc_grid_order_z;
-
- double *mat;
- double *mat_f;
- double *rhs;
- double *coeffs;
- int *ipiv;
- double *basis_x_val;
- double *basis_x_dval;
- double *basis_x_d2val;
-
- double *basis_z_val;
- double *basis_z_dval;
- double *basis_z_d2val;
-
- double *basis_val_00;
- double *basis_val_20;
- double *basis_val_02;
- double *basis_val_11;
- double *basis_val_10;
- double *basis_val_01;
-
- double *eq_coeff_00;
- double *eq_coeff_20;
- double *eq_coeff_02;
- double *eq_coeff_11;
- double *eq_coeff_10;
- double *eq_coeff_01;
-
- double *coeff_scale;
-
- double *input_filter;
-
CoordPatch *patches;
int nb_patches;
+} QMSContext;
- // OpenCL / CLBLAS stuff
- cl_context cl_ctx;
- cl_command_queue cl_queue;
- cl_device_id ocl_device;
-
- cl_mem ocl_coeffs;
-} MaximalSlicingContext;
-
-int qms_maximal_solve(MaximalSlicingContext *ms);
-
-#include <sys/time.h>
-static inline int64_t gettime(void)
-{
- struct timeval tv;
- gettimeofday(&tv, NULL);
- return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
-}
+int qms_maximal_solve(QMSContext *ms);
+#endif /* QMS_QMS_H */
diff --git a/src/qms_solve.c b/src/qms_solve.c
new file mode 100644
index 0000000..7136f38
--- /dev/null
+++ b/src/qms_solve.c
@@ -0,0 +1,880 @@
+/*
+ * Quasimaximal slicing -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+
+#include <errno.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include "cctk.h"
+#include "cctk_Timers.h"
+#include "util_Table.h"
+
+#include "basis.h"
+#include "pssolve.h"
+#include "qms_solve.h"
+
+/* Total number of spectral coefficients / collocation points: the product of
+ * entries [0] and [1] of the respective per-direction count arrays.
+ * The argument is parenthesized so that any pointer-valued expression
+ * (e.g. &obj or a conditional) expands correctly. */
+#define NB_COEFFS(qms)        ((qms)->nb_coeffs[0]        * (qms)->nb_coeffs[1])
+#define NB_COLLOC_POINTS(qms) ((qms)->nb_colloc_points[0] * (qms)->nb_colloc_points[1])
+
+/* indices (in our code, not cactus structs) of the grid functions which we'll need to
+ * interpolate on the pseudospectral grid
+ *
+ * The enumerators are used as designated-initializer indices into
+ * metric_vars[] (which gives the thorn variable name for each of them) and as
+ * the values stored in interp_operation_indices[]. */
+enum MetricVars {
+ GTXX = 0, /* "gt11" in metric_vars[]; the five entries below are gt22..gt23 */
+ GTYY,
+ GTZZ,
+ GTXY,
+ GTXZ,
+ GTYZ,
+ PHI,
+ ATXX, /* "At11" in metric_vars[]; the five entries below are At22..At23 */
+ ATYY,
+ ATZZ,
+ ATXY,
+ ATXZ,
+ ATYZ,
+ K, /* "trK" in metric_vars[] */
+ XTX, /* "Xt1".."Xt3" */
+ XTY,
+ XTZ,
+ BETAX, /* "beta1".."beta3" */
+ BETAY,
+ BETAZ,
+ ALPHA,
+ KDOT_XX, /* "Kdot11".."Kdot23" */
+ KDOT_YY,
+ KDOT_ZZ,
+ KDOT_XY,
+ KDOT_XZ,
+ KDOT_YZ,
+ XTDOT_X, /* "Xtdot1".."Xtdot3" */
+ XTDOT_Y,
+ XTDOT_Z,
+ PHIDOT,
+ NB_METRIC_VARS, /* number of entries above; sizes interp_vars_indices[] in QMSSolverPriv */
+};
+
+/* indices of the interpolated values of the above grid functions and their derivatives
+ *
+ * Each enumerator indexes interp_operation_indices[] (which MetricVars entry
+ * it is derived from) and interp_operation_codes[] (which operation code is
+ * applied to it). The _DX/_DY/_DZ and _DXX.._DYZ suffixed entries are the
+ * ones that carry a nonzero operation code there. */
+enum InterpMetricVars {
+ I_GTXX = 0,
+ I_GTYY,
+ I_GTZZ,
+ I_GTXY,
+ I_GTXZ,
+ I_GTYZ,
+ I_PHI,
+ I_PHI_DX,
+ I_PHI_DY,
+ I_PHI_DZ,
+ I_ATXX,
+ I_ATYY,
+ I_ATZZ,
+ I_ATXY,
+ I_ATXZ,
+ I_ATYZ,
+ I_K,
+ I_XTX,
+ I_XTY,
+ I_XTZ,
+ I_BETAX,
+ I_BETAY,
+ I_BETAZ,
+ I_ALPHA,
+ I_ALPHA_DX,
+ I_ALPHA_DY,
+ I_ALPHA_DZ,
+ I_ALPHA_DXX,
+ I_ALPHA_DYY,
+ I_ALPHA_DZZ,
+ I_ALPHA_DXY,
+ I_ALPHA_DXZ,
+ I_ALPHA_DYZ,
+ I_KDOT_XX,
+ I_KDOT_YY,
+ I_KDOT_ZZ,
+ I_KDOT_XY,
+ I_KDOT_XZ,
+ I_KDOT_YZ,
+ I_XTDOT_X,
+ I_XTDOT_Y,
+ I_XTDOT_Z,
+ I_PHIDOT,
+ I_PHIDOT_DX,
+ I_PHIDOT_DY,
+ I_PHIDOT_DZ,
+ NB_INTERP_VARS, /* number of entries above; sizes interp_values[] and interp_value_codes[] in QMSSolverPriv */
+};
+
+struct QMSSolverPriv { /* private solver state, reached through QMSSolver.priv */
+ PSSolveContext *ps_ctx; /* handed to qms_pssolve_solve(); also holds the matrix/BiCGSTAB/LU counters printed by qms_solver_print_stats() */
+ cGH *gh; /* grid hierarchy passed to CCTK_InterpGridArrays() */
+
+ int colloc_grid_order[2];
+
+ double *eq_coeffs[PSSOLVE_DIFF_ORDER_NB]; /* one array per PSSOLVE_DIFF_ORDER_* index, one value per collocation point; filled by calc_eq_coeffs() */
+ double *rhs; /* right-hand side at each collocation point; filled by calc_eq_coeffs() */
+
+ double *coeff_scale; /* per-coefficient factor multiplied into the solution in qms_solver_solve() */
+
+ // interpolation parameters (all passed to CCTK_InterpGridArrays() in interp_geometry())
+ int coord_system;
+ int interp_operator;
+ int interp_params;
+
+ CCTK_REAL *interp_coords[3]; /* coordinates of the interpolation points; calc_eq_coeffs() reads [0] as x */
+
+ int interp_vars_indices[NB_METRIC_VARS];
+ CCTK_REAL *interp_values[NB_INTERP_VARS]; /* interpolation results, indexed by enum InterpMetricVars, then by collocation point */
+ CCTK_INT interp_value_codes[NB_INTERP_VARS];
+
+#if HAVE_OPENCL
+ // OpenCL / CLBLAS stuff
+ cl_context ocl_ctx;
+ cl_command_queue ocl_queue;
+#endif
+
+ uint64_t solve_count; /* number of completed qms_solver_solve() calls */
+ uint64_t solve_time; /* summed gettime() differences; qms_solver_print_stats() divides the *_time fields by 1e6 to print seconds */
+
+ uint64_t interp_geometry_count;
+ uint64_t interp_geometry_time;
+
+ uint64_t calc_eq_coeffs_count;
+ uint64_t calc_eq_coeffs_time;
+};
+
+/* mapping between our indices and thorn names
+ *
+ * Indexed by enum MetricVars. The set of names is chosen at compile time:
+ * the ML_CCZ4 variables when QMS_CCZ4 is nonzero, the ML_BSSN ones otherwise.
+ * Both branches list the same indices. */
+static const char *metric_vars[] = {
+#if QMS_CCZ4 /* CCZ4 variable names */
+ [GTXX] = "ML_CCZ4::gt11",
+ [GTYY] = "ML_CCZ4::gt22",
+ [GTZZ] = "ML_CCZ4::gt33",
+ [GTXY] = "ML_CCZ4::gt12",
+ [GTXZ] = "ML_CCZ4::gt13",
+ [GTYZ] = "ML_CCZ4::gt23",
+ [ATXX] = "ML_CCZ4::At11",
+ [ATYY] = "ML_CCZ4::At22",
+ [ATZZ] = "ML_CCZ4::At33",
+ [ATXY] = "ML_CCZ4::At12",
+ [ATXZ] = "ML_CCZ4::At13",
+ [ATYZ] = "ML_CCZ4::At23",
+ [PHI] = "ML_CCZ4::phi",
+ [K] = "ML_CCZ4::trK",
+ [XTX] = "ML_CCZ4::Xt1",
+ [XTY] = "ML_CCZ4::Xt2",
+ [XTZ] = "ML_CCZ4::Xt3",
+ [BETAX] = "ML_CCZ4::beta1",
+ [BETAY] = "ML_CCZ4::beta2",
+ [BETAZ] = "ML_CCZ4::beta3",
+ [ALPHA] = "ML_CCZ4::alpha",
+ [KDOT_XX] = "ML_CCZ4::Kdot11",
+ [KDOT_YY] = "ML_CCZ4::Kdot22",
+ [KDOT_ZZ] = "ML_CCZ4::Kdot33",
+ [KDOT_XY] = "ML_CCZ4::Kdot12",
+ [KDOT_XZ] = "ML_CCZ4::Kdot13",
+ [KDOT_YZ] = "ML_CCZ4::Kdot23",
+ [XTDOT_X] = "ML_CCZ4::Xtdot1",
+ [XTDOT_Y] = "ML_CCZ4::Xtdot2",
+ [XTDOT_Z] = "ML_CCZ4::Xtdot3",
+ [PHIDOT] = "ML_CCZ4::phidot",
+#else /* BSSN variable names */
+ [GTXX] = "ML_BSSN::gt11",
+ [GTYY] = "ML_BSSN::gt22",
+ [GTZZ] = "ML_BSSN::gt33",
+ [GTXY] = "ML_BSSN::gt12",
+ [GTXZ] = "ML_BSSN::gt13",
+ [GTYZ] = "ML_BSSN::gt23",
+ [ATXX] = "ML_BSSN::At11",
+ [ATYY] = "ML_BSSN::At22",
+ [ATZZ] = "ML_BSSN::At33",
+ [ATXY] = "ML_BSSN::At12",
+ [ATXZ] = "ML_BSSN::At13",
+ [ATYZ] = "ML_BSSN::At23",
+ [PHI] = "ML_BSSN::phi",
+ [K] = "ML_BSSN::trK",
+ [XTX] = "ML_BSSN::Xt1",
+ [XTY] = "ML_BSSN::Xt2",
+ [XTZ] = "ML_BSSN::Xt3",
+ [BETAX] = "ML_BSSN::beta1",
+ [BETAY] = "ML_BSSN::beta2",
+ [BETAZ] = "ML_BSSN::beta3",
+ [ALPHA] = "ML_BSSN::alpha",
+ //[ALPHA] = "ADMBase::alp",
+ [KDOT_XX] = "ML_BSSN::Kdot11",
+ [KDOT_YY] = "ML_BSSN::Kdot22",
+ [KDOT_ZZ] = "ML_BSSN::Kdot33",
+ [KDOT_XY] = "ML_BSSN::Kdot12",
+ [KDOT_XZ] = "ML_BSSN::Kdot13",
+ [KDOT_YZ] = "ML_BSSN::Kdot23",
+ [XTDOT_X] = "ML_BSSN::Xtdot1",
+ [XTDOT_Y] = "ML_BSSN::Xtdot2",
+ [XTDOT_Z] = "ML_BSSN::Xtdot3",
+ [PHIDOT] = "ML_BSSN::phidot",
+#endif /* QMS_CCZ4 */
+};
+
+/* mapping between the cactus grid values and interpolated values
+ *
+ * Indexed by enum InterpMetricVars; each entry is the enum MetricVars index of
+ * the grid function that the interpolated quantity is taken from. Derivative
+ * entries (I_*_DX etc.) therefore point at the same source as their plain
+ * counterpart. */
+static const CCTK_INT interp_operation_indices[] = {
+ [I_GTXX] = GTXX,
+ [I_GTYY] = GTYY,
+ [I_GTZZ] = GTZZ,
+ [I_GTXY] = GTXY,
+ [I_GTXZ] = GTXZ,
+ [I_GTYZ] = GTYZ,
+ [I_PHI] = PHI,
+ [I_PHI_DX] = PHI,
+ [I_PHI_DY] = PHI,
+ [I_PHI_DZ] = PHI,
+ [I_ATXX] = ATXX,
+ [I_ATYY] = ATYY,
+ [I_ATZZ] = ATZZ,
+ [I_ATXY] = ATXY,
+ [I_ATXZ] = ATXZ,
+ [I_ATYZ] = ATYZ,
+ [I_K] = K,
+ [I_XTX] = XTX,
+ [I_XTY] = XTY,
+ [I_XTZ] = XTZ,
+ [I_BETAX] = BETAX,
+ [I_BETAY] = BETAY,
+ [I_BETAZ] = BETAZ,
+ [I_ALPHA] = ALPHA,
+ [I_ALPHA_DX] = ALPHA,
+ [I_ALPHA_DY] = ALPHA,
+ [I_ALPHA_DZ] = ALPHA,
+ [I_ALPHA_DXX] = ALPHA,
+ [I_ALPHA_DYY] = ALPHA,
+ [I_ALPHA_DZZ] = ALPHA,
+ [I_ALPHA_DXY] = ALPHA,
+ [I_ALPHA_DXZ] = ALPHA,
+ [I_ALPHA_DYZ] = ALPHA,
+ [I_KDOT_XX] = KDOT_XX,
+ [I_KDOT_YY] = KDOT_YY,
+ [I_KDOT_ZZ] = KDOT_ZZ,
+ [I_KDOT_XY] = KDOT_XY,
+ [I_KDOT_XZ] = KDOT_XZ,
+ [I_KDOT_YZ] = KDOT_YZ,
+ [I_XTDOT_X] = XTDOT_X,
+ [I_XTDOT_Y] = XTDOT_Y,
+ [I_XTDOT_Z] = XTDOT_Z,
+ [I_PHIDOT] = PHIDOT,
+ [I_PHIDOT_DX] = PHIDOT,
+ [I_PHIDOT_DY] = PHIDOT,
+ [I_PHIDOT_DZ] = PHIDOT,
+};
+
+/* the operation (plain value or x/y/z-derivative) to apply during interpolation
+ *
+ * Indexed by enum InterpMetricVars. As used in this table: 0 for the plain
+ * value, 1/2/3 for the _DX/_DY/_DZ entries, and the two-digit codes
+ * 11/22/33/12/13/23 for the _DXX/_DYY/_DZZ/_DXY/_DXZ/_DYZ entries. */
+static const CCTK_INT interp_operation_codes[] = {
+ [I_GTXX] = 0,
+ [I_GTYY] = 0,
+ [I_GTZZ] = 0,
+ [I_GTXY] = 0,
+ [I_GTXZ] = 0,
+ [I_GTYZ] = 0,
+ [I_PHI] = 0,
+ [I_PHI_DX] = 1,
+ [I_PHI_DY] = 2,
+ [I_PHI_DZ] = 3,
+ [I_ATXX] = 0,
+ [I_ATYY] = 0,
+ [I_ATZZ] = 0,
+ [I_ATXY] = 0,
+ [I_ATXZ] = 0,
+ [I_ATYZ] = 0,
+ [I_K] = 0,
+ [I_XTX] = 0,
+ [I_XTY] = 0,
+ [I_XTZ] = 0,
+ [I_BETAX] = 0,
+ [I_BETAY] = 0,
+ [I_BETAZ] = 0,
+ [I_ALPHA] = 0,
+ [I_ALPHA_DX] = 1,
+ [I_ALPHA_DY] = 2,
+ [I_ALPHA_DZ] = 3,
+ [I_ALPHA_DXX] = 11,
+ [I_ALPHA_DYY] = 22,
+ [I_ALPHA_DZZ] = 33,
+ [I_ALPHA_DXY] = 12,
+ [I_ALPHA_DXZ] = 13,
+ [I_ALPHA_DYZ] = 23,
+ [I_KDOT_XX] = 0,
+ [I_KDOT_YY] = 0,
+ [I_KDOT_ZZ] = 0,
+ [I_KDOT_XY] = 0,
+ [I_KDOT_XZ] = 0,
+ [I_KDOT_YZ] = 0,
+ [I_XTDOT_X] = 0,
+ [I_XTDOT_Y] = 0,
+ [I_XTDOT_Z] = 0,
+ [I_PHIDOT] = 0,
+ [I_PHIDOT_DX] = 1,
+ [I_PHIDOT_DY] = 2,
+ [I_PHIDOT_DZ] = 3,
+};
+
+/* interpolate the cactus gridfunctions onto the pseudospectral grid
+ *
+ * Fills s->interp_values[] with one value per collocation point for each
+ * entry of enum InterpMetricVars.
+ *
+ * Returns 0 on success, or the negative code reported by
+ * CCTK_InterpGridArrays() on failure. */
+static int interp_geometry(QMSSolver *ctx)
+{
+    QMSSolverPriv *s = ctx->priv;
+    int ret;
+
+    ret = CCTK_InterpGridArrays(s->gh, 3, s->interp_operator, s->interp_params,
+                                s->coord_system, NB_COLLOC_POINTS(ctx), CCTK_VARIABLE_REAL,
+                                (const void * const *)s->interp_coords,
+                                ARRAY_ELEMS(s->interp_vars_indices), s->interp_vars_indices,
+                                ARRAY_ELEMS(s->interp_values), s->interp_value_codes,
+                                (void * const *)s->interp_values);
+    if (ret < 0) {
+        CCTK_WARN(0, "Error interpolating");
+        /* hand the failure to the caller (which checks for ret < 0) in case
+         * the warning above did not terminate the run */
+        return ret;
+    }
+
+    return 0;
+}
+
+/* evaluate the equation coefficients at the collocation points
+ *
+ * For every collocation point this reads the interpolated quantities from
+ * s->interp_values[] and writes the six s->eq_coeffs[PSSOLVE_DIFF_ORDER_*]
+ * entries and s->rhs[] for that point. Points with x <= EPS are treated with
+ * the on-axis form of the terms that otherwise divide by x.
+ *
+ * Always returns 0. */
+static int calc_eq_coeffs(QMSSolver *ctx)
+{
+    QMSSolverPriv *s = ctx->priv;
+
+    for (int i = 0; i < NB_COLLOC_POINTS(ctx); i++) {
+        const double x = s->interp_coords[0][i];
+        const int zaxis = x <= EPS;
+
+        double K[3][3], Km[3][3], Ku[3][3], gtu[3][3];
+        double k2, kij_dij_alpha, k_kdot, k3;
+
+        const double gtxx = s->interp_values[I_GTXX][i];
+        const double gtyy = s->interp_values[I_GTYY][i];
+        const double gtzz = s->interp_values[I_GTZZ][i];
+        const double gtxy = s->interp_values[I_GTXY][i];
+        const double gtxz = s->interp_values[I_GTXZ][i];
+        const double gtyz = s->interp_values[I_GTYZ][i];
+
+        const double gt[3][3] = {{ gtxx, gtxy, gtxz },
+                                 { gtxy, gtyy, gtyz },
+                                 { gtxz, gtyz, gtzz }};
+
+        const double Atxx = s->interp_values[I_ATXX][i];
+        const double Atyy = s->interp_values[I_ATYY][i];
+        const double Atzz = s->interp_values[I_ATZZ][i];
+        const double Atxy = s->interp_values[I_ATXY][i];
+        const double Atxz = s->interp_values[I_ATXZ][i];
+        const double Atyz = s->interp_values[I_ATYZ][i];
+
+        const double At[3][3] = {{ Atxx, Atxy, Atxz },
+                                 { Atxy, Atyy, Atyz },
+                                 { Atxz, Atyz, Atzz }};
+
+        const double phi    = s->interp_values[I_PHI][i];
+        const double phi_dx = s->interp_values[I_PHI_DX][i];
+        const double phi_dz = s->interp_values[I_PHI_DZ][i];
+
+        const double phidot    = s->interp_values[I_PHIDOT][i];
+        const double phidot_dx = s->interp_values[I_PHIDOT_DX][i];
+        const double phidot_dz = s->interp_values[I_PHIDOT_DZ][i];
+
+        const double trK     = s->interp_values[I_K][i];
+        const double kdot_xx = s->interp_values[I_KDOT_XX][i];
+        const double kdot_yy = s->interp_values[I_KDOT_YY][i];
+        const double kdot_zz = s->interp_values[I_KDOT_ZZ][i];
+        const double kdot_xy = s->interp_values[I_KDOT_XY][i];
+        const double kdot_xz = s->interp_values[I_KDOT_XZ][i];
+        const double kdot_yz = s->interp_values[I_KDOT_YZ][i];
+
+        const double kdot[3][3] = {{ kdot_xx, kdot_xy, kdot_xz },
+                                   { kdot_xy, kdot_yy, kdot_yz },
+                                   { kdot_xz, kdot_yz, kdot_zz }};
+
+        const double alpha     = s->interp_values[I_ALPHA][i];
+        const double dx_alpha  = s->interp_values[I_ALPHA_DX][i];
+        const double dz_alpha  = s->interp_values[I_ALPHA_DZ][i];
+        const double dxx_alpha = s->interp_values[I_ALPHA_DXX][i];
+        const double dzz_alpha = s->interp_values[I_ALPHA_DZZ][i];
+        const double dxz_alpha = s->interp_values[I_ALPHA_DXZ][i];
+
+        /* second derivatives of alpha: the xy/yz entries are set to 0 and the
+         * yy entry is dx_alpha / x off the axis, dxx_alpha on it */
+        const double dij_alpha[3][3] = {{ dxx_alpha, 0,                                  dxz_alpha },
+                                        { 0,         zaxis ? dxx_alpha : dx_alpha / x,   0         },
+                                        { dxz_alpha, 0,                                  dzz_alpha }};
+
+        const double Xtx = s->interp_values[I_XTX][i];
+        const double Xtz = s->interp_values[I_XTZ][i];
+
+        const double Xtdot_x = s->interp_values[I_XTDOT_X][i];
+        const double Xtdot_z = s->interp_values[I_XTDOT_Z][i];
+
+        const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
+
+        // \tilde{γ}^{ij}
+        gtu[0][0] =  (gtyy * gtzz - SQR(gtyz)) / det;
+        gtu[1][1] =  (gtxx * gtzz - SQR(gtxz)) / det;
+        gtu[2][2] =  (gtxx * gtyy - SQR(gtxy)) / det;
+        gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
+        gtu[0][2] =  (gtxy * gtyz - gtyy * gtxz) / det;
+        gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
+        gtu[1][0] = gtu[0][1];
+        gtu[2][0] = gtu[0][2];
+        gtu[2][1] = gtu[1][2];
+
+        // K_{ij}
+        /* NOTE(review): the trace term is added as gt_ij * trK, without the
+         * 1/3 and 1/phi^2 factors one might expect next to At_ij / phi^2 --
+         * confirm this is intended */
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                K[j][k] = At[j][k] / SQR(phi) + gt[j][k] * trK;
+
+        /* mixed-index form: phi^2 * gtu^{jl} * K_{lk} */
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += SQR(phi) * gtu[j][l] * K[l][k];
+                Km[j][k] = val;
+            }
+
+        // K^{ij}
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += SQR(phi) * gtu[j][l] * Km[k][l];
+                Ku[j][k] = val;
+            }
+
+        /* contraction of K^{ij} with the second derivatives of alpha */
+        kij_dij_alpha = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                kij_dij_alpha += Ku[j][k] * dij_alpha[j][k];
+
+        /* contraction of K^{ij} with kdot_{ij} */
+        k_kdot = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                k_kdot += kdot[j][k] * Ku[j][k];
+
+        /* cubic contraction: Km[k][l] * Ku[l][j] * K[j][k] summed over j, k, l */
+        k3 = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++) {
+                double val = 0.0;
+                for (int l = 0; l < 3; l++)
+                    val += Km[k][l] * Ku[l][j];
+                k3 += val * K[j][k];
+            }
+
+        // K_{ij} K^{ij}
+        k2 = 0.0;
+        for (int j = 0; j < 3; j++)
+            for (int k = 0; k < 3; k++)
+                k2 += Km[j][k] * Km[k][j];
+
+        const double gtuxx = gtu[0][0];
+        const double gtuyy = gtu[1][1];
+        const double gtuzz = gtu[2][2];
+        const double gtuxz = gtu[0][2];
+
+        const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
+        const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
+
+        const double Xdot_x = 2 * phi * phidot * Xtx + SQR(phi) * Xtdot_x + phi * (phidot_dx * gtuxx + phidot_dz * gtuxz) -
+                              phidot * (phi_dx * gtuxx + phi_dz * gtuxz) + 2 * alpha * (phi_dx * Ku[0][0] + phi_dz * Ku[0][2]) / phi;
+        const double Xdot_z = 2 * phi * phidot * Xtz + SQR(phi) * Xtdot_z + phi * (phidot_dz * gtuzz + phidot_dx * gtuxz) -
+                              phidot * (phi_dz * gtuzz + phi_dx * gtuxz) + 2 * alpha * (phi_dz * Ku[2][2] + phi_dx * Ku[0][2]) / phi;
+
+        /* on the axis the gtuyy term is folded into the second-x-derivative
+         * coefficient; off the axis it appears as gtuyy / x in the
+         * first-x-derivative coefficient */
+        s->eq_coeffs[PSSOLVE_DIFF_ORDER_20][i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0));
+        s->eq_coeffs[PSSOLVE_DIFF_ORDER_02][i] = SQR(phi) * gtuzz;
+        s->eq_coeffs[PSSOLVE_DIFF_ORDER_11][i] = SQR(phi) * gtuxz * 2;
+        s->eq_coeffs[PSSOLVE_DIFF_ORDER_10][i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0);
+        s->eq_coeffs[PSSOLVE_DIFF_ORDER_01][i] = -Xz;
+        s->eq_coeffs[PSSOLVE_DIFF_ORDER_00][i] = -k2;
+
+        s->rhs[i] = -2 * alpha * kij_dij_alpha + Xdot_x * dx_alpha + Xdot_z * dz_alpha +
+                    2 * (k_kdot + 2 * alpha * k3) * alpha;
+    }
+
+    return 0;
+}
+
+/*
+ * Run one full solve: interpolate the geometry onto the collocation grid,
+ * evaluate the equation coefficients, solve the pseudospectral system into
+ * ctx->coeffs and rescale the result by coeff_scale.
+ *
+ * The two preparation stages are wrapped in Cactus timers and their elapsed
+ * gettime() differences are accumulated in the private counters; the overall
+ * counters are only updated when the whole solve succeeds.
+ *
+ * Returns 0 on success or the first negative value returned by a stage.
+ */
+int qms_solver_solve(QMSSolver *ctx)
+{
+    QMSSolverPriv *priv = ctx->priv;
+    const int64_t t_total = gettime();
+    int64_t t_stage;
+    int err;
+
+    /* interpolate the metric values and construct the quantities we'll need */
+    CCTK_TimerStart("QuasiMaximalSlicing_interp_geometry");
+    t_stage = gettime();
+    err = interp_geometry(ctx);
+    priv->interp_geometry_time += gettime() - t_stage;
+    priv->interp_geometry_count++;
+    CCTK_TimerStop("QuasiMaximalSlicing_interp_geometry");
+    if (err < 0)
+        return err;
+
+    CCTK_TimerStart("QuasiMaximalSlicing_calc_eq_coeffs");
+    t_stage = gettime();
+    err = calc_eq_coeffs(ctx);
+    priv->calc_eq_coeffs_time += gettime() - t_stage;
+    priv->calc_eq_coeffs_count++;
+    CCTK_TimerStop("QuasiMaximalSlicing_calc_eq_coeffs");
+    if (err < 0)
+        return err;
+
+    err = qms_pssolve_solve(priv->ps_ctx, (const double * const *)priv->eq_coeffs,
+                            priv->rhs, ctx->coeffs);
+    if (err < 0)
+        return err;
+
+    /* apply the per-coefficient scaling to the freshly computed solution */
+    {
+        const int nb_coeffs = NB_COEFFS(ctx);
+        int idx = 0;
+
+        while (idx < nb_coeffs) {
+            ctx->coeffs[idx] *= priv->coeff_scale[idx];
+            idx++;
+        }
+    }
+
+    priv->solve_time += gettime() - t_total;
+    priv->solve_count++;
+
+    return 0;
+}
+
+/*
+ * Print the accumulated timing statistics of the solver to stderr: one line
+ * each for geometry interpolation, equation-coefficient evaluation,
+ * pseudospectral matrix construction, BiCGSTAB and LU.
+ *
+ * Each line starts with that stage's share of the total solve time in
+ * percent. Times are divided by 1e6 for the "s" figures and by 1e3 for the
+ * "ms" figures. If a counter is still zero the corresponding ratios are
+ * floating-point divisions by zero and print as non-finite values.
+ *
+ * The counters are cast to unsigned long long and printed with %llu so the
+ * format matches the argument type on every platform.
+ */
+void qms_solver_print_stats(QMSSolver *ctx)
+{
+    QMSSolverPriv *s = ctx->priv;
+
+    fprintf(stderr,
+            "%g%% interpolate geometry: %llu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->interp_geometry_time * 100 / s->solve_time,
+            (unsigned long long)s->interp_geometry_count, (double)s->interp_geometry_time / 1e6,
+            (double)s->interp_geometry_time / s->interp_geometry_count / 1e3);
+    fprintf(stderr,
+            "%g%% calc equation coefficients: %llu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->calc_eq_coeffs_time * 100 / s->solve_time,
+            (unsigned long long)s->calc_eq_coeffs_count, (double)s->calc_eq_coeffs_time / 1e6,
+            (double)s->calc_eq_coeffs_time / s->calc_eq_coeffs_count / 1e3);
+    fprintf(stderr,
+            "%g%% pseudospectral matrix construction: %llu, "
+            "total time %g s, avg time per call %g ms\n",
+            (double)s->ps_ctx->construct_matrix_time * 100 / s->solve_time,
+            (unsigned long long)s->ps_ctx->construct_matrix_count, (double)s->ps_ctx->construct_matrix_time / 1e6,
+            (double)s->ps_ctx->construct_matrix_time / s->ps_ctx->construct_matrix_count / 1e3);
+    fprintf(stderr,
+            "%g%% BiCGSTAB %llu solves, "
+            "%llu iterations, total time %g s, "
+            "avg iterations per solve %g, avg time per solve %g ms, "
+            "avg time per iteration %g ms\n",
+            (double)s->ps_ctx->cg_time_total * 100 / s->solve_time,
+            (unsigned long long)s->ps_ctx->cg_solve_count, (unsigned long long)s->ps_ctx->cg_iter_count,
+            (double)s->ps_ctx->cg_time_total / 1e6,
+            (double)s->ps_ctx->cg_iter_count / s->ps_ctx->cg_solve_count,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_solve_count / 1e3,
+            (double)s->ps_ctx->cg_time_total / s->ps_ctx->cg_iter_count / 1e3);
+    fprintf(stderr,
+            "%g%% LU %llu solves, total time %g s, avg time per solve %g ms\n",
+            (double)s->ps_ctx->lu_solves_time * 100 / s->solve_time,
+            (unsigned long long)s->ps_ctx->lu_solves_count, (double)s->ps_ctx->lu_solves_time / 1e6,
+            (double)s->ps_ctx->lu_solves_time / s->ps_ctx->lu_solves_count / 1e3);
+}
+
+/*
+ * Try to set up OpenCL: take the first platform and its first GPU device,
+ * create a context and a command queue on it and initialize clBLAS.
+ *
+ * Failure is not reported to the caller. A message is printed to stderr and
+ * s->ocl_ctx / s->ocl_queue are both left zero; on success both are set.
+ *
+ * When HAVE_OPENCL is false the function body is empty.
+ */
+static void init_opencl(QMSSolver *ctx)
+#if HAVE_OPENCL
+{
+    QMSSolverPriv *s = ctx->priv;
+    /* the OpenCL entry points report status as cl_int and counts as cl_uint */
+    cl_int err;
+    cl_uint count;
+    cl_platform_id platform;
+    cl_context_properties props[3];
+    cl_device_id ocl_device;
+
+    err = clGetPlatformIDs(1, &platform, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL platform ID\n");
+        return;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count);
+    if (err != CL_SUCCESS || count < 1) {
+        fprintf(stderr, "Could not get an OpenCL device ID\n");
+        return;
+    }
+
+    /* zero-terminated property list selecting the platform */
+    props[0] = CL_CONTEXT_PLATFORM;
+    props[1] = (cl_context_properties)platform;
+    props[2] = 0;
+
+    s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err);
+    if (err != CL_SUCCESS || !s->ocl_ctx) {
+        fprintf(stderr, "Could not create an OpenCL context\n");
+        /* go through the common cleanup so the handles end up zero */
+        goto fail;
+    }
+
+    s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err);
+    if (err != CL_SUCCESS || !s->ocl_queue) {
+        fprintf(stderr, "Could not create an OpenCL command queue: %d\n", err);
+        goto fail;
+    }
+
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error setting up clBLAS\n");
+        goto fail;
+    }
+
+    return;
+fail:
+    /* release whatever was created and leave both handles zero */
+    if (s->ocl_queue)
+        clReleaseCommandQueue(s->ocl_queue);
+    s->ocl_queue = 0;
+
+    if (s->ocl_ctx)
+        clReleaseContext(s->ocl_ctx);
+    s->ocl_ctx = 0;
+}
+#else
+{
+}
+#endif
+
+/*
+ * Allocate and initialize a solver context.
+ *
+ * pctx               on success, receives the newly allocated context;
+ *                    not written on failure
+ * cctkGH             Cactus grid hierarchy, stored for later use
+ * basis_order_r/_z   number of basis functions (and collocation points) in
+ *                    direction 0 and direction 1 respectively
+ * sf                 not referenced in this function
+ * filter_power       exponent used when building the coefficient scaling
+ *                    factors (coeff_scale)
+ * input_filter_power not referenced in this function
+ *
+ * Returns 0 on success and -ENOMEM when an allocation fails. All other
+ * errors are reported through CCTK_WARN/CCTK_VWarn at level 0
+ * (NOTE(review): level 0 is fatal in Cactus -- confirm against the Cactus
+ * documentation).
+ */
+int qms_solver_init(QMSSolver **pctx,
+ cGH *cctkGH,
+ int basis_order_r, int basis_order_z,
+ double sf, double filter_power, double input_filter_power)
+{
+ QMSSolver *ctx;
+ QMSSolverPriv *s;
+ int ret;
+
+ /* both structures are zero-filled, so the cleanup at fail: can free
+ * pointers that were never allocated */
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->priv = calloc(1, sizeof(*ctx->priv));
+ if (!ctx->priv)
+ goto fail;
+ s = ctx->priv;
+
+ s->gh = cctkGH;
+
+ /* direction 0 always uses the even SB basis; direction 1 depends on
+ * whether the build uses polar coordinates */
+ ctx->basis[0] = &qms_sb_even_basis;
+#if QMS_POLAR
+ ctx->basis[1] = &qms_cos_even_basis;
+#else
+ ctx->basis[1] = &qms_sb_even_basis;
+#endif
+
+ ctx->nb_coeffs[0] = basis_order_r;
+ ctx->nb_coeffs[1] = basis_order_z;
+
+ /* as many collocation points as coefficients in each direction */
+ ctx->nb_colloc_points[0] = basis_order_r;
+ ctx->nb_colloc_points[1] = basis_order_z;
+
+ if (NB_COLLOC_POINTS(ctx) != NB_COEFFS(ctx))
+ CCTK_WARN(0, "Non-square collocation matrix");
+
+ s->colloc_grid_order[0] = ctx->nb_colloc_points[0];
+ s->colloc_grid_order[1] = ctx->nb_colloc_points[1];
+
+ /* 32-byte aligned buffers for the solution and the right-hand side;
+ * the two return codes are OR-ed so either failure is caught below */
+ ret = posix_memalign((void**)&ctx->coeffs, 32, sizeof(*ctx->coeffs) * NB_COEFFS(ctx));
+ ret |= posix_memalign((void**)&s->rhs, 32, sizeof(*s->rhs) * NB_COLLOC_POINTS(ctx));
+ if (ret)
+ goto fail;
+
+ /* scale_factor is not declared in this function; it is set here from the
+ * last collocation point of the direction-0 basis rather than from the
+ * sf argument. NOTE(review): presumably this must happen before the
+ * pseudospectral context below evaluates the basis -- confirm. */
+ //FIXME
+ scale_factor = 1.0;
+ scale_factor = (64.0 / ctx->basis[0]->colloc_point(s->colloc_grid_order[0], ctx->nb_colloc_points[0] - 1));
+ fprintf(stderr, "scale factor %16.16g\n", scale_factor);
+
+ /* no return value to check; inspect s->ocl_ctx / s->ocl_queue instead */
+ init_opencl(ctx);
+
+ ret = qms_pssolve_context_alloc(&s->ps_ctx);
+ if (ret < 0)
+ CCTK_WARN(0, "Error allocating the pseudospectral solver");
+
+ /* configure the pseudospectral solver with the same bases and orders */
+ s->ps_ctx->basis[0] = ctx->basis[0];
+ s->ps_ctx->basis[1] = ctx->basis[1];
+ s->ps_ctx->solve_order[0] = basis_order_r;
+ s->ps_ctx->solve_order[1] = basis_order_z;
+#if HAVE_OPENCL
+ s->ps_ctx->ocl_ctx = s->ocl_ctx;
+ s->ps_ctx->ocl_queue = s->ocl_queue;
+#endif
+
+ ret = qms_pssolve_context_init(s->ps_ctx);
+ if (ret < 0)
+ CCTK_WARN(0, "Error initializing the pseudospectral solver");
+
+ /* dump the collocation grids of both directions to stderr, one row per
+ * index; the shorter direction is padded with tabs */
+ for (int i = 0; i < MAX(s->ps_ctx->solve_order[0], s->ps_ctx->solve_order[1]); i++) {
+ fprintf(stderr, "%d ", i);
+ if (i < s->ps_ctx->solve_order[0])
+ fprintf(stderr, "%g\t", s->ps_ctx->colloc_grid[0][i]);
+ else
+ fprintf(stderr, "\t\t");
+ if (i < s->ps_ctx->solve_order[1])
+ fprintf(stderr, "%g\t", s->ps_ctx->colloc_grid[1][i]);
+ fprintf(stderr, "\n");
+ }
+
+ /* one array of equation coefficients per derivative order, each with one
+ * value per collocation point */
+ for (int i = 0; i < ARRAY_ELEMS(s->eq_coeffs); i++) {
+ ret = posix_memalign((void**)&s->eq_coeffs[i], 32,
+ NB_COLLOC_POINTS(ctx) * sizeof(*s->eq_coeffs[i]));
+ if (ret)
+ goto fail;
+ }
+
+ /* coordinate arrays handed to the interpolator; ret is zero on entry, so
+ * OR-ing collects a failure from any of the allocations */
+ for (int i = 0; i < ARRAY_ELEMS(s->interp_coords); i++) {
+ ret |= posix_memalign((void**)&s->interp_coords[i], 32,
+ NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_coords[i]));
+ }
+ if (ret)
+ goto fail;
+
+ /* fill in the Cartesian coordinates of every collocation point; the
+ * points lie in the y = 0 plane and are stored with the direction-0
+ * index varying fastest */
+ for (int i = 0; i < ctx->nb_colloc_points[1]; i++) {
+ for (int j = 0; j < ctx->nb_colloc_points[0]; j++) {
+#if QMS_POLAR
+ double phi = s->ps_ctx->colloc_grid[1][i];
+ double r = s->ps_ctx->colloc_grid[0][j];
+
+ double x = r * cos(phi);
+ double z = r * sin(phi);
+#else
+ double x = s->ps_ctx->colloc_grid[0][j];
+ double z = s->ps_ctx->colloc_grid[1][i];
+#endif
+
+ s->interp_coords[0][i * ctx->nb_colloc_points[0] + j] = x;
+ s->interp_coords[1][i * ctx->nb_colloc_points[0] + j] = 0;
+ s->interp_coords[2][i * ctx->nb_colloc_points[0] + j] = z;
+ }
+ }
+
+ /* per-coefficient scaling applied to the solution after every solve:
+ * the product of exp(-36 * (index / count)^filter_power) over the two
+ * directions */
+ ret = posix_memalign((void**)&s->coeff_scale, 32, NB_COEFFS(ctx) * sizeof(*s->coeff_scale));
+ if (ret)
+ goto fail;
+ for (int j = 0; j < ctx->nb_coeffs[1]; j++)
+ for (int i = 0; i < ctx->nb_coeffs[0]; i++) {
+ s->coeff_scale[j * ctx->nb_coeffs[0] + i] = exp(-36.0 * pow((double)i / ctx->nb_coeffs[0], filter_power)) *
+ exp(-36.0 * pow((double)j / ctx->nb_coeffs[1], filter_power));
+ }
+
+ /* output buffers for the interpolator, all of type CCTK_VARIABLE_REAL */
+ for (int i = 0; i < ARRAY_ELEMS(s->interp_values); i++) {
+ ret = posix_memalign((void**)&s->interp_values[i], 32,
+ NB_COLLOC_POINTS(ctx) * sizeof(*s->interp_values[i]));
+ if (ret)
+ goto fail;
+ s->interp_value_codes[i] = CCTK_VARIABLE_REAL;
+ }
+
+ /* resolve the names in metric_vars to Cactus variable indices */
+ for (int i = 0; i < ARRAY_ELEMS(metric_vars); i++) {
+ s->interp_vars_indices[i] = CCTK_VarIndex(metric_vars[i]);
+ if (s->interp_vars_indices[i] < 0)
+ CCTK_VWarn(0, __LINE__, __FILE__, CCTK_THORNSTRING, "Error getting the index of variable: %s\n", metric_vars[i]);
+ }
+
+ /* handles and parameter table used by the interpolation calls */
+ s->coord_system = CCTK_CoordSystemHandle("cart3d");
+ if (s->coord_system < 0)
+ CCTK_WARN(0, "Error getting the coordinate system");
+
+ s->interp_operator = CCTK_InterpHandle("Lagrange polynomial interpolation (tensor product)");
+ if (s->interp_operator < 0)
+ CCTK_WARN(0, "Error getting the interpolation operator");
+
+ s->interp_params = Util_TableCreateFromString("order=4 want_global_mode=1");
+ if (s->interp_params < 0)
+ CCTK_WARN(0, "Error creating interpolation parameters table");
+
+ ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+ interp_operation_codes, "operation_codes");
+ if (ret < 0)
+ CCTK_WARN(0, "Error setting operation codes");
+
+ ret = Util_TableSetIntArray(s->interp_params, NB_INTERP_VARS,
+ interp_operation_indices, "operand_indices");
+ if (ret < 0)
+ CCTK_WARN(0, "Error setting operand indices");
+
+ /* the return values of the timer creation calls are not checked */
+ CCTK_TimerCreate("QuasiMaximalSlicing_Solve");
+ CCTK_TimerCreate("QuasiMaximalSlicing_Expand");
+ CCTK_TimerCreate("QuasiMaximalSlicing_interp_geometry");
+ CCTK_TimerCreate("QuasiMaximalSlicing_calc_eq_coeffs");
+ CCTK_TimerCreate("QuasiMaximalSlicing_construct_matrix");
+ CCTK_TimerCreate("QuasiMaximalSlicing_solve_LU");
+ CCTK_TimerCreate("QuasiMaximalSlicing_solve_BiCGSTAB");
+
+ *pctx = ctx;
+ return 0;
+fail:
+ /* every goto fail above follows a failed allocation */
+ qms_solver_free(&ctx);
+ return -ENOMEM;
+}
+
+/*
+ * Destroy a solver context and everything it owns, then set *pctx to NULL.
+ *
+ * Works on partially initialized contexts (this is what the failure path of
+ * qms_solver_init() relies on). Does nothing when *pctx is already NULL.
+ */
+void qms_solver_free(QMSSolver **pctx)
+{
+    QMSSolver *solver = *pctx;
+    QMSSolverPriv *priv;
+
+    if (solver == NULL)
+        return;
+
+    priv = solver->priv;
+    if (priv != NULL) {
+        int i;
+
+        /* per-collocation-point work arrays */
+        for (i = 0; i < ARRAY_ELEMS(priv->interp_coords); i++)
+            free(priv->interp_coords[i]);
+        for (i = 0; i < ARRAY_ELEMS(priv->interp_values); i++)
+            free(priv->interp_values[i]);
+        for (i = 0; i < ARRAY_ELEMS(priv->eq_coeffs); i++)
+            free(priv->eq_coeffs[i]);
+        free(priv->rhs);
+        free(priv->coeff_scale);
+
+        /* the pseudospectral solver is released before the OpenCL handles */
+        qms_pssolve_context_free(&priv->ps_ctx);
+
+#if HAVE_OPENCL
+        if (priv->ocl_queue)
+            clReleaseCommandQueue(priv->ocl_queue);
+        if (priv->ocl_ctx)
+            clReleaseContext(priv->ocl_ctx);
+#endif
+    }
+
+    /* free(NULL) is a no-op, so priv needs no separate check here */
+    free(priv);
+    free(solver->coeffs);
+    free(solver);
+
+    *pctx = NULL;
+}
diff --git a/src/qms_solve.h b/src/qms_solve.h
new file mode 100644
index 0000000..18b2628
--- /dev/null
+++ b/src/qms_solve.h
@@ -0,0 +1,52 @@
+/*
+ * Quasimaximal slicing -- actual solver code
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QMS_SOLVE_H
+#define QMS_SOLVE_H
+
+#include "common.h"
+
+#include "cctk.h"
+
+#include "basis.h"
+
+/* Private solver state; only declared here, not defined in this header. */
+typedef struct QMSSolverPriv QMSSolverPriv;
+
+/*
+ * Public solver context, created by qms_solver_init() and destroyed by
+ * qms_solver_free().
+ */
+typedef struct QMSSolver {
+ /* private data, owned by the solver */
+ QMSSolverPriv *priv;
+
+ /* basis sets for direction 0 and direction 1, set by qms_solver_init() */
+ const BasisSet *basis[2];
+
+ /* number of basis functions in each direction; qms_solver_init() sets
+ * these from basis_order_r and basis_order_z */
+ int nb_coeffs[2];
+ /* number of collocation points in each direction; qms_solver_init() sets
+ * these to the same values as nb_coeffs */
+ int nb_colloc_points[2];
+
+ /* spectral coefficients written by qms_solver_solve(); allocated and
+ * freed by the solver. NOTE(review): presumably stored with the
+ * direction-0 index varying fastest, as the scaling factors applied to
+ * them are -- confirm. */
+ double *coeffs;
+} QMSSolver;
+
+/*
+ * Allocate and initialize a solver context, returning it in *ctx.
+ *
+ * basis_order_r and basis_order_z give the number of basis functions in the
+ * two directions. filter_power is the exponent of the exponential scaling
+ * applied to the solved coefficients. The implementation in this commit
+ * does not reference sf or input_filter_power.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails.
+ */
+int qms_solver_init(QMSSolver **ctx,
+ cGH *cctkGH,
+ int basis_order_r, int basis_order_z,
+ double sf, double filter_power, double input_filter_power);
+
+/*
+ * Free the context and everything it owns and set *ctx to NULL.
+ * Does nothing if *ctx is NULL.
+ */
+void qms_solver_free(QMSSolver **ctx);
+
+/*
+ * Run one solve and store the resulting coefficients in ctx->coeffs.
+ * Returns 0 on success, a negative value on failure.
+ */
+int qms_solver_solve(QMSSolver *ctx);
+
+/* Print the accumulated timing statistics to stderr. */
+void qms_solver_print_stats(QMSSolver *ctx);
+
+#endif /* QMS_SOLVE_H */
diff --git a/src/solve.c b/src/solve.c
deleted file mode 100644
index 0bd56e2..0000000
--- a/src/solve.c
+++ /dev/null
@@ -1,636 +0,0 @@
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <cblas.h>
-#include <lapacke.h>
-
-#include <cl.h>
-#include <clBLAS.h>
-
-#include "qms.h"
-
-static int construct_matrix(MaximalSlicingContext *ms)
-{
- double *mat = ms->mat;
- int idx_coeff, idx_grid;
-
-#pragma omp parallel for
- for (idx_coeff = 0; idx_coeff < ms->nb_coeffs; idx_coeff++)
- for (idx_grid = 0; idx_grid < ms->nb_colloc_points; idx_grid++) {
- const int idx = idx_grid + ms->nb_colloc_points * idx_coeff;
-
- mat[idx] = ms->eq_coeff_00[idx_grid] * ms->basis_val_00[idx] +
- ms->eq_coeff_10[idx_grid] * ms->basis_val_10[idx] +
- ms->eq_coeff_01[idx_grid] * ms->basis_val_01[idx] +
- ms->eq_coeff_11[idx_grid] * ms->basis_val_11[idx] +
- ms->eq_coeff_20[idx_grid] * ms->basis_val_20[idx] +
- ms->eq_coeff_02[idx_grid] * ms->basis_val_02[idx];
- }
-
- return 0;
-}
-
-/* interpolate the cactus gridfunctions onto the pseudospectral grid */
-static int interp_geometry(MaximalSlicingContext *ms)
-{
- int ret;
-
- ret = CCTK_InterpGridArrays(ms->gh, 3, ms->interp_operator, ms->interp_params,
- ms->coord_system, ms->nb_colloc_points, CCTK_VARIABLE_REAL,
- (const void * const *)ms->interp_coords, ARRAY_ELEMS(ms->interp_vars_indices), ms->interp_vars_indices,
- ARRAY_ELEMS(ms->interp_values), ms->interp_value_codes, (void * const *)ms->interp_values_prefilter);
- if (ret < 0)
- CCTK_WARN(0, "Error interpolating");
-
- CCTK_TimerStart("MaximalSlicingAxi_filter_input");
- for (int i = 0; i < ARRAY_ELEMS(ms->interp_values); i++) {
- cblas_dgemv(CblasColMajor, CblasNoTrans, ms->nb_colloc_points, ms->nb_colloc_points, 1.0,
- ms->input_filter, ms->nb_colloc_points, ms->interp_values_prefilter[i], 1, 0.0,
- ms->interp_values[i], 1);
- }
- CCTK_TimerStop("MaximalSlicingAxi_filter_input");
-
- return 0;
-}
-
-static int calc_eq_coeffs(MaximalSlicingContext *ms, double *prhs_max)
-{
- double rhs_max = 0.0;
-//#pragma omp parallel for schedule(dynamic, ms->nb_colloc_points_x) reduction(max : rhs_max)
- for (int i = 0; i < ms->nb_colloc_points; i++) {
- const double x = ms->interp_coords[0][i];
- const double z = ms->interp_coords[2][i];
- const int zaxis = x <= EPS;
-
- double Am[3][3], K[3][3], Km[3][3], Ku[3][3], gtu[3][3];
- double k2, kij_dij_alpha, k_kdot, k3;
-
- const double gtxx = ms->interp_values[I_GTXX][i];
- const double gtyy = ms->interp_values[I_GTYY][i];
- const double gtzz = ms->interp_values[I_GTZZ][i];
- const double gtxy = ms->interp_values[I_GTXY][i];
- const double gtxz = ms->interp_values[I_GTXZ][i];
- const double gtyz = ms->interp_values[I_GTYZ][i];
-
- const double gt[3][3] = {{ gtxx, gtxy, gtxz },
- { gtxy, gtyy, gtyz },
- { gtxz, gtyz, gtzz }};
-
- const double Atxx = ms->interp_values[I_ATXX][i];
- const double Atyy = ms->interp_values[I_ATYY][i];
- const double Atzz = ms->interp_values[I_ATZZ][i];
- const double Atxy = ms->interp_values[I_ATXY][i];
- const double Atxz = ms->interp_values[I_ATXZ][i];
- const double Atyz = ms->interp_values[I_ATYZ][i];
-
- const double phi = ms->interp_values[I_PHI][i];
-
- const double phidot = ms->interp_values[I_PHIDOT][i];
- const double phidot_dx = ms->interp_values[I_PHIDOT_DX][i];
- const double phidot_dz = ms->interp_values[I_PHIDOT_DZ][i];
-
- const double At[3][3] = {{ Atxx, Atxy, Atxz },
- { Atxy, Atyy, Atyz },
- { Atxz, Atyz, Atzz }};
-
- const double trK = ms->interp_values[I_K][i];
- const double kdot_xx = ms->interp_values[I_KDOT_XX][i];
- const double kdot_yy = ms->interp_values[I_KDOT_YY][i];
- const double kdot_zz = ms->interp_values[I_KDOT_ZZ][i];
- const double kdot_xy = ms->interp_values[I_KDOT_XY][i];
- const double kdot_xz = ms->interp_values[I_KDOT_XZ][i];
- const double kdot_yz = ms->interp_values[I_KDOT_YZ][i];
-
- const double kdot[3][3] = {{ kdot_xx, kdot_xy, kdot_xz },
- { kdot_xy, kdot_yy, kdot_yz },
- { kdot_xz, kdot_yz, kdot_zz }};
-
- const double alpha = ms->interp_values[I_ALPHA][i];
- const double dx_alpha = ms->interp_values[I_ALPHA_DX][i];
- const double dz_alpha = ms->interp_values[I_ALPHA_DZ][i];
- const double dxx_alpha = ms->interp_values[I_ALPHA_DXX][i];
- const double dzz_alpha = ms->interp_values[I_ALPHA_DZZ][i];
- const double dxz_alpha = ms->interp_values[I_ALPHA_DXZ][i];
-
- const double dij_alpha[3][3] = {{ dxx_alpha, 0, dxz_alpha },
- { 0, zaxis ? dxx_alpha : dx_alpha / x, 0 },
- { dxz_alpha, 0, dzz_alpha }};
-
- const double Xtx = ms->interp_values[I_XTX][i];
- const double Xtz = ms->interp_values[I_XTZ][i];
-
- const double Xtdot_x = ms->interp_values[I_XTDOT_X][i];
- const double Xtdot_z = ms->interp_values[I_XTDOT_Z][i];
-
- const double det = gtxx * gtyy * gtzz + 2 * gtxy * gtyz * gtxz - gtzz * SQR(gtxy) - SQR(gtxz) * gtyy - gtxx * SQR(gtyz);
-
- // \tilde{γ}^{ij}
- gtu[0][0] = (gtyy * gtzz - SQR(gtyz)) / det;
- gtu[1][1] = (gtxx * gtzz - SQR(gtxz)) / det;
- gtu[2][2] = (gtxx * gtyy - SQR(gtxy)) / det;
- gtu[0][1] = -(gtxy * gtzz - gtyz * gtxz) / det;
- gtu[0][2] = (gtxy * gtyz - gtyy * gtxz) / det;
- gtu[1][2] = -(gtxx * gtyz - gtxy * gtxz) / det;
- gtu[1][0] = gtu[0][1];
- gtu[2][0] = gtu[0][2];
- gtu[2][1] = gtu[1][2];
-
- // K_{ij}
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++)
- K[j][k] = At[j][k] / SQR(phi) + gt[j][k] * trK;
-
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++) {
- double val = 0.0;
- for (int l = 0; l < 3; l++)
- val += SQR(phi) * gtu[j][l] * K[l][k];
- Km[j][k] = val;
- }
-
- // K^{ij}
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++) {
- double val = 0.0;
- for (int l = 0; l < 3; l++)
- val += SQR(phi) * gtu[j][l] * Km[k][l];
- Ku[j][k] = val;
- }
-
- // \tilde{A}_{i}^j
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++) {
- double val = 0.0;
- for (int l = 0; l < 3; l++)
- val += gtu[j][l] * At[l][k];
- Am[j][k] = val;
- }
-
- kij_dij_alpha = 0.0;
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++)
- kij_dij_alpha += Ku[j][k] * dij_alpha[j][k];
-
- k_kdot = 0.0;
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++)
- k_kdot += kdot[j][k] * Ku[j][k];
-
- k3 = 0.0;
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++) {
- double val = 0.0;
- for (int l = 0; l < 3; l++)
- val += Km[k][l] * Ku[l][j];
- k3 += val * K[j][k];
- }
-
- // K_{ij} K^{ij}
- k2 = 0.0;
- for (int j = 0; j < 3; j++)
- for (int k = 0; k < 3; k++)
- k2 += Km[j][k] * Km[k][j];
-
- {
- const double gtuxx = gtu[0][0];
- const double gtuyy = gtu[1][1];
- const double gtuzz = gtu[2][2];
- const double gtuxz = gtu[0][2];
-
- const double phi_dx = ms->interp_values[I_PHI_DX][i];
- const double phi_dz = ms->interp_values[I_PHI_DZ][i];
-
- const double Xtx = ms->interp_values[I_XTX][i];
- const double Xtz = ms->interp_values[I_XTZ][i];
-
- //const double k2 = a2 + SQR(trK) / 3.;
-
- const double trk_dx = ms->interp_values[I_K_DX][i];
- const double trk_dz = ms->interp_values[I_K_DZ][i];
-
- const double betax = ms->interp_values[I_BETAX][i];
- const double betaz = ms->interp_values[I_BETAZ][i];
-
- const double Xx = SQR(phi) * (Xtx + (phi_dx * gtuxx + phi_dz * gtuxz) / phi);
- const double Xz = SQR(phi) * (Xtz + (phi_dx * gtuxz + phi_dz * gtuzz) / phi);
-
- const double Xdot_x = 2 * phi * phidot * Xtx + SQR(phi) * Xtdot_x + phi * (phidot_dx * gtuxx + phidot_dz * gtuxz) -
- phidot * (phi_dx * gtuxx + phi_dz * gtuxz) + 2 * alpha * (phi_dx * Ku[0][0] + phi_dz * Ku[0][2]) / phi;
- const double Xdot_z = 2 * phi * phidot * Xtz + SQR(phi) * Xtdot_z + phi * (phidot_dz * gtuzz + phidot_dx * gtuxz) -
- phidot * (phi_dz * gtuzz + phi_dx * gtuxz) + 2 * alpha * (phi_dz * Ku[2][2] + phi_dx * Ku[0][2]) / phi;
-
- ms->eq_coeff_20[i] = SQR(phi) * (gtuxx + ((x <= EPS) ? gtuyy : 0.0));
- ms->eq_coeff_02[i] = SQR(phi) * gtuzz;
- ms->eq_coeff_11[i] = SQR(phi) * gtuxz * 2;
- ms->eq_coeff_10[i] = -Xx + ((x > EPS) ? SQR(phi) * gtuyy / x : 0.0);
- ms->eq_coeff_01[i] = -Xz;
- ms->eq_coeff_00[i] = -k2;
-
- ms->rhs[i] = -2 * alpha * kij_dij_alpha + Xdot_x * dx_alpha + Xdot_z * dz_alpha +
- 2 * (k_kdot + 2 * alpha * k3) * alpha;
-
- rhs_max = MAX(rhs_max, fabs(ms->rhs[i]));
- }
- }
-
- *prhs_max = rhs_max;
-
- return 0;
-}
-
-// based on the wikipedia article
-// and http://www.netlib.org/templates/matlab/bicgstab.m
-static int solve_bicgstab(BiCGSTABContext *ctx, const int N,
- double *mat, double *rhs, double *x)
-{
- const double rhs_norm = cblas_dnrm2(N, rhs, 1);
-
- double rho, rho_prev = 1.0;
- double omega = 1.0;
- double alpha = 1.0;
-
- double err;
- int i;
-
- double *k = ctx->k;
- double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
- double *res = ctx->res, *res0 = ctx->res0;
-
- // initialize the residual
- memcpy(res, rhs, N * sizeof(*res));
- cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
- mat, N, x, 1, 1.0, res, 1);
-
- memcpy(res0, res, N * sizeof(*res0));
- memcpy(p, res, N * sizeof(*p));
-
-#define MAXITER 16
-#define TOL (1e-15)
- for (i = 0; i < MAXITER; i++) {
- rho = cblas_ddot(N, res, 1, res0, 1);
-
- if (i) {
- double beta = (rho / rho_prev) * (alpha / omega);
-
- cblas_daxpy(N, -omega, v, 1, p, 1);
- cblas_dscal(N, beta, p, 1);
- cblas_daxpy(N, 1, res, 1, p, 1);
- }
-
- cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- k, N, p, 1, 0.0, y, 1);
-
- cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- mat, N, y, 1, 0.0, v, 1);
-
- alpha = rho / cblas_ddot(N, res0, 1, v, 1);
-
- cblas_daxpy(N, -alpha, v, 1, res, 1);
-
- cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- k, N, res, 1, 0.0, z, 1);
- cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- mat, N, z, 1, 0.0, t, 1);
-
- omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
-
- cblas_daxpy(N, alpha, y, 1, x, 1);
- cblas_daxpy(N, omega, z, 1, x, 1);
-
- cblas_daxpy(N, -omega, t, 1, res, 1);
-
- err = cblas_dnrm2(N, res, 1) / rhs_norm;
- if (err < TOL)
- break;
-
- rho_prev = rho;
- }
- if (i == MAXITER)
- return -1;
-
- return i;
-}
-
-static int solve_bicgstab_cl(BiCGSTABContext *ctx, cl_command_queue cl_q,
- const int N, double *mat, double *rhs, cl_mem ocl_x)
-{
- const double rhs_norm = cblas_dnrm2(N, rhs, 1);
-
- double rho, rho_prev = 1.0;
- double omega[2] = { 1.0 };
- double alpha = 1.0;
-
- double err;
- int i;
-
- cl_event events[8];
-
- // the matrix, rhs, k and x are assumed to be already uploaded
- clEnqueueWriteBuffer(cl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]);
- clEnqueueWriteBuffer(cl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
-
- // initialize the residual
- clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
- ctx->cl_mat, 0, N, ocl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
- 1, &cl_q, 2, events, &events[2]);
- clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
- 1, &events[2], &events[3]);
- clEnqueueCopyBuffer(cl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
- 1, &events[2], &events[4]);
-
- clWaitForEvents(5, events);
- // BARRIER
-
-#define MAXITER 16
-#define TOL (1e-15)
- for (i = 0; i < MAXITER; i++) {
- clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
- ctx->cl_tmp, 1, &cl_q, 0, NULL, &events[0]);
- clEnqueueReadBuffer(cl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
- 1, &events[0], NULL);
- // BARRIER
-
- if (i) {
- double beta = (rho / rho_prev) * (alpha / omega[0]);
-
- clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
- 1, &cl_q, 0, NULL, &events[0]);
- clblasDscal(N, beta, ctx->cl_p, 0, 1,
- 1, &cl_q, 1, &events[0], &events[1]);
- clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
- 1, &cl_q, 1, &events[1], &events[0]);
- clWaitForEvents(1, &events[0]);
- // BARRIER
- }
-
- clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
- 1, &cl_q, 0, NULL, &events[0]);
-
- clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
- 1, &cl_q, 1, &events[0], &events[1]);
-
- clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
- ctx->cl_tmp, 1, &cl_q, 1, &events[1], &events[0]);
- clEnqueueReadBuffer(cl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
- 1, &events[0], NULL);
- // BARRIER
-
- alpha = rho / alpha;
-
- clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
- 1, &cl_q, 0, NULL, &events[0]);
-
- clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
- 1, &cl_q, 1, &events[0], &events[1]);
- clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
- ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
- 1, &cl_q, 1, &events[1], &events[0]);
-
- clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
- ctx->cl_tmp, 1, &cl_q, 1, &events[0], &events[1]);
- clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
- ctx->cl_tmp1, 1, &cl_q, 1, &events[0], &events[2]);
-
- clEnqueueReadBuffer(cl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
- 2, &events[1], NULL);
- // BARRIER
-
- omega[0] /= omega[1];
-
- clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ocl_x, 0, 1,
- 1, &cl_q, 0, NULL, &events[0]);
- clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ocl_x, 0, 1,
- 1, &cl_q, 1, &events[0], &events[1]);
-
- clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
- 1, &cl_q, 0, NULL, &events[0]);
- clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
- 1, &cl_q, 1, &events[0], &events[2]);
- clEnqueueReadBuffer(cl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
- 1, &events[2], NULL);
- clWaitForEvents(1, &events[1]);
- // BARRIER
-
- if (err < TOL)
- break;
-
- rho_prev = rho;
- }
- if (i == MAXITER)
- return -1;
-
- return i;
-}
-
-static int lu_invert(const int N, double *mat, double *rhs, int *ipiv)
-{
- char equed = 'N';
- double cond, ferr, berr, rpivot;
-
- double *mat_f, *x;
- int ret = 0;
-#if 1
- LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
- mat, N, ipiv, rhs, N);
- LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat, N, ipiv);
-#else
- mat_f = malloc(SQR(N) * sizeof(*mat_f));
- x = malloc(N * sizeof(*x));
-
- //{
- // int i, j;
- // for (i = 0; i < N; i++) {
- // for (j = 0; j < N; j++)
- // fprintf(stderr, "%+#010.8g\t", mat[i + j * N]);
- // fprintf(stderr, "\n");
- // }
- //}
- //{
- // double *mat_copy = malloc(SQR(N) * sizeof(double));
- // double *svd = malloc(N * sizeof(double));
- // double *rhs_copy = malloc(N * sizeof(double));
- // int rank;
-
- // memcpy(mat_copy, mat, SQR(N) * sizeof(double));
- // memcpy(rhs_copy, rhs, N * sizeof(double));
-
- // LAPACKE_dgelsd(LAPACK_COL_MAJOR, N, N, 1, mat_copy, N, rhs_copy, N,
- // svd, 1e-13, &rank);
-
- // free(mat_copy);
- // for (int i = 0; i < N; i++) {
- // if (i > 5 && i < N - 5)
- // continue;
-
- // fprintf(stderr, "%g\t", svd[i]);
- // }
- // fprintf(stderr, "\n rank %d\n", rank);
- // free(svd);
- // free(rhs_copy);
-
- // if (rank < N)
- // ret = 1;
- //}
-
- //LAPACKE_dgesv(LAPACK_COL_MAJOR, N, 1,
- // mat, N, ipiv, rhs, N);
- LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', N, 1,
- mat, N, mat_f, N, ipiv, &equed, NULL, NULL,
- rhs, N, x, N, &cond, &ferr, &berr, &rpivot);
- LAPACKE_dgetri(LAPACK_COL_MAJOR, N, mat_f, N, ipiv);
- memcpy(rhs, x, N * sizeof(double));
- memcpy(mat, mat_f, SQR(N) * sizeof(double));
-
- fprintf(stderr, "LU factorization solution to a %zdx%zd matrix: "
- "condition number %16.16g; forward error %16.16g backward error %16.16g\n",
- N, N, cond, ferr, berr);
-
- free(mat_f);
- free(x);
-#endif
-
- return ret;
-}
-
-/*
- * Solve the equation
- * D²α - KᵢⱼKⁱʲα = -K
- * for the coefficients of spectral approximation of α:
- * α(ρ, z) = 1 + ΣaᵢⱼTᵢ(ρ)Tⱼ(z)
- * where i = { 0, ... , ms->nb_coeffs_x };
- * j = { 0, ... , ms->nb_coeffs_z };
- * Tᵢ(x) are defined by ms->basis.
- */
-int qms_maximal_solve(MaximalSlicingContext *ms)
-{
- const int N = ms->nb_coeffs;
- double rhs_max;
- int64_t start;
-
- int ret = 0;
-
- /* interpolate the metric values and construct the quantities we'll need */
- CCTK_TimerStart("MaximalSlicingAxi_interp_geometry");
- start = gettime();
- ret = interp_geometry(ms);
- ms->interp_geometry_time += gettime() - start;
- ms->interp_geometry_count++;
- CCTK_TimerStop("MaximalSlicingAxi_interp_geometry");
- if (ret < 0)
- return ret;
-
- CCTK_TimerStart("MaximalSlicingAxi_calc_eq_coeffs");
- start = gettime();
- ret = calc_eq_coeffs(ms, &rhs_max);
- ms->calc_eq_coeffs_time += gettime() - start;
- ms->calc_eq_coeffs_count++;
- CCTK_TimerStop("MaximalSlicingAxi_calc_eq_coeffs");
- if (ret < 0)
- return ret;
-
- /* fill the matrix */
- CCTK_TimerStart("MaximalSlicingAxi_construct_matrix");
- start = gettime();
- ret = construct_matrix(ms);
- ms->construct_matrix_time += gettime() - start;
- ms->construct_matrix_count++;
- CCTK_TimerStop("MaximalSlicingAxi_construct_matrix");
- if (ret < 0)
- return ret;
-
-#if 1
- if (rhs_max < EPS) {
- fprintf(stderr, "zero rhs\n");
- memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
- if (ms->cl_queue) {
- clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, N * sizeof(double),
- ms->coeffs, 0, NULL, NULL);
- }
- return 0;
- }
-#endif
-
- /* solve for the coeffs */
- if (ms->steps_since_inverse < 1024) {
- BiCGSTABContext *b = &ms->bicgstab;
- int64_t start = gettime();
-
- CCTK_TimerStart("MaximalSlicingAxi_solve_BiCGSTAB");
- if (ms->cl_queue) {
- ret = solve_bicgstab_cl(b, ms->cl_queue, ms->nb_coeffs, ms->mat, ms->rhs, ms->ocl_coeffs);
- clEnqueueReadBuffer(ms->cl_queue, ms->ocl_coeffs, 1, 0, sizeof(double) * N,
- ms->coeffs, 0, NULL, NULL);
- } else
- ret = solve_bicgstab(b, ms->nb_coeffs, ms->mat, ms->rhs, ms->coeffs);
- CCTK_TimerStop("MaximalSlicingAxi_solve_BiCGSTAB");
-
- if (ret >= 0) {
- ms->cg_time_total += gettime() - start;
- ms->cg_solve_count++;
- ms->cg_iter_count += ret + 1;
- ms->steps_since_inverse++;
-
-#if 0
- {
- double min, max;
- gsl_vector_memcpy(b->y, ms->rhs);
- cblas_dgemv(CblasColMajor, CblasNoTrans, ms->mat->size1, ms->mat->size2, -1.0,
- ms->mat->data, ms->mat->tda, ms->coeffs->data, 1, 1.0, b->y->data, 1);
- gsl_vector_minmax(b->y, &min, &max);
- if (fabs(min) > 1e-11 || fabs(max) > 1e-11)
- abort();
- }
-#endif
- }
- } else
- ret = -1;
-
- if (ret < 0) {
- double *tmpv;
- double *tmpm;
- int64_t start;
-
- CCTK_TimerStart("MaximalSlicingAxi_solve_LU");
- start = gettime();
-
- ret = lu_invert(ms->nb_coeffs, ms->mat, ms->rhs, ms->ipiv);
- ms->lu_solves_time += gettime() - start;
- ms->lu_solves_count++;
- CCTK_TimerStop("MaximalSlicingAxi_solve_LU");
-
- tmpv = ms->coeffs;
- ms->coeffs = ms->rhs;
- ms->rhs = tmpv;
-
- tmpm = ms->mat;
- ms->mat = ms->bicgstab.k;
- ms->bicgstab.k = tmpm;
-
- if (ret == 1) {
- memset(ms->coeffs, 0, sizeof(*ms->coeffs) * ms->nb_coeffs);
- ms->coeffs[0] = 1.0;
- }
-
- if (ms->cl_queue) {
- cl_event events[2];
- clEnqueueWriteBuffer(ms->cl_queue, ms->bicgstab.cl_k, 0, 0, N * N * sizeof(double),
- ms->bicgstab.k, 0, NULL, &events[0]);
- clEnqueueWriteBuffer(ms->cl_queue, ms->ocl_coeffs, 0, 0, N * sizeof(double),
- ms->coeffs, 0, NULL, &events[1]);
- clWaitForEvents(2, events);
- }
-
- ms->steps_since_inverse = 0;
- }
-
- for (int i = 0; i < N; i++)
- ms->coeffs[i] *= ms->coeff_scale[i];
-
- return ret;
-}
diff --git a/src/solve.cl b/src/solve.cl
deleted file mode 100644
index 125133a..0000000
--- a/src/solve.cl
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma OPENCL EXTENSION all : enable
-
-__kernel void construct_matrix_polar(__global double *mat,
- __global double *coeff_00,
- __global double *coeff_10,
- __global double *coeff_01,
- __global double *coeff_11,
- __global double *coeff_20,
- __global double *coeff_02,
- __global double *basis_00,
- __global double *basis_10,
- __global double *basis_01,
- __global double *basis_11,
- __global double *basis_20,
- __global double *basis_02,
- int nb_coeffs, int nb_colloc_points)
-{
- unsigned int idx_coeff = get_global_id(0);
- unsigned int idx_grid = get_global_id(1);
- unsigned int idx = idx_grid + nb_colloc_points * idx_coeff;
-
- mat[idx] = coeff_00[idx_grid] * basis_00[idx] +
- coeff_10[idx_grid] * basis_10[idx] +
- coeff_01[idx_grid] * basis_01[idx] +
- coeff_11[idx_grid] * basis_11[idx] +
- coeff_20[idx_grid] * basis_20[idx] +
- coeff_02[idx_grid] * basis_02[idx];
-}
diff --git a/src/solve_cl.c b/src/solve_cl.c
deleted file mode 100644
index f138631..0000000
--- a/src/solve_cl.c
+++ /dev/null
@@ -1,28 +0,0 @@
-"#pragma OPENCL EXTENSION all : enable\n"
-"\n"
-"__kernel void construct_matrix_polar(__global double *mat,\n"
-" __global double *coeff_00,\n"
-" __global double *coeff_10,\n"
-" __global double *coeff_01,\n"
-" __global double *coeff_11,\n"
-" __global double *coeff_20,\n"
-" __global double *coeff_02,\n"
-" __global double *basis_00,\n"
-" __global double *basis_10,\n"
-" __global double *basis_01,\n"
-" __global double *basis_11,\n"
-" __global double *basis_20,\n"
-" __global double *basis_02,\n"
-" int nb_coeffs, int nb_colloc_points)\n"
-"{\n"
-" unsigned int idx_coeff = get_global_id(0);\n"
-" unsigned int idx_grid = get_global_id(1);\n"
-" unsigned int idx = idx_grid + nb_colloc_points * idx_coeff;\n"
-"\n"
-" mat[idx] = coeff_00[idx_grid] * basis_00[idx] +\n"
-" coeff_10[idx_grid] * basis_10[idx] +\n"
-" coeff_01[idx_grid] * basis_01[idx] +\n"
-" coeff_11[idx_grid] * basis_11[idx] +\n"
-" coeff_20[idx_grid] * basis_20[idx] +\n"
-" coeff_02[idx_grid] * basis_02[idx];\n"
-"}\n"