aboutsummaryrefslogtreecommitdiff
path: root/bicgstab.c
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2017-11-16 13:11:07 +0100
committerAnton Khirnov <anton@khirnov.net>2017-11-19 16:30:14 +0100
commit0007a0b0c11fa7c12b228883453368f105a4324b (patch)
treea5ac8016c58c13668bf87931dd921bea933cf07f /bicgstab.c
Initial commit.
The following code is present: * the basis API * the BiCGSTAB solver * the pseudospectral linear system solver * helper APIs: - threadpool - logging - cpuid
Diffstat (limited to 'bicgstab.c')
-rw-r--r--bicgstab.c411
1 files changed, 411 insertions, 0 deletions
diff --git a/bicgstab.c b/bicgstab.c
new file mode 100644
index 0000000..9b4d330
--- /dev/null
+++ b/bicgstab.c
@@ -0,0 +1,411 @@
+/*
+ * BiCGStab iterative linear system solver
+ * Copyright (C) 2016 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if HAVE_OPENCL
+#include <cl.h>
+#include <clBLAS.h>
+#endif
+
+#include <cblas.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bicgstab.h"
+#include "common.h"
+
+#define BICGSTAB_MAXITER 16
+#define BICGSTAB_TOL (1e-15)
+
+struct BiCGStabContext {
+ int N;
+
+ double *x;
+ double *p, *v, *y, *z, *t;
+ double *res, *res0;
+ double *k;
+
+#if HAVE_OPENCL
+ cl_context ocl_ctx;
+ cl_command_queue ocl_queue;
+
+ cl_mem cl_x;
+ cl_mem cl_p, cl_v, cl_y, cl_z, cl_t;
+ cl_mem cl_res, cl_res0;
+ cl_mem cl_k, cl_mat;
+ cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1;
+ cl_mem cl_tmp, cl_tmp1;
+#endif
+};
+
+#if HAVE_OPENCL
+static int solve_cl(BiCGStabContext *ctx,
+ const double *mat, const double *rhs, double *x)
+{
+ cl_command_queue ocl_q = ctx->ocl_queue;
+ const int N = ctx->N;
+ const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+ double rho, rho_prev = 1.0;
+ double omega[2] = { 1.0 };
+ double alpha = 1.0;
+
+ double err;
+ int i;
+
+ cl_event events[8];
+
+ // upload the matrix and RHS
+ clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]);
+ clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]);
+
+ // initialize the residual
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+ ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1,
+ 1, &ocl_q, 2, events, &events[2]);
+ clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double),
+ 1, &events[2], &events[3]);
+ clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double),
+ 1, &events[2], &events[4]);
+
+ clWaitForEvents(5, events);
+ // BARRIER
+
+ for (i = 0; i < MAXITER; i++) {
+ clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1,
+ ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]);
+ clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho,
+ 1, &events[0], NULL);
+ // BARRIER
+
+ if (i) {
+ double beta = (rho / rho_prev) * (alpha / omega[0]);
+
+ clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1,
+ 1, &ocl_q, 0, NULL, &events[0]);
+ clblasDscal(N, beta, ctx->cl_p, 0, 1,
+ 1, &ocl_q, 1, &events[0], &events[1]);
+ clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1,
+ 1, &ocl_q, 1, &events[1], &events[0]);
+ clWaitForEvents(1, &events[0]);
+ // BARRIER
+ }
+
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1,
+ 1, &ocl_q, 0, NULL, &events[0]);
+
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1,
+ 1, &ocl_q, 1, &events[0], &events[1]);
+
+ clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1,
+ ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]);
+ clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha,
+ 1, &events[0], NULL);
+ // BARRIER
+
+ alpha = rho / alpha;
+
+ clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1,
+ 1, &ocl_q, 0, NULL, &events[0]);
+
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1,
+ 1, &ocl_q, 1, &events[0], &events[1]);
+ clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1,
+ 1, &ocl_q, 1, &events[1], &events[0]);
+
+ clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+ ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]);
+ clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1,
+ ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]);
+
+ clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega,
+ 2, &events[1], NULL);
+ // BARRIER
+
+ omega[0] /= omega[1];
+
+ clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1,
+ 1, &ocl_q, 0, NULL, &events[0]);
+ clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1,
+ 1, &ocl_q, 1, &events[0], &events[1]);
+
+ clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1,
+ 1, &ocl_q, 0, NULL, &events[0]);
+ clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1,
+ 1, &ocl_q, 1, &events[0], &events[2]);
+ clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err,
+ 1, &events[2], NULL);
+ clWaitForEvents(1, &events[1]);
+ // BARRIER
+
+ if (err < BICGSTAB_TOL)
+ break;
+
+ rho_prev = rho;
+ }
+ if (i == BICGSTAB_MAXITER)
+ return -1;
+
+ clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N,
+ x, 0, NULL, NULL);
+ return i;
+}
+#endif
+
+// based on the wikipedia article
+// and http://www.netlib.org/templates/matlab/bicgstab.m
+static int solve_sw(BiCGStabContext *ctx,
+ const double *mat, const double *rhs, double *x)
+{
+ const int N = ctx->N;
+ const double rhs_norm = cblas_dnrm2(N, rhs, 1);
+
+ double rho, rho_prev = 1.0;
+ double omega = 1.0;
+ double alpha = 1.0;
+
+ double err;
+ int i;
+
+ double *k = ctx->k;
+ double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t;
+ double *res = ctx->res, *res0 = ctx->res0;
+
+ // initialize the residual
+ memcpy(res, rhs, N * sizeof(*res));
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0,
+ mat, N, ctx->x, 1, 1.0, res, 1);
+
+ memcpy(res0, res, N * sizeof(*res0));
+ memcpy(p, res, N * sizeof(*p));
+
+ for (i = 0; i < BICGSTAB_MAXITER; i++) {
+ rho = cblas_ddot(N, res, 1, res0, 1);
+
+ if (i) {
+ double beta = (rho / rho_prev) * (alpha / omega);
+
+ cblas_daxpy(N, -omega, v, 1, p, 1);
+ cblas_dscal(N, beta, p, 1);
+ cblas_daxpy(N, 1, res, 1, p, 1);
+ }
+
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ k, N, p, 1, 0.0, y, 1);
+
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ mat, N, y, 1, 0.0, v, 1);
+
+ alpha = rho / cblas_ddot(N, res0, 1, v, 1);
+
+ cblas_daxpy(N, -alpha, v, 1, res, 1);
+
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ k, N, res, 1, 0.0, z, 1);
+ cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0,
+ mat, N, z, 1, 0.0, t, 1);
+
+ omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1);
+
+ cblas_daxpy(N, alpha, y, 1, ctx->x, 1);
+ cblas_daxpy(N, omega, z, 1, ctx->x, 1);
+
+ cblas_daxpy(N, -omega, t, 1, res, 1);
+
+ err = cblas_dnrm2(N, res, 1) / rhs_norm;
+ if (err < BICGSTAB_TOL)
+ break;
+
+ rho_prev = rho;
+ }
+ if (i == BICGSTAB_MAXITER)
+ return -1;
+
+ memcpy(x, ctx->x, sizeof(*x) * ctx->N);
+
+ return i;
+}
+
+int tdi_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x)
+{
+ int ret;
+
+#if HAVE_OPENCL
+ if (ctx->ocl_ctx)
+ ret = solve_cl(ctx, mat, rhs, x);
+ else
+#endif
+ ret = solve_sw(ctx, mat, rhs, x);
+ if (ret < 0)
+ return ret;
+
+#if MD_VERIFY
+ {
+ int i;
+ double *y;
+
+ y = malloc(sizeof(*y) * ctx->N);
+ memcpy(y, rhs, sizeof(*y) * ctx->N);
+ cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0,
+ mat, ctx->N, x, 1, 1.0, y, 1);
+ i = cblas_idamax(ctx->N, y, 1);
+ if (fabs(y[i]) > 1e-11)
+ abort();
+ }
+#endif
+
+ return ret;
+}
+
+int tdi_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0)
+{
+#if HAVE_OPENCL
+ if (ctx->ocl_ctx) {
+ cl_event events[2];
+ clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double),
+ k, 0, NULL, &events[0]);
+ clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double),
+ x0, 0, NULL, &events[1]);
+ clWaitForEvents(2, events);
+ } else
+#endif
+ {
+ memcpy(ctx->x, x0, ctx->N * sizeof(*x0));
+ memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k));
+ }
+
+ return 0;
+}
+
+int tdi_bicgstab_context_alloc(BiCGStabContext **pctx, int N,
+ cl_context ocl_ctx, cl_command_queue ocl_q)
+{
+ BiCGStabContext *ctx;
+ int ret = 0;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->N = N;
+
+#if HAVE_OPENCL
+ if (ocl_ctx) {
+ ctx->ocl_ctx = ocl_ctx;
+ ctx->ocl_queue = ocl_q;
+
+#define ALLOC(dst, size) \
+do { \
+ ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \
+ if (ret != CL_SUCCESS) \
+ goto fail; \
+} while (0)
+
+ ALLOC(cl_x, N * sizeof(double));
+ ALLOC(cl_p, N * sizeof(double));
+ ALLOC(cl_v, N * sizeof(double));
+ ALLOC(cl_y, N * sizeof(double));
+ ALLOC(cl_z, N * sizeof(double));
+ ALLOC(cl_t, N * sizeof(double));
+ ALLOC(cl_res, N * sizeof(double));
+ ALLOC(cl_res0, N * sizeof(double));
+ ALLOC(cl_tmp, N * sizeof(double));
+ ALLOC(cl_tmp1, N * 2 * sizeof(double));
+
+ ALLOC(cl_k, N * N * sizeof(double));
+ ALLOC(cl_mat, N * N * sizeof(double));
+
+ ALLOC(cl_rho, sizeof(double));
+ ALLOC(cl_alpha, sizeof(double));
+ ALLOC(cl_beta, sizeof(double));
+ ALLOC(cl_omega, 2 * sizeof(double));
+ ALLOC(cl_omega1, sizeof(double));
+ } else
+#endif
+ {
+ ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N);
+ ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N);
+ }
+
+fail:
+ if (ret) {
+ tdi_bicgstab_context_free(&ctx);
+ return -ENOMEM;
+ }
+
+ *pctx = ctx;
+ return 0;
+}
+
+void tdi_bicgstab_context_free(BiCGStabContext **pctx)
+{
+ BiCGStabContext *ctx = *pctx;
+
+ if (!ctx)
+ return;
+
+ free(ctx->x);
+ free(ctx->p);
+ free(ctx->v);
+ free(ctx->y);
+ free(ctx->z);
+ free(ctx->t);
+ free(ctx->res);
+ free(ctx->res0);
+ free(ctx->k);
+
+#if HAVE_OPENCL
+ if (ctx->ocl_ctx) {
+ clReleaseMemObject(ctx->cl_x);
+ clReleaseMemObject(ctx->cl_p);
+ clReleaseMemObject(ctx->cl_v);
+ clReleaseMemObject(ctx->cl_y);
+ clReleaseMemObject(ctx->cl_z);
+ clReleaseMemObject(ctx->cl_t);
+ clReleaseMemObject(ctx->cl_res);
+ clReleaseMemObject(ctx->cl_res0);
+ clReleaseMemObject(ctx->cl_tmp);
+ clReleaseMemObject(ctx->cl_tmp1);
+
+ clReleaseMemObject(ctx->cl_k);
+ clReleaseMemObject(ctx->cl_mat);
+
+ clReleaseMemObject(ctx->cl_rho);
+ clReleaseMemObject(ctx->cl_alpha);
+ clReleaseMemObject(ctx->cl_beta);
+ clReleaseMemObject(ctx->cl_omega);
+ clReleaseMemObject(ctx->cl_omega1);
+ }
+#endif
+
+ free(ctx);
+ *pctx = NULL;
+}