From 0007a0b0c11fa7c12b228883453368f105a4324b Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Thu, 16 Nov 2017 13:11:07 +0100 Subject: Initial commit. The following code is present: * the basis API * the BiCGSTAB solver * the pseudospectral linear system solver * helper APIs: - threadpool - logging - cpuid --- bicgstab.c | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 411 insertions(+) create mode 100644 bicgstab.c (limited to 'bicgstab.c') diff --git a/bicgstab.c b/bicgstab.c new file mode 100644 index 0000000..9b4d330 --- /dev/null +++ b/bicgstab.c @@ -0,0 +1,411 @@ +/* + * BiCGStab iterative linear system solver + * Copyright (C) 2016 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "config.h" + +#if HAVE_OPENCL +#include +#include +#endif + +#include +#include +#include +#include + +#include "bicgstab.h" +#include "common.h" + +#define BICGSTAB_MAXITER 16 +#define BICGSTAB_TOL (1e-15) + +struct BiCGStabContext { + int N; + + double *x; + double *p, *v, *y, *z, *t; + double *res, *res0; + double *k; + +#if HAVE_OPENCL + cl_context ocl_ctx; + cl_command_queue ocl_queue; + + cl_mem cl_x; + cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; + cl_mem cl_res, cl_res0; + cl_mem cl_k, cl_mat; + cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; + cl_mem cl_tmp, cl_tmp1; +#endif +}; + +#if HAVE_OPENCL +static int solve_cl(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + cl_command_queue ocl_q = ctx->ocl_queue; + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega[2] = { 1.0 }; + double alpha = 1.0; + + double err; + int i; + + cl_event events[8]; + + // upload the matrix and RHS + clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); + + // initialize the residual + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, + 1, &ocl_q, 2, events, &events[2]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), + 1, &events[2], &events[3]); + clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), + 1, &events[2], &events[4]); + + clWaitForEvents(5, events); + // BARRIER + + for (i = 0; i < MAXITER; i++) { + clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, + 1, &events[0], NULL); + // BARRIER + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega[0]); + + clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDscal(N, beta, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + clWaitForEvents(1, &events[0]); + // BARRIER + } + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]); + clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, + 1, &events[0], NULL); + // BARRIER + + alpha = rho / alpha; + + clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, + 1, &ocl_q, 1, &events[1], &events[0]); + + clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]); + clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, + ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]); + + clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, + 2, &events[1], NULL); + // BARRIER + + omega[0] /= omega[1]; + + clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1, + 1, &ocl_q, 1, &events[0], &events[1]); + + clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, + 1, &ocl_q, 0, NULL, &events[0]); + clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, + 1, &ocl_q, 1, &events[0], &events[2]); + clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, + 1, &events[2], NULL); + clWaitForEvents(1, &events[1]); + // BARRIER + + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N, + x, 0, NULL, NULL); + return i; +} +#endif + +// based on the wikipedia article +// and http://www.netlib.org/templates/matlab/bicgstab.m +static int solve_sw(BiCGStabContext *ctx, + const double *mat, const double *rhs, double *x) +{ + const int N = ctx->N; + const double rhs_norm = cblas_dnrm2(N, rhs, 1); + + double rho, rho_prev = 1.0; + double omega = 1.0; + double alpha = 1.0; + + double err; + int i; + + double *k = ctx->k; + double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; + double *res = ctx->res, *res0 = ctx->res0; + + // initialize the residual + memcpy(res, rhs, N * sizeof(*res)); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, + mat, N, ctx->x, 1, 1.0, res, 1); + + memcpy(res0, res, N * sizeof(*res0)); + memcpy(p, res, N * sizeof(*p)); + + for (i = 0; i < BICGSTAB_MAXITER; i++) { + rho = cblas_ddot(N, res, 1, res0, 1); + + if (i) { + double beta = (rho / rho_prev) * (alpha / omega); + + cblas_daxpy(N, -omega, v, 1, p, 1); + cblas_dscal(N, beta, p, 1); + cblas_daxpy(N, 1, res, 1, p, 1); + } + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, p, 1, 0.0, y, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, y, 1, 0.0, v, 1); + + alpha = rho / cblas_ddot(N, res0, 1, v, 1); + + cblas_daxpy(N, -alpha, v, 1, res, 1); + + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + k, N, res, 1, 0.0, z, 1); + cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, + mat, N, z, 1, 0.0, t, 1); + + omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); + + cblas_daxpy(N, alpha, y, 1, ctx->x, 1); + cblas_daxpy(N, omega, z, 1, ctx->x, 1); + + cblas_daxpy(N, -omega, t, 1, res, 1); + + err = cblas_dnrm2(N, res, 1) / rhs_norm; + if (err < BICGSTAB_TOL) + break; + + rho_prev = rho; + } + if (i == BICGSTAB_MAXITER) + return -1; + + memcpy(x, ctx->x, sizeof(*x) * ctx->N); + + return i; +} + +int tdi_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) +{ + int ret; + +#if HAVE_OPENCL + if (ctx->ocl_ctx) + ret = solve_cl(ctx, mat, rhs, x); + else +#endif + ret = solve_sw(ctx, mat, rhs, x); + if (ret < 0) + return ret; + +#if MD_VERIFY + { + int i; + double *y; + + y = malloc(sizeof(*y) * ctx->N); + memcpy(y, rhs, sizeof(*y) * ctx->N); + cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0, + mat, ctx->N, x, 1, 1.0, y, 1); + i = cblas_idamax(ctx->N, y, 1); + if (fabs(y[i]) > 1e-11) + abort(); + } +#endif + + return ret; +} + +int tdi_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0) +{ +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + cl_event events[2]; + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double), + k, 0, NULL, &events[0]); + clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double), + x0, 0, NULL, &events[1]); + clWaitForEvents(2, events); + } else +#endif + { + memcpy(ctx->x, x0, ctx->N * sizeof(*x0)); + memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k)); + } + + return 0; +} + +int tdi_bicgstab_context_alloc(BiCGStabContext **pctx, int N, + cl_context ocl_ctx, cl_command_queue ocl_q) +{ + BiCGStabContext *ctx; + int ret = 0; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return -ENOMEM; + + ctx->N = N; + +#if HAVE_OPENCL + if (ocl_ctx) { + ctx->ocl_ctx = ocl_ctx; + ctx->ocl_queue = ocl_q; + +#define ALLOC(dst, size) \ +do { \ + ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \ + if (ret != CL_SUCCESS) \ + goto fail; \ +} while (0) + + ALLOC(cl_x, N * sizeof(double)); + ALLOC(cl_p, N * sizeof(double)); + ALLOC(cl_v, N * sizeof(double)); + ALLOC(cl_y, N * sizeof(double)); + ALLOC(cl_z, N * sizeof(double)); + ALLOC(cl_t, N * sizeof(double)); + ALLOC(cl_res, N * sizeof(double)); + ALLOC(cl_res0, N * sizeof(double)); + ALLOC(cl_tmp, N * sizeof(double)); + ALLOC(cl_tmp1, N * 2 * sizeof(double)); + + ALLOC(cl_k, N * N * sizeof(double)); + ALLOC(cl_mat, N * N * sizeof(double)); + + ALLOC(cl_rho, sizeof(double)); + ALLOC(cl_alpha, sizeof(double)); + ALLOC(cl_beta, sizeof(double)); + ALLOC(cl_omega, 2 * sizeof(double)); + ALLOC(cl_omega1, sizeof(double)); + } else +#endif + { + ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N); + ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N); + } + +fail: + if (ret) { + tdi_bicgstab_context_free(&ctx); + return -ENOMEM; + } + + *pctx = ctx; + return 0; +} + +void tdi_bicgstab_context_free(BiCGStabContext **pctx) +{ + BiCGStabContext *ctx = *pctx; + + if (!ctx) + return; + + free(ctx->x); + free(ctx->p); + free(ctx->v); + free(ctx->y); + free(ctx->z); + free(ctx->t); + free(ctx->res); + free(ctx->res0); + free(ctx->k); + +#if HAVE_OPENCL + if (ctx->ocl_ctx) { + clReleaseMemObject(ctx->cl_x); + clReleaseMemObject(ctx->cl_p); + clReleaseMemObject(ctx->cl_v); + clReleaseMemObject(ctx->cl_y); + clReleaseMemObject(ctx->cl_z); + clReleaseMemObject(ctx->cl_t); + clReleaseMemObject(ctx->cl_res); + clReleaseMemObject(ctx->cl_res0); + clReleaseMemObject(ctx->cl_tmp); + clReleaseMemObject(ctx->cl_tmp1); + + clReleaseMemObject(ctx->cl_k); + clReleaseMemObject(ctx->cl_mat); + + clReleaseMemObject(ctx->cl_rho); + clReleaseMemObject(ctx->cl_alpha); + clReleaseMemObject(ctx->cl_beta); + clReleaseMemObject(ctx->cl_omega); + clReleaseMemObject(ctx->cl_omega1); + } +#endif + + free(ctx); + *pctx = NULL; +} -- cgit v1.2.3