/* * BiCGStab iterative linear system solver * Copyright (C) 2016 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "config.h" #if HAVE_OPENCL #include #include #endif #include #include #include #include #include "bicgstab.h" #include "common.h" #define BICGSTAB_MAXITER 16 #define BICGSTAB_TOL (1e-15) struct BiCGStabContext { int N; double *x; double *p, *v, *y, *z, *t; double *res, *res0; double *k; #if HAVE_OPENCL cl_context ocl_ctx; cl_command_queue ocl_queue; cl_mem cl_x; cl_mem cl_p, cl_v, cl_y, cl_z, cl_t; cl_mem cl_res, cl_res0; cl_mem cl_k, cl_mat; cl_mem cl_rho, cl_alpha, cl_beta, cl_omega, cl_omega1; cl_mem cl_tmp, cl_tmp1; #endif }; #if HAVE_OPENCL static int solve_cl(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) { cl_command_queue ocl_q = ctx->ocl_queue; const int N = ctx->N; const double rhs_norm = cblas_dnrm2(N, rhs, 1); double rho, rho_prev = 1.0; double omega[2] = { 1.0 }; double alpha = 1.0; double err; int i; cl_event events[8]; // upload the matrix and RHS clEnqueueWriteBuffer(ocl_q, ctx->cl_res, 0, 0, N * sizeof(double), rhs, 0, NULL, &events[0]); clEnqueueWriteBuffer(ocl_q, ctx->cl_mat, 0, 0, N * N * sizeof(double), mat, 0, NULL, &events[1]); // initialize the residual clblasDgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, ctx->cl_mat, 0, N, ctx->cl_x, 0, 1, 1.0, ctx->cl_res, 0, 1, 1, &ocl_q, 2, events, &events[2]); clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_res0, 0, 0, N * sizeof(double), 1, &events[2], &events[3]); clEnqueueCopyBuffer(ocl_q, ctx->cl_res, ctx->cl_p, 0, 0, N * sizeof(double), 1, &events[2], &events[4]); clWaitForEvents(5, events); // BARRIER for (i = 0; i < MAXITER; i++) { clblasDdot(N, ctx->cl_rho, 0, ctx->cl_res, 0, 1, ctx->cl_res0, 0, 1, ctx->cl_tmp, 1, &ocl_q, 0, NULL, &events[0]); clEnqueueReadBuffer(ocl_q, ctx->cl_rho, 1, 0, sizeof(double), &rho, 1, &events[0], NULL); // BARRIER if (i) { double beta = (rho / rho_prev) * (alpha / omega[0]); clblasDaxpy(N, -omega[0], ctx->cl_v, 0, 1, ctx->cl_p, 0, 1, 1, &ocl_q, 0, NULL, &events[0]); clblasDscal(N, beta, ctx->cl_p, 0, 1, 1, &ocl_q, 1, &events[0], &events[1]); clblasDaxpy(N, 1, ctx->cl_res, 0, 1, ctx->cl_p, 0, 1, 1, &ocl_q, 1, &events[1], &events[0]); clWaitForEvents(1, &events[0]); // BARRIER } clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_k, 0, N, ctx->cl_p, 0, 1, 0.0, ctx->cl_y, 0, 1, 1, &ocl_q, 0, NULL, &events[0]); clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_mat, 0, N, ctx->cl_y, 0, 1, 0.0, ctx->cl_v, 0, 1, 1, &ocl_q, 1, &events[0], &events[1]); clblasDdot(N, ctx->cl_alpha, 0, ctx->cl_res0, 0, 1, ctx->cl_v, 0, 1, ctx->cl_tmp, 1, &ocl_q, 1, &events[1], &events[0]); clEnqueueReadBuffer(ocl_q, ctx->cl_alpha, 1, 0, sizeof(double), &alpha, 1, &events[0], NULL); // BARRIER alpha = rho / alpha; clblasDaxpy(N, -alpha, ctx->cl_v, 0, 1, ctx->cl_res, 0, 1, 1, &ocl_q, 0, NULL, &events[0]); clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_k, 0, N, ctx->cl_res, 0, 1, 0.0, ctx->cl_z, 0, 1, 1, &ocl_q, 1, &events[0], &events[1]); clblasDgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, ctx->cl_mat, 0, N, ctx->cl_z, 0, 1, 0.0, ctx->cl_t, 0, 1, 1, &ocl_q, 1, &events[1], &events[0]); clblasDdot(N, ctx->cl_omega, 0, ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, ctx->cl_tmp, 1, &ocl_q, 1, &events[0], &events[1]); clblasDdot(N, ctx->cl_omega, 1, ctx->cl_t, 0, 1, ctx->cl_t, 0, 1, ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]); clEnqueueReadBuffer(ocl_q, ctx->cl_omega, 1, 0, sizeof(omega), omega, 2, &events[1], NULL); // BARRIER omega[0] /= omega[1]; clblasDaxpy(N, alpha, ctx->cl_y, 0, 1, ctx->cl_x, 0, 1, 1, &ocl_q, 0, NULL, &events[0]); clblasDaxpy(N, omega[0], ctx->cl_z, 0, 1, ctx->cl_x, 0, 1, 1, &ocl_q, 1, &events[0], &events[1]); clblasDaxpy(N, -omega[0], ctx->cl_t, 0, 1, ctx->cl_res, 0, 1, 1, &ocl_q, 0, NULL, &events[0]); clblasDnrm2(N, ctx->cl_tmp, 0, ctx->cl_res, 0, 1, ctx->cl_tmp1, 1, &ocl_q, 1, &events[0], &events[2]); clEnqueueReadBuffer(ocl_q, ctx->cl_tmp, 1, 0, sizeof(double), &err, 1, &events[2], NULL); clWaitForEvents(1, &events[1]); // BARRIER if (err < BICGSTAB_TOL) break; rho_prev = rho; } if (i == BICGSTAB_MAXITER) return -1; clEnqueueReadBuffer(ocl_q, ctx->cl_x, 1, 0, sizeof(double) * N, x, 0, NULL, NULL); return i; } #endif // based on the wikipedia article // and http://www.netlib.org/templates/matlab/bicgstab.m static int solve_sw(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) { const int N = ctx->N; const double rhs_norm = cblas_dnrm2(N, rhs, 1); double rho, rho_prev = 1.0; double omega = 1.0; double alpha = 1.0; double err; int i; double *k = ctx->k; double *p = ctx->p, *v = ctx->v, *y = ctx->y, *z = ctx->z, *t = ctx->t; double *res = ctx->res, *res0 = ctx->res0; // initialize the residual memcpy(res, rhs, N * sizeof(*res)); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, -1.0, mat, N, ctx->x, 1, 1.0, res, 1); memcpy(res0, res, N * sizeof(*res0)); memcpy(p, res, N * sizeof(*p)); for (i = 0; i < BICGSTAB_MAXITER; i++) { rho = cblas_ddot(N, res, 1, res0, 1); if (i) { double beta = (rho / rho_prev) * (alpha / omega); cblas_daxpy(N, -omega, v, 1, p, 1); cblas_dscal(N, beta, p, 1); cblas_daxpy(N, 1, res, 1, p, 1); } cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, k, N, p, 1, 0.0, y, 1); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, mat, N, y, 1, 0.0, v, 1); alpha = rho / cblas_ddot(N, res0, 1, v, 1); cblas_daxpy(N, -alpha, v, 1, res, 1); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, k, N, res, 1, 0.0, z, 1); cblas_dgemv(CblasColMajor, CblasNoTrans, N, N, 1.0, mat, N, z, 1, 0.0, t, 1); omega = cblas_ddot(N, t, 1, res, 1) / cblas_ddot(N, t, 1, t, 1); cblas_daxpy(N, alpha, y, 1, ctx->x, 1); cblas_daxpy(N, omega, z, 1, ctx->x, 1); cblas_daxpy(N, -omega, t, 1, res, 1); err = cblas_dnrm2(N, res, 1) / rhs_norm; if (err < BICGSTAB_TOL) break; rho_prev = rho; } if (i == BICGSTAB_MAXITER) return -1; memcpy(x, ctx->x, sizeof(*x) * ctx->N); return i; } int tdi_bicgstab_solve(BiCGStabContext *ctx, const double *mat, const double *rhs, double *x) { int ret; #if HAVE_OPENCL if (ctx->ocl_ctx) ret = solve_cl(ctx, mat, rhs, x); else #endif ret = solve_sw(ctx, mat, rhs, x); if (ret < 0) return ret; #if MD_VERIFY { int i; double *y; y = malloc(sizeof(*y) * ctx->N); memcpy(y, rhs, sizeof(*y) * ctx->N); cblas_dgemv(CblasColMajor, CblasNoTrans, ctx->N, ctx->N, -1.0, mat, ctx->N, x, 1, 1.0, y, 1); i = cblas_idamax(ctx->N, y, 1); if (fabs(y[i]) > 1e-11) abort(); } #endif return ret; } int tdi_bicgstab_init(BiCGStabContext *ctx, const double *k, const double *x0) { #if HAVE_OPENCL if (ctx->ocl_ctx) { cl_event events[2]; clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_k, 0, 0, ctx->N * ctx->N * sizeof(double), k, 0, NULL, &events[0]); clEnqueueWriteBuffer(ctx->ocl_queue, ctx->cl_x, 0, 0, ctx->N * sizeof(double), x0, 0, NULL, &events[1]); clWaitForEvents(2, events); } else #endif { memcpy(ctx->x, x0, ctx->N * sizeof(*x0)); memcpy(ctx->k, k, ctx->N * ctx->N * sizeof(*k)); } return 0; } int tdi_bicgstab_context_alloc(BiCGStabContext **pctx, int N, cl_context ocl_ctx, cl_command_queue ocl_q) { BiCGStabContext *ctx; int ret = 0; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return -ENOMEM; ctx->N = N; #if HAVE_OPENCL if (ocl_ctx) { ctx->ocl_ctx = ocl_ctx; ctx->ocl_queue = ocl_q; #define ALLOC(dst, size) \ do { \ ctx->dst = clCreateBuffer(ocl_ctx, 0, size, NULL, &ret); \ if (ret != CL_SUCCESS) \ goto fail; \ } while (0) ALLOC(cl_x, N * sizeof(double)); ALLOC(cl_p, N * sizeof(double)); ALLOC(cl_v, N * sizeof(double)); ALLOC(cl_y, N * sizeof(double)); ALLOC(cl_z, N * sizeof(double)); ALLOC(cl_t, N * sizeof(double)); ALLOC(cl_res, N * sizeof(double)); ALLOC(cl_res0, N * sizeof(double)); ALLOC(cl_tmp, N * sizeof(double)); ALLOC(cl_tmp1, N * 2 * sizeof(double)); ALLOC(cl_k, N * N * sizeof(double)); ALLOC(cl_mat, N * N * sizeof(double)); ALLOC(cl_rho, sizeof(double)); ALLOC(cl_alpha, sizeof(double)); ALLOC(cl_beta, sizeof(double)); ALLOC(cl_omega, 2 * sizeof(double)); ALLOC(cl_omega1, sizeof(double)); } else #endif { ret |= posix_memalign((void**)&ctx->x, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->p, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->v, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->y, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->z, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->t, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->res, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->res0, 32, sizeof(double) * N); ret |= posix_memalign((void**)&ctx->k, 32, sizeof(double) * N * N); } fail: if (ret) { tdi_bicgstab_context_free(&ctx); return -ENOMEM; } *pctx = ctx; return 0; } void tdi_bicgstab_context_free(BiCGStabContext **pctx) { BiCGStabContext *ctx = *pctx; if (!ctx) return; free(ctx->x); free(ctx->p); free(ctx->v); free(ctx->y); free(ctx->z); free(ctx->t); free(ctx->res); free(ctx->res0); free(ctx->k); #if HAVE_OPENCL if (ctx->ocl_ctx) { clReleaseMemObject(ctx->cl_x); clReleaseMemObject(ctx->cl_p); clReleaseMemObject(ctx->cl_v); clReleaseMemObject(ctx->cl_y); clReleaseMemObject(ctx->cl_z); clReleaseMemObject(ctx->cl_t); clReleaseMemObject(ctx->cl_res); clReleaseMemObject(ctx->cl_res0); clReleaseMemObject(ctx->cl_tmp); clReleaseMemObject(ctx->cl_tmp1); clReleaseMemObject(ctx->cl_k); clReleaseMemObject(ctx->cl_mat); clReleaseMemObject(ctx->cl_rho); clReleaseMemObject(ctx->cl_alpha); clReleaseMemObject(ctx->cl_beta); clReleaseMemObject(ctx->cl_omega); clReleaseMemObject(ctx->cl_omega1); } #endif free(ctx); *pctx = NULL; }