From 234e722c5abfb15c9850b4224cabbb3e2d0c3657 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 9 Feb 2018 15:34:25 +0100 Subject: Add the remaining parts for teukolsky waves initial data --- init.c | 438 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 438 insertions(+) create mode 100644 init.c (limited to 'init.c') diff --git a/init.c b/init.c new file mode 100644 index 0000000..4fdece4 --- /dev/null +++ b/init.c @@ -0,0 +1,438 @@ +/* + * Copyright 2017 Anton Khirnov + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include + +#if HAVE_OPENCL +#include +#include +#endif + +#include "basis.h" +#include "common.h" +#include "cpu.h" +#include "log.h" +#include "nlsolve.h" +#include "teukolsky_data.h" +#include "threadpool.h" + +#define NB_EQUATIONS 3 + +typedef struct TDPriv { + unsigned int basis_order[NB_EQUATIONS][2]; + BasisSetContext *basis[NB_EQUATIONS][2]; + + NLSolveContext *solver; + + ThreadPoolContext *tp; + TDLogger logger; + + double *coeffs; + ptrdiff_t coeffs_stride; + + int cpu_flags; + + double (*scalarproduct_metric)(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2); +} TDPriv; + +double tdi_scalarproduct_metric_fma3(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2); +double tdi_scalarproduct_metric_avx(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2); +double tdi_scalarproduct_metric_sse3(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2); +double tdi_scalarproduct_metric_c(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2); + +static void init_opencl(TDPriv *s) +#if HAVE_OPENCL +{ + int err, count; + cl_platform_id platform; + cl_context_properties props[3]; + cl_device_id ocl_device; + + err = clGetPlatformIDs(1, &platform, &count); + if (err != CL_SUCCESS || count < 1) { + tdi_log(&s->logger, 0, "Could not get an OpenCL platform ID\n"); + return; + } + + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count); + if (err != CL_SUCCESS || count < 1) { + tdi_log(&s->logger, 0, "Could not get an OpenCL device ID\n"); + return; + } + + props[0] = CL_CONTEXT_PLATFORM; + props[1] = (cl_context_properties)platform; + props[2] = 0; + + s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err); + if (err != CL_SUCCESS || !s->ocl_ctx) { + tdi_log(&s->logger, 0, "Could not create an OpenCL context\n"); + return; + } + + s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err); + if (err != CL_SUCCESS || !s->ocl_queue) { + tdi_log(&s->logger, 0, "Could not create an OpenCL command queue: %d\n", err); + goto fail; + } + + err = clblasSetup(); + if (err != CL_SUCCESS) { + tdi_log(&s->logger, 0, "Error setting up clBLAS\n"); + goto fail; + } + + return; +fail: + if (s->ocl_queue) + clReleaseCommandQueue(s->ocl_queue); + s->ocl_queue = 0; + + if (s->ocl_ctx) + clReleaseContext(s->ocl_ctx); + s->ocl_ctx = 0; +} +#else +{ +} +#endif + +static const enum BasisFamily basis_sets[NB_EQUATIONS][2] = { + { BASIS_FAMILY_SB_EVEN, BASIS_FAMILY_COS_EVEN }, + { BASIS_FAMILY_SB_EVEN, BASIS_FAMILY_COS_EVEN }, + { BASIS_FAMILY_SB_EVEN, BASIS_FAMILY_COS_EVEN }, +}; + +static int teukolsky_init_check_options(TDContext *td) +{ + TDPriv *s = td->priv; + int ret; + + s->cpu_flags = tdi_init_cpu_flags(); + + if (!td->nb_threads) { + td->nb_threads = tdi_cpu_count(); + if (!td->nb_threads) + td->nb_threads = 1; + } + + ret = tdi_threadpool_init(&s->tp, td->nb_threads); + if (ret < 0) + return ret; + + init_opencl(s); + + s->logger.log = tdi_log_default_callback; + + //s->scalarproduct_metric = tdi_scalarproduct_metric_c; + //if (EXTERNAL_SSE3(s->cpu_flags)) + // s->scalarproduct_metric = tdi_scalarproduct_metric_sse3; + //if (EXTERNAL_AVX(s->cpu_flags)) + // s->scalarproduct_metric = tdi_scalarproduct_metric_avx; + //if (EXTERNAL_FMA3(s->cpu_flags)) + // s->scalarproduct_metric = tdi_scalarproduct_metric_fma3; +// for (int i = 0; i < ctx->nb_equations; i++) +// for (int j = 0; j < 2; j++) { +// s->ps_ctx->basis[i][j] = ctx->basis[i][j]; +// s->ps_ctx->solve_order[i][j] = basis_order[i][j]; +// max_order = MAX(max_order, basis_order[i][j]); +// } +// + //for (int i = 0; i < max_order; i++) { + // tdi_log(&s->logger, 2, "%d ", i); + // for (int j = 0; j < ctx->nb_equations; j++) + // for (int k = 0; k < 2; k++) { + // if (i < s->ps_ctx->solve_order[j][k]) + // tdi_log(&s->logger, 2, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]); + // else + // tdi_log(&s->logger, 2, " "); + // } + // tdi_log(&s->logger, 2, "\n"); + //} + + s->basis_order[0][0] = td->nb_coeffs[0]; + s->basis_order[0][1] = td->nb_coeffs[1]; + s->basis_order[1][0] = td->nb_coeffs[0]; + s->basis_order[1][1] = td->nb_coeffs[1]; + s->basis_order[2][0] = td->nb_coeffs[0]; + s->basis_order[2][1] = td->nb_coeffs[1]; + + ret = posix_memalign((void**)&s->coeffs, 32, + sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]); + if (ret) + return -ENOMEM; + memset(s->coeffs, 0, sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]); + + for (int i = 0; i < NB_EQUATIONS; i++) + td->coeffs[i] = s->coeffs + i * td->nb_coeffs[0] * td->nb_coeffs[1]; + + for (int i = 0; i < ARRAY_ELEMS(basis_sets); i++) + for (int j = 0; j < ARRAY_ELEMS(basis_sets[i]); j++) { + double sf; + + ret = tdi_basis_init(&s->basis[i][j], basis_sets[i][j], td->basis_scale_factor[j]); + if (ret < 0) + return ret; + } + + + ret = tdi_nlsolve_context_alloc(&s->solver, ARRAY_ELEMS(basis_sets)); + if (ret < 0) { + tdi_log(&s->logger, 0, "Error allocating the non-linear solver\n"); + return ret; + } + + s->solver->logger = s->logger; + s->solver->tp = s->tp; + s->solver->maxiter = td->max_iter; + s->solver->atol = td->atol; + + memcpy(s->solver->basis, s->basis, sizeof(s->basis)); + memcpy(s->solver->solve_order, s->basis_order, sizeof(s->basis_order)); + +#if HAVE_OPENCL + s->solver->ocl_ctx = s->ocl_ctx; + s->solver->ocl_queue = s->ocl_queue; +#endif + + ret = tdi_nlsolve_context_init(s->solver); + if (ret < 0) { + tdi_log(&s->logger, 0, "Error initializing the non-linear solver\n"); + return ret; + } + + return 0; +} + +static int teukolsky_constraint_eval(void *opaque, unsigned int eq_idx, + const unsigned int *colloc_grid_order, + const double * const *colloc_grid, + const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB], + double *dst) +{ + const double *amplitude = opaque; + + return tdi_constraints_eval(eq_idx, *amplitude, colloc_grid_order, colloc_grid, + vars, dst); +} + +#define AMPLITUDE_DIRECT_SOLVE 1e-3 + +int td_solve(TDContext *td, double *coeffs_init[3]) +{ + TDPriv *s = td->priv; + double cur_amplitude = td->amplitude;//MIN(td->amplitude, AMPLITUDE_DIRECT_SOLVE); + int ret; + + ret = teukolsky_init_check_options(td); + if (ret < 0) + return ret; + + if (coeffs_init) { + for (int i = 0; i < 3; i++) { + memcpy(td->coeffs[i], coeffs_init[i], sizeof(*td->coeffs[i]) * + td->nb_coeffs[0] * td->nb_coeffs[1]); + } + } + + while (1) { + double scale_factor; + + ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL, + &cur_amplitude, s->coeffs); + if (ret < 0) + return ret; + + if (fabs(cur_amplitude - td->amplitude) < DBL_EPSILON) + goto finish; + + scale_factor = cur_amplitude; + cur_amplitude = MIN(td->amplitude, 2 * cur_amplitude); + scale_factor = cur_amplitude / scale_factor; + + cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], SQR(scale_factor), td->coeffs[0], 1); + cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], scale_factor, td->coeffs[1], 1); + cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], scale_factor, td->coeffs[2], 1); + + tdi_log(&s->logger, 2, "Trying amplitude %g %g\n", cur_amplitude, scale_factor); + } + +finish: + tdi_solver_print_stats(s->solver); + + return 0; +} + +TDContext *td_context_alloc(void) +{ + TDContext *td = calloc(1, sizeof(*td)); + + if (!td) + return NULL; + + td->nb_threads = 1; + + td->amplitude = 1.0; + + td->nb_coeffs[0] = 32; + td->nb_coeffs[1] = 16; + + td->basis_scale_factor[0] = 3.0; + td->basis_scale_factor[1] = 3.0; + + td->max_iter = 16; + td->atol = 1e-12; + + td->log_callback = tdi_log_default_callback; + + td->priv = calloc(1, sizeof(TDPriv)); + if (!td->priv) { + free(td); + return NULL; + } + + return td; +} + +void td_context_free(TDContext **ptd) +{ + TDContext *td = *ptd; + TDPriv *s; + + if (!td) + return; + + s = td->priv; + + tdi_nlsolve_context_free(&s->solver); + tdi_threadpool_free(&s->tp); + +#if HAVE_OPENCL + if (s->ocl_queue) + clReleaseCommandQueue(s->ocl_queue); + if (s->ocl_ctx) + clReleaseContext(s->ocl_ctx); +#endif + + for (int i = 0; i < ARRAY_ELEMS(s->basis); i++) + for (int j = 0; j < ARRAY_ELEMS(s->basis[i]); j++) + tdi_basis_free(&s->basis[i][j]); + + free(s->coeffs); + + free(td->priv); + free(td); + *ptd = NULL; +} + +static double scalarproduct_metric_c(size_t len1, size_t len2, double *mat, + double *vec1, double *vec2) +{ + double val = 0.0; + for (int m = 0; m < len2; m++) { + double tmp = 0.0; + for (int n = 0; n < len1; n++) + tmp += mat[m * len1 + n] * vec1[n]; + val += tmp * vec2[m]; + } + return val; +} + +static int eval_var(const TDContext *td, unsigned int var_idx, + size_t nb_coords, const double *r, const double *theta, + const unsigned int diff_order[2], + double *out) +{ + TDPriv *s = td->priv; + double *basis_val[2] = { NULL }; + int ret = 0; + + if (diff_order[0] > 0 || diff_order[1] > 0) { + tdi_log(&s->logger, 0, "Derivatives of higher order than 2 are not supported.\n"); + return -ENOSYS; + } + + posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]); + posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]); + if (!basis_val[0] || !basis_val[1]) { + ret = -ENOMEM; + goto fail; + } + memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]); + memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]); + + for (int i = 0; i < nb_coords; i++) { + double theta_val = theta[i]; + double r_val = r[i]; + + double val = (var_idx == 0) ? 1.0 : 0.0; + + for (int k = 0; k < td->nb_coeffs[0]; k++) + basis_val[0][k] = tdi_basis_eval(s->basis[var_idx][0], BS_EVAL_TYPE_VALUE, r_val, k); + for (int k = 0; k < td->nb_coeffs[1]; k++) + basis_val[1][k] = tdi_basis_eval(s->basis[var_idx][1], BS_EVAL_TYPE_VALUE, theta_val, k); + + val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], td->coeffs[var_idx], + basis_val[0], basis_val[1]); + + out[i] = val; + } + + +fail: + free(basis_val[0]); + free(basis_val[1]); + + return ret; +} + +int td_eval_psi(const TDContext *td, + size_t nb_coords, const double *r, const double *theta, + const unsigned int diff_order[2], + double *out) +{ + return eval_var(td, 0, nb_coords, r, theta, diff_order, out); +} + +int td_eval_krr(const TDContext *td, + size_t nb_coords, const double *r, const double *theta, + const unsigned int diff_order[2], + double *out) +{ + return eval_var(td, 1, nb_coords, r, theta, diff_order, out); +} +int td_eval_kpp(const TDContext *td, + size_t nb_coords, const double *r, const double *theta, + const unsigned int diff_order[2], + double *out) +{ + return eval_var(td, 2, nb_coords, r, theta, diff_order, out); +} -- cgit v1.2.3