/* * Copyright 2017 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "config.h" #include #include #include #include #include #include #include #if HAVE_OPENCL #include #include #endif #include "basis.h" #include "common.h" #include "cpu.h" #include "log.h" #include "nlsolve.h" #include "td_constraints.h" #include "teukolsky_data.h" #include "threadpool.h" #define NB_EQUATIONS 3 typedef struct TDPriv { unsigned int basis_order[NB_EQUATIONS][2]; BasisSetContext *basis[NB_EQUATIONS][2]; NLSolveContext *solver; ThreadPoolContext *tp; TDLogger logger; double *coeffs; double *coeffs_tmp; ptrdiff_t coeffs_stride; int cpu_flags; double (*scalarproduct_metric)(size_t len1, size_t len2, double *mat, double *vec1, double *vec2); } TDPriv; double tdi_scalarproduct_metric_fma3(size_t len1, size_t len2, double *mat, double *vec1, double *vec2); double tdi_scalarproduct_metric_avx(size_t len1, size_t len2, double *mat, double *vec1, double *vec2); double tdi_scalarproduct_metric_sse3(size_t len1, size_t len2, double *mat, double *vec1, double *vec2); double tdi_scalarproduct_metric_c(size_t len1, size_t len2, double *mat, double *vec1, double *vec2); static void init_opencl(TDPriv *s) #if HAVE_OPENCL { int err, count; cl_platform_id platform; cl_context_properties props[3]; cl_device_id ocl_device; err = clGetPlatformIDs(1, &platform, &count); if (err != CL_SUCCESS || count < 1) { tdi_log(&s->logger, 0, "Could not get an OpenCL platform ID\n"); return; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ocl_device, &count); if (err != CL_SUCCESS || count < 1) { tdi_log(&s->logger, 0, "Could not get an OpenCL device ID\n"); return; } props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties)platform; props[2] = 0; s->ocl_ctx = clCreateContext(props, 1, &ocl_device, NULL, NULL, &err); if (err != CL_SUCCESS || !s->ocl_ctx) { tdi_log(&s->logger, 0, "Could not create an OpenCL context\n"); return; } s->ocl_queue = clCreateCommandQueue(s->ocl_ctx, ocl_device, 0, &err); if (err != CL_SUCCESS || !s->ocl_queue) { tdi_log(&s->logger, 0, "Could not create an OpenCL command queue: %d\n", err); goto fail; } err = clblasSetup(); if (err != CL_SUCCESS) { tdi_log(&s->logger, 0, "Error setting up clBLAS\n"); goto fail; } return; fail: if (s->ocl_queue) clReleaseCommandQueue(s->ocl_queue); s->ocl_queue = 0; if (s->ocl_ctx) clReleaseContext(s->ocl_ctx); s->ocl_ctx = 0; } #else { } #endif static const enum BasisFamily basis_sets[NB_EQUATIONS][2] = { { BASIS_FAMILY_SB_EVEN, BASIS_FAMILY_COS_EVEN }, { BASIS_FAMILY_SB_EVEN, BASIS_FAMILY_COS_EVEN }, { BASIS_FAMILY_SB_EVEN, BASIS_FAMILY_COS_EVEN }, }; static void log_callback(TDLogger *log, int level, const char *fmt, va_list vl) { TDContext *td = log->opaque; td->log_callback(td, level, fmt, vl); } static int teukolsky_init_check_options(TDContext *td) { TDPriv *s = td->priv; int ret; s->cpu_flags = tdi_init_cpu_flags(); if (!td->nb_threads) { td->nb_threads = tdi_cpu_count(); if (!td->nb_threads) td->nb_threads = 1; } ret = tdi_threadpool_init(&s->tp, td->nb_threads); if (ret < 0) return ret; init_opencl(s); s->logger.log = log_callback; s->logger.opaque = td; //s->scalarproduct_metric = tdi_scalarproduct_metric_c; //if (EXTERNAL_SSE3(s->cpu_flags)) // s->scalarproduct_metric = tdi_scalarproduct_metric_sse3; //if (EXTERNAL_AVX(s->cpu_flags)) // s->scalarproduct_metric = tdi_scalarproduct_metric_avx; //if (EXTERNAL_FMA3(s->cpu_flags)) // s->scalarproduct_metric = tdi_scalarproduct_metric_fma3; // for (int i = 0; i < ctx->nb_equations; i++) // for (int j = 0; j < 2; j++) { // s->ps_ctx->basis[i][j] = ctx->basis[i][j]; // s->ps_ctx->solve_order[i][j] = basis_order[i][j]; // max_order = MAX(max_order, basis_order[i][j]); // } // //for (int i = 0; i < max_order; i++) { // tdi_log(&s->logger, 2, "%d ", i); // for (int j = 0; j < ctx->nb_equations; j++) // for (int k = 0; k < 2; k++) { // if (i < s->ps_ctx->solve_order[j][k]) // tdi_log(&s->logger, 2, "%8.8g\t", s->ps_ctx->colloc_grid[j][k][i]); // else // tdi_log(&s->logger, 2, " "); // } // tdi_log(&s->logger, 2, "\n"); //} s->basis_order[0][0] = td->nb_coeffs[0]; s->basis_order[0][1] = td->nb_coeffs[1]; s->basis_order[1][0] = td->nb_coeffs[0]; s->basis_order[1][1] = td->nb_coeffs[1]; s->basis_order[2][0] = td->nb_coeffs[0]; s->basis_order[2][1] = td->nb_coeffs[1]; ret = posix_memalign((void**)&s->coeffs, 32, sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]); if (ret) return -ENOMEM; memset(s->coeffs, 0, sizeof(*s->coeffs) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]); if (td->solution_branch > 0) { ret = posix_memalign((void**)&s->coeffs_tmp, 32, sizeof(*s->coeffs_tmp) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]); if (ret) return -ENOMEM; memset(s->coeffs_tmp, 0, sizeof(*s->coeffs_tmp) * NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]); } for (int i = 0; i < NB_EQUATIONS; i++) td->coeffs[i] = s->coeffs + i * td->nb_coeffs[0] * td->nb_coeffs[1]; for (int i = 0; i < ARRAY_ELEMS(basis_sets); i++) for (int j = 0; j < ARRAY_ELEMS(basis_sets[i]); j++) { double sf; ret = tdi_basis_init(&s->basis[i][j], basis_sets[i][j], td->basis_scale_factor[j]); if (ret < 0) return ret; } ret = tdi_nlsolve_context_alloc(&s->solver, ARRAY_ELEMS(basis_sets)); if (ret < 0) { tdi_log(&s->logger, 0, "Error allocating the non-linear solver\n"); return ret; } s->solver->logger = s->logger; s->solver->tp = s->tp; s->solver->maxiter = td->max_iter; s->solver->atol = td->atol; memcpy(s->solver->basis, s->basis, sizeof(s->basis)); memcpy(s->solver->solve_order, s->basis_order, sizeof(s->basis_order)); #if HAVE_OPENCL s->solver->ocl_ctx = s->ocl_ctx; s->solver->ocl_queue = s->ocl_queue; #endif ret = tdi_nlsolve_context_init(s->solver); if (ret < 0) { tdi_log(&s->logger, 0, "Error initializing the non-linear solver\n"); return ret; } return 0; } static int constraint_eval_alloc(TDContext *td, double amplitude, TDConstraintEvalContext **pce) { TDPriv *priv = td->priv; TDConstraintEvalContext *ce; int ret; ret = tdi_constraint_eval_alloc(&ce, TD_FAMILY_AE_TIME_ANTISYM); if (ret < 0) { tdi_log(&priv->logger, 0, "Error allocating the constraints evaluator\n"); return ret; } ce->logger = priv->logger; ce->amplitude = amplitude; ce->nb_coords[0] = td->nb_coeffs[0]; ce->nb_coords[1] = td->nb_coeffs[1]; ce->coords[0] = priv->solver->colloc_grid[0][0]; ce->coords[1] = priv->solver->colloc_grid[0][1]; ret = tdi_constraint_eval_init(ce); if (ret < 0) { tdi_constraint_eval_free(&ce); tdi_log(&priv->logger, 0, "Error initializing the constraints evaluator\n"); return ret; } *pce = ce; return 0; } static int teukolsky_constraint_eval(void *opaque, unsigned int eq_idx, const unsigned int *colloc_grid_order, const double * const *colloc_grid, const double * const (*vars)[PSSOLVE_DIFF_ORDER_NB], double *dst) { TDConstraintEvalContext *ce = opaque; const double *amplitude = opaque; return tdi_constraint_eval(ce, eq_idx, vars, dst); } int td_solve(TDContext *td, double *coeffs_init[3]) { TDPriv *s = td->priv; TDConstraintEvalContext *ce; double a0; int ret; ret = teukolsky_init_check_options(td); if (ret < 0) return ret; ret = constraint_eval_alloc(td, 0.0, &ce); if (ret < 0) return ret; if (fabs(td->amplitude) >= ce->a_diverge) { tdi_log(&s->logger, 0, "Amplitude A=%16.16g is above the point A_{max}=%g, no solutions " "are known to exist there. Set solution_branch=1 to get to the " "second branch where mass increases with decreasing amplitude\n", td->amplitude, ce->a_converge); } a0 = ce->a_converge; tdi_constraint_eval_free(&ce); if (coeffs_init) { for (int i = 0; i < 3; i++) { memcpy(td->coeffs[i], coeffs_init[i], sizeof(*td->coeffs[i]) * td->nb_coeffs[0] * td->nb_coeffs[1]); } } if (td->solution_branch == 0 || coeffs_init) { // direct solve with default (flat space) or user-provided initial guess ret = constraint_eval_alloc(td, td->amplitude, &ce); if (ret < 0) return ret; ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL, ce, s->coeffs, 0); tdi_constraint_eval_free(&ce); if (ret < 0) { tdi_log(&s->logger, 0, "tdi_nlsolve_solve() failed: %d", ret); return ret; } } else { // second branch requested and no user-provided initial guess // execute two lower-branch solutions and extrapolate to get to the upper branch const int N = NB_EQUATIONS * td->nb_coeffs[0] * td->nb_coeffs[1]; const double delta = 1e-5; const double a1 = a0 - delta; double cur_amplitude, new_amplitude, inverse_step; int dir; ret = constraint_eval_alloc(td, a0, &ce); if (ret < 0) return ret; ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL, ce, s->coeffs, 0); tdi_constraint_eval_free(&ce); if (ret < 0) return ret; ret = constraint_eval_alloc(td, a1, &ce); if (ret < 0) return ret; ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL, ce, s->coeffs_tmp, 0); tdi_constraint_eval_free(&ce); if (ret < 0) return ret; cblas_daxpy(N, -1.0, s->coeffs, 1, s->coeffs_tmp, 1); cblas_daxpy(N, -1.0, s->coeffs_tmp, 1, s->coeffs, 1); // obtain solution for a1 in the upper branch ret = constraint_eval_alloc(td, a1, &ce); if (ret < 0) return ret; ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL, ce, s->coeffs, 0); tdi_constraint_eval_free(&ce); if (ret < 0) { tdi_log(&s->logger, 0, "Failed to get into the upper branch\n"); return ret; } cur_amplitude = a1; dir = SGN(td->amplitude - cur_amplitude); inverse_step = 1.0 / td->amplitude - 1.0 / cur_amplitude; while (fabs(cur_amplitude - td->amplitude) > DBL_EPSILON) { //double scale_factor; //scale_factor = cur_amplitude; //cur_amplitude = MIN(td->amplitude, 2 * cur_amplitude); //scale_factor = cur_amplitude / scale_factor; new_amplitude = 1.0 / ((1.0 / cur_amplitude) + inverse_step); if (dir * (td->amplitude - new_amplitude) < 0) new_amplitude = td->amplitude; tdi_log(&s->logger, 2, "Trying amplitude %g\n", new_amplitude); ret = constraint_eval_alloc(td, new_amplitude, &ce); if (ret < 0) return ret; memcpy(s->coeffs_tmp, s->coeffs, sizeof(*s->coeffs) * N); ret = tdi_nlsolve_solve(s->solver, teukolsky_constraint_eval, NULL, ce, s->coeffs_tmp, 1); tdi_constraint_eval_free(&ce); if (ret == -EDOM) { inverse_step = 0.5 * inverse_step; if (fabs(inverse_step) < 1e-2) return ret; continue; } else if (ret < 0) return ret; cur_amplitude = new_amplitude; memcpy(s->coeffs, s->coeffs_tmp, sizeof(*s->coeffs) * N); } //while (1) { // if (fabs(cur_amplitude - td->amplitude) < DBL_EPSILON) // goto finish; // cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], SQR(scale_factor), td->coeffs[0], 1); // cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], scale_factor, td->coeffs[1], 1); // cblas_dscal(td->nb_coeffs[0] * td->nb_coeffs[1], scale_factor, td->coeffs[2], 1); //} } finish: tdi_nlsolve_print_stats(s->solver); return 0; } static void log_default_callback(const TDContext *td, int level, const char *fmt, va_list vl) { vfprintf(stderr, fmt, vl); } TDContext *td_context_alloc(void) { TDContext *td = calloc(1, sizeof(*td)); if (!td) return NULL; td->nb_threads = 1; td->amplitude = 1.0; td->nb_coeffs[0] = 32; td->nb_coeffs[1] = 16; td->basis_scale_factor[0] = 3.0; td->basis_scale_factor[1] = 3.0; td->max_iter = 16; td->atol = 1e-12; td->log_callback = log_default_callback; td->priv = calloc(1, sizeof(TDPriv)); if (!td->priv) { free(td); return NULL; } return td; } void td_context_free(TDContext **ptd) { TDContext *td = *ptd; TDPriv *s; if (!td) return; s = td->priv; tdi_nlsolve_context_free(&s->solver); tdi_threadpool_free(&s->tp); #if HAVE_OPENCL if (s->ocl_queue) clReleaseCommandQueue(s->ocl_queue); if (s->ocl_ctx) clReleaseContext(s->ocl_ctx); #endif for (int i = 0; i < ARRAY_ELEMS(s->basis); i++) for (int j = 0; j < ARRAY_ELEMS(s->basis[i]); j++) tdi_basis_free(&s->basis[i][j]); free(s->coeffs); free(s->coeffs_tmp); free(td->priv); free(td); *ptd = NULL; } static double scalarproduct_metric_c(size_t len1, size_t len2, double *mat, double *vec1, double *vec2) { double val = 0.0; for (int m = 0; m < len2; m++) { double tmp = 0.0; for (int n = 0; n < len1; n++) tmp += mat[m * len1 + n] * vec1[n]; val += tmp * vec2[m]; } return val; } static int eval_var(const TDContext *td, unsigned int var_idx, size_t nb_coords, const double *r, const double *theta, const unsigned int diff_order[2], double *out) { TDPriv *s = td->priv; double *basis_val[2] = { NULL }; int ret = 0; if (diff_order[0] > 0 || diff_order[1] > 0) { tdi_log(&s->logger, 0, "Derivatives of higher order than 2 are not supported.\n"); return -ENOSYS; } posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]); posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]); if (!basis_val[0] || !basis_val[1]) { ret = -ENOMEM; goto fail; } memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]); memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]); for (int i = 0; i < nb_coords; i++) { double theta_val = theta[i]; double r_val = r[i]; double val = (var_idx == 0) ? 1.0 : 0.0; for (int k = 0; k < td->nb_coeffs[0]; k++) basis_val[0][k] = tdi_basis_eval(s->basis[var_idx][0], BS_EVAL_TYPE_VALUE, r_val, k); for (int k = 0; k < td->nb_coeffs[1]; k++) basis_val[1][k] = tdi_basis_eval(s->basis[var_idx][1], BS_EVAL_TYPE_VALUE, theta_val, k); val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], td->coeffs[var_idx], basis_val[0], basis_val[1]); out[i] = val; } fail: free(basis_val[0]); free(basis_val[1]); return ret; } int td_eval_psi(const TDContext *td, size_t nb_coords, const double *r, const double *theta, const unsigned int diff_order[2], double *out) { return eval_var(td, 0, nb_coords, r, theta, diff_order, out); } int td_eval_krr(const TDContext *td, size_t nb_coords, const double *r, const double *theta, const unsigned int diff_order[2], double *out) { return eval_var(td, 1, nb_coords, r, theta, diff_order, out); } int td_eval_kpp(const TDContext *td, size_t nb_coords, const double *r, const double *theta, const unsigned int diff_order[2], double *out) { return eval_var(td, 2, nb_coords, r, theta, diff_order, out); }