From 57adceb6497e50355c2d6011ec4bb03ea10cb23b Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sun, 26 Jul 2020 19:08:23 +0200 Subject: init: multi-thread solution evaluation --- init.c | 145 ++++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 51 deletions(-) diff --git a/init.c b/init.c index f4d130f..6467988 100644 --- a/init.c +++ b/init.c @@ -750,13 +750,58 @@ static double scalarproduct_metric_c(size_t len1, size_t len2, double *mat, return val; } +typedef struct EvalVarData { + const TDContext *td; + + const BasisSetContext *basis[2]; + + const double *coeffs; + + const double *theta; + const double *r; + + double *basis_val[2]; + double *out; + + double diff_order[2]; + double init_val; +} EvalVarData; + +static int eval_var_kernel(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + EvalVarData *d = arg; + const TDContext *td = d->td; + const TDPriv *s = td->priv; + + double *basis_val[2] = { + d->basis_val[0] + thread_idx * td->nb_coeffs[0], + d->basis_val[1] + thread_idx * td->nb_coeffs[1], + }; + + double theta_val = d->theta[job_idx]; + double r_val = d->r[job_idx]; + + double val = d->init_val; + + for (int k = 0; k < td->nb_coeffs[0]; k++) + basis_val[0][k] = tdi_basis_eval(d->basis[0], d->diff_order[0], r_val, k); + for (int k = 0; k < td->nb_coeffs[1]; k++) + basis_val[1][k] = tdi_basis_eval(d->basis[1], d->diff_order[1], theta_val, k); + + val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], d->coeffs, + basis_val[0], basis_val[1]); + + d->out[job_idx] = val; +} + static int eval_var(const TDContext *td, unsigned int var_idx, size_t nb_coords, const double *r, const double *theta, const unsigned int diff_order[2], double *out) { TDPriv *s = td->priv; - double *basis_val[2] = { NULL }; + const int nb_threads = tp_get_nb_threads(s->tp); + EvalVarData thread_data = { NULL }; int ret = 0; if (diff_order[0] > 2 || diff_order[1] > 2) { @@ -764,36 +809,34 @@ static int eval_var(const TDContext *td, unsigned int var_idx, return -ENOSYS; } - posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]); - posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]); - if (!basis_val[0] || !basis_val[1]) { - ret = -ENOMEM; - goto fail; + for (int i = 0; i < ARRAY_ELEMS(thread_data.basis_val); i++) { + const size_t alloc_elems = td->nb_coeffs[i] * nb_threads; + ret = posix_memalign((void**)&thread_data.basis_val[i], 32, + sizeof(*thread_data.basis_val[i]) * alloc_elems); + if (ret != 0) { + ret = -ENOMEM; + goto fail; + } + memset(thread_data.basis_val[i], 0, + sizeof(*thread_data.basis_val[i]) * alloc_elems); } - memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]); - memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]); - for (int i = 0; i < nb_coords; i++) { - double theta_val = theta[i]; - double r_val = r[i]; - - double val = (var_idx == 0 && diff_order[0] == 0 && diff_order[1] == 0) ? 1.0 : 0.0; - - for (int k = 0; k < td->nb_coeffs[0]; k++) - basis_val[0][k] = tdi_basis_eval(s->basis[var_idx][0], diff_order[0], r_val, k); - for (int k = 0; k < td->nb_coeffs[1]; k++) - basis_val[1][k] = tdi_basis_eval(s->basis[var_idx][1], diff_order[1], theta_val, k); - - val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], td->coeffs[var_idx], - basis_val[0], basis_val[1]); - - out[i] = val; - } + thread_data.td = td; + thread_data.r = r; + thread_data.theta = theta; + thread_data.out = out; + thread_data.diff_order[0] = diff_order[0]; + thread_data.diff_order[1] = diff_order[1]; + thread_data.basis[0] = s->basis[var_idx][0]; + thread_data.basis[1] = s->basis[var_idx][1]; + thread_data.coeffs = td->coeffs[var_idx]; + thread_data.init_val = (var_idx == 0 && diff_order[0] == 0 && diff_order[1] == 0) ? 1.0 : 0.0; + tp_execute(s->tp, nb_coords, eval_var_kernel, &thread_data); fail: - free(basis_val[0]); - free(basis_val[1]); + free(thread_data.basis_val[0]); + free(thread_data.basis_val[1]); return ret; } @@ -866,7 +909,8 @@ int td_eval_lapse(const TDContext *td, double *out) { TDPriv *priv = td->priv; - double *basis_val[2] = { NULL }; + const int nb_threads = tp_get_nb_threads(priv->tp); + EvalVarData thread_data = { NULL }; int ret; if (!priv->coeffs_lapse) { @@ -880,35 +924,34 @@ int td_eval_lapse(const TDContext *td, return -ENOSYS; } - posix_memalign((void**)&basis_val[0], 32, sizeof(*basis_val[0]) * td->nb_coeffs[0]); - posix_memalign((void**)&basis_val[1], 32, sizeof(*basis_val[1]) * td->nb_coeffs[1]); - if (!basis_val[0] || !basis_val[1]) { - ret = -ENOMEM; - goto fail; + for (int i = 0; i < ARRAY_ELEMS(thread_data.basis_val); i++) { + const size_t alloc_elems = td->nb_coeffs[i] * nb_threads; + ret = posix_memalign((void**)&thread_data.basis_val[i], 32, + sizeof(*thread_data.basis_val[i]) * alloc_elems); + if (ret != 0) { + ret = -ENOMEM; + goto fail; + } + memset(thread_data.basis_val[i], 0, + sizeof(*thread_data.basis_val[i]) * alloc_elems); } - memset(basis_val[0], 0, sizeof(*basis_val[0]) * td->nb_coeffs[0]); - memset(basis_val[1], 0, sizeof(*basis_val[1]) * td->nb_coeffs[1]); - for (int i = 0; i < nb_coords; i++) { - double theta_val = theta[i]; - double r_val = r[i]; + thread_data.td = td; + thread_data.r = r; + thread_data.theta = theta; + thread_data.out = out; + thread_data.diff_order[0] = diff_order[0]; + thread_data.diff_order[1] = diff_order[1]; + thread_data.basis[0] = priv->basis[0][0]; + thread_data.basis[1] = priv->basis[0][1]; + thread_data.coeffs = priv->coeffs_lapse; + thread_data.init_val = (diff_order[0] == 0 && diff_order[1] == 0) ? 1.0 : 0.0; - double val = (diff_order[0] == 0 && diff_order[1] == 0) ? 1.0 : 0.0; - - for (int k = 0; k < td->nb_coeffs[0]; k++) - basis_val[0][k] = tdi_basis_eval(priv->basis[0][0], diff_order[0], r_val, k); - for (int k = 0; k < td->nb_coeffs[1]; k++) - basis_val[1][k] = tdi_basis_eval(priv->basis[0][1], diff_order[1], theta_val, k); - - val += scalarproduct_metric_c(td->nb_coeffs[0], td->nb_coeffs[1], priv->coeffs_lapse, - basis_val[0], basis_val[1]); - - out[i] = val; - } + tp_execute(priv->tp, nb_coords, eval_var_kernel, &thread_data); fail: - free(basis_val[0]); - free(basis_val[1]); + free(thread_data.basis_val[0]); + free(thread_data.basis_val[1]); return 0; } -- cgit v1.2.3