summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2018-12-27 16:26:29 +0100
committerAnton Khirnov <anton@khirnov.net>2018-12-27 16:26:29 +0100
commit70186149b931a49f1eb599d2f207fdc3d3048388 (patch)
treee8f825af87c92d30d20f1ca4fb44c1e57a5fe25a
parent7606e74f1a0310c1031aad48f77a7c701791025a (diff)
Add threading support through libthreadpool.
-rw-r--r--ell_relax.c60
-rw-r--r--ell_relax.h7
-rw-r--r--libmg2d.v2
-rw-r--r--mg2d.c155
-rw-r--r--mg2d.h7
5 files changed, 175 insertions, 56 deletions
diff --git a/ell_relax.c b/ell_relax.c
index a3f8968..042c4a5 100644
--- a/ell_relax.c
+++ b/ell_relax.c
@@ -22,6 +22,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
+#include <threadpool.h>
#include "common.h"
#include "cpu.h"
@@ -48,6 +49,8 @@ struct EllRelaxInternal {
const double *fd_factors);
double fd_factors[ELL_RELAX_DIFF_COEFF_NB];
double relax_factor;
+
+ TPContext *tp_internal;
};
static const struct {
@@ -198,23 +201,29 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst,
}
}
+static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+ EllRelaxContext *ctx = arg;
+ EllRelaxInternal *priv = ctx->priv;
+ const ptrdiff_t offset = job_idx * priv->stride;
+ const double *diff_coeffs[ELL_RELAX_DIFF_COEFF_NB];
+
+ for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
+ diff_coeffs[i] = ctx->diff_coeffs[i] + offset;
+
+ priv->residual_calc_line(ctx->domain_size[0], ctx->residual + offset,
+ priv->stride, ctx->u + offset, ctx->rhs + offset,
+ diff_coeffs, priv->fd_factors);
+}
+
static void residual_calc(EllRelaxContext *ctx)
{
EllRelaxInternal *priv = ctx->priv;
int64_t start;
start = gettime();
- for (int idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) {
- const ptrdiff_t offset = idx1 * priv->stride;
- const double *diff_coeffs[ELL_RELAX_DIFF_COEFF_NB];
- for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = ctx->diff_coeffs[i] + offset;
-
- priv->residual_calc_line(ctx->domain_size[0], ctx->residual + offset,
- priv->stride, ctx->u + offset, ctx->rhs + offset,
- diff_coeffs, priv->fd_factors);
- }
+ tp_execute(ctx->tp, ctx->domain_size[1], residual_calc_task, ctx);
for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) {
const ptrdiff_t strides[2] = { 1, priv->stride };
@@ -329,19 +338,29 @@ static void boundaries_apply(EllRelaxContext *ctx)
ctx->count_boundaries++;
}
+static void residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+ EllRelaxContext *ctx = arg;
+ EllRelaxInternal *priv = ctx->priv;
+ ptrdiff_t offset = job_idx * priv->stride;
+
+ for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
+ ptrdiff_t idx = job_idx * ctx->u_stride + idx0;
+
+ ctx->u[idx] += priv->relax_factor * ctx->residual[idx];
+ }
+}
+
int mg2di_ell_relax_step(EllRelaxContext *ctx)
{
EllRelaxInternal *priv = ctx->priv;
- const double cfl_fac = priv->relax_factor * ctx->step[0] * ctx->step[1];
+ const double cfl_fac = priv->relax_factor;
int64_t start;
start = gettime();
- for (int idx1 = 0; idx1 < ctx->domain_size[1]; idx1++)
- for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) {
- ptrdiff_t idx = idx1 * priv->stride + idx0;
- ctx->u[idx] += cfl_fac * ctx->residual[idx];
- }
+ tp_execute(ctx->tp, ctx->domain_size[1], residual_add_task, ctx);
+
ctx->time_correct += gettime() - start;
ctx->count_correct++;
@@ -386,6 +405,7 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx)
priv->relax_factor = relax_factors[ctx->fd_stencil - 1];
else
priv->relax_factor = ctx->relax_factor;
+ priv->relax_factor *= ctx->step[0] * ctx->step[0];
priv->fd_factors[ELL_RELAX_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_00];
priv->fd_factors[ELL_RELAX_DIFF_COEFF_10] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_10] * ctx->step[0]);
@@ -394,6 +414,13 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx)
priv->fd_factors[ELL_RELAX_DIFF_COEFF_02] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_02] * SQR(ctx->step[1]));
priv->fd_factors[ELL_RELAX_DIFF_COEFF_11] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]);
+ if (!ctx->tp) {
+ ret = tp_init(&priv->tp_internal, 1);
+ if (ret < 0)
+ return ret;
+ ctx->tp = priv->tp_internal;
+ }
+
for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) {
if (ctx->boundaries[i].type == ELL_RELAX_BC_TYPE_FIXDIFF) {
for (int k = 0; k < ctx->domain_size[boundary_def[i].stride_idx]; k++)
@@ -517,6 +544,7 @@ void mg2di_ell_relax_free(EllRelaxContext **pctx)
for (int i = 0; i < ARRAY_ELEMS(ctx->priv->boundary_val_base); i++)
free(ctx->priv->boundary_val_base[i]);
+ tp_free(&ctx->priv->tp_internal);
free(ctx->priv);
free(ctx);
diff --git a/ell_relax.h b/ell_relax.h
index 43546c4..9bee372 100644
--- a/ell_relax.h
+++ b/ell_relax.h
@@ -38,6 +38,7 @@
#include <stddef.h>
#include <stdint.h>
+#include <threadpool.h>
#include "log.h"
@@ -155,6 +156,12 @@ typedef struct EllRelaxContext {
MG2DLogger logger;
/**
+ * The thread pool used for execution. May be set by the caller before
+ * mg2di_ell_relax_init().
+ */
+ TPContext *tp;
+
+ /**
* Flags indicating supported CPU features.
*/
int cpuflags;
diff --git a/libmg2d.v b/libmg2d.v
index 271d7eb..e4b7b24 100644
--- a/libmg2d.v
+++ b/libmg2d.v
@@ -1,4 +1,4 @@
-LIBMG2D_1 {
+LIBMG2D_2 {
global: mg2d_*;
local: *;
};
diff --git a/mg2d.c b/mg2d.c
index 4ad048c..8968306 100644
--- a/mg2d.c
+++ b/mg2d.c
@@ -22,6 +22,7 @@
#include <stdint.h>
#include <stdio.h>
#include <string.h>
+#include <threadpool.h>
#include "common.h"
#include "cpu.h"
@@ -53,6 +54,8 @@ typedef struct MG2DLevel {
struct MG2DInternal {
MG2DLogger logger;
+ TPContext *tp;
+
MG2DLevel *root;
int cpuflags;
@@ -139,38 +142,89 @@ static void mg_prolongate(EllRelaxContext *fine, double *dst, ptrdiff_t dst_stri
}
}
-static void mg_interp_bilinear(EllRelaxContext *ctx_dst, double *dst, ptrdiff_t dst_stride,
+typedef struct InterpParamsBilin {
+ size_t dst_domain_size[2];
+ ptrdiff_t src_stride;
+ ptrdiff_t dst_stride;
+ const double *src;
+ double *dst;
+ double dx_dst;
+ double dy_dst;
+ double dx_src;
+ double dy_src;
+ double scalex;
+ double scaley;
+} InterpParamsBilin;
+
+static int mg_interp_bilinear_kernel(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+ InterpParamsBilin *ip = arg;
+ unsigned int idx1_dst = job_idx;
+
+ double *dst = ip->dst;
+ const double *src = ip->src;
+ const double dx_dst = ip->dx_dst;
+ const double dy_dst = ip->dy_dst;
+ const double dx_src = ip->dx_src;
+ const double dy_src = ip->dy_src;
+ const double scalex = ip->scalex;
+ const double scaley = ip->scaley;
+ const ptrdiff_t src_stride = ip->src_stride;
+ const ptrdiff_t dst_stride = ip->dst_stride;
+
+ const double y = idx1_dst * dy_dst;
+ const size_t idx1_src = idx1_dst * scaley;
+ const double y0 = idx1_src * dy_src;
+ const double y1 = y0 + dy_src;
+ const double fact_y0 = (y1 - y) / dy_src;
+ const double fact_y1 = (y - y0) / dy_src;
+
+ for (size_t idx0_dst = 0; idx0_dst < ip->dst_domain_size[0]; idx0_dst++) {
+ const double x = idx0_dst * dx_dst;
+ const size_t idx0_src = idx0_dst * scalex;
+ const size_t idx_src = idx1_src * src_stride + idx0_src;
+ const double x0 = idx0_src * dx_src;
+ const double x1 = x0 + dx_src;
+ const double fact_x0 = (x1 - x) / dx_src;
+ const double fact_x1 = (x - x0) / dx_src;
+
+ const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) +
+ fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]);
+
+ dst[idx1_dst * dst_stride + idx0_dst] = val;
+ }
+
+ return 0;
+}
+
+static void mg_interp_bilinear(TPContext *tp,
+ EllRelaxContext *ctx_dst, double *dst, ptrdiff_t dst_stride,
EllRelaxContext *ctx_src, const double *src, ptrdiff_t src_stride)
{
- const double dx_dst = ctx_dst->step[0];
- const double dy_dst = ctx_dst->step[1];
- const double dx_src = ctx_src->step[0];
- const double dy_src = ctx_src->step[1];
- const double scalex = dx_dst / dx_src;
- const double scaley = dy_dst / dy_src;
-
- for (size_t idx1_dst = 0; idx1_dst < ctx_dst->domain_size[1]; idx1_dst++) {
- const double y = idx1_dst * dy_dst;
- const size_t idx1_src = idx1_dst * scaley;
- const double y0 = idx1_src * dy_src;
- const double y1 = y0 + dy_src;
- const double fact_y0 = (y1 - y) / dy_src;
- const double fact_y1 = (y - y0) / dy_src;
-
- for (size_t idx0_dst = 0; idx0_dst < ctx_dst->domain_size[0]; idx0_dst++) {
- const double x = idx0_dst * dx_dst;
- const size_t idx0_src = idx0_dst * scalex;
- const size_t idx_src = idx1_src * src_stride + idx0_src;
- const double x0 = idx0_src * dx_src;
- const double x1 = x0 + dx_src;
- const double fact_x0 = (x1 - x) / dx_src;
- const double fact_x1 = (x - x0) / dx_src;
-
- const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) +
- fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]);
-
- dst[idx1_dst * dst_stride + idx0_dst] = val;
- }
+ InterpParamsBilin ip;
+ ip.dst_domain_size[0] = ctx_dst->domain_size[0];
+ ip.dst_domain_size[1] = ctx_dst->domain_size[1];
+ ip.src_stride = src_stride;
+ ip.dst_stride = dst_stride;
+ ip.dx_dst = ctx_dst->step[0];
+ ip.dy_dst = ctx_dst->step[1];
+ ip.dx_src = ctx_src->step[0];
+ ip.dy_src = ctx_src->step[1];
+ ip.scalex = ip.dx_dst / ip.dx_src;
+ ip.scaley = ip.dy_dst / ip.dy_src;
+ ip.src = src;
+ ip.dst = dst;
+ tp_execute(tp, ctx_dst->domain_size[1], mg_interp_bilinear_kernel, &ip);
+}
+
+static void coarse_correct_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+ MG2DLevel *level = arg;
+
+ for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) {
+ const ptrdiff_t idx_dst = job_idx * level->solver->u_stride + idx0;
+ const ptrdiff_t idx_src = job_idx * level->prolong_tmp_stride + idx0;
+ level->solver->u[idx_dst] -= level->prolong_tmp[idx_src];
}
}
@@ -221,7 +275,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level)
/* restrict the residual as to the coarser-level rhs */
start = gettime();
if (!is_power2(level->solver->domain_size[0] - 1)) {
- mg_interp_bilinear(level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride,
+ mg_interp_bilinear(ctx->priv->tp, level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride,
level->solver, level->solver->residual, level->solver->residual_stride);
} else {
mg_restrict_fw(level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride,
@@ -237,7 +291,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level)
/* prolongate the coarser-level correction */
start = gettime();
if (!is_power2(level->solver->domain_size[0] - 1)) {
- mg_interp_bilinear(level->solver, level->prolong_tmp, level->prolong_tmp_stride,
+ mg_interp_bilinear(ctx->priv->tp, level->solver, level->prolong_tmp, level->prolong_tmp_stride,
level->child->solver, level->child->solver->u, level->child->solver->u_stride);
} else {
mg_prolongate(level->solver, level->prolong_tmp, level->prolong_tmp_stride,
@@ -247,12 +301,9 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level)
/* apply the correction */
start = gettime();
- for (size_t idx1 = 0; idx1 < level->solver->domain_size[1]; idx1++)
- for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) {
- const ptrdiff_t idx_dst = idx1 * level->solver->u_stride + idx0;
- const ptrdiff_t idx_src = idx1 * level->prolong_tmp_stride + idx0;
- level->solver->u[idx_dst] -= level->prolong_tmp[idx_src];
- }
+
+ tp_execute(ctx->priv->tp, level->solver->domain_size[1], coarse_correct_task, level);
+
level->time_correct += gettime() - start;
/* re-init the current-level solver (re-calc the residual) */
@@ -315,7 +366,7 @@ static int mg_levels_init(MG2DContext *ctx)
if (prev) {
for (int i = 0; i < ARRAY_ELEMS(prev->solver->diff_coeffs); i++) {
if (!is_power2(prev->solver->domain_size[0] - 1)) {
- mg_interp_bilinear(cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride,
+ mg_interp_bilinear(priv->tp, cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride,
prev->solver, prev->solver->diff_coeffs[i], prev->solver->diff_coeffs_stride);
} else {
mg_restrict_inject(cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride,
@@ -357,6 +408,24 @@ static int mg_levels_init(MG2DContext *ctx)
return 0;
}
+static int threadpool_init(MG2DContext *ctx)
+{
+ MG2DInternal *priv = ctx->priv;
+ MG2DLevel *level = priv->root;
+ int ret;
+
+ ret = tp_init(&priv->tp, ctx->nb_threads);
+ if (ret < 0)
+ return ret;
+
+ while (level) {
+ level->solver->tp = priv->tp;
+ level = level->child;
+ }
+
+ return 0;
+}
+
int mg2d_solve(MG2DContext *ctx)
{
MG2DInternal *priv = ctx->priv;
@@ -364,6 +433,12 @@ int mg2d_solve(MG2DContext *ctx)
double res_prev, res_cur;
int ret;
+ if (!priv->tp) {
+ ret = threadpool_init(ctx);
+ if (ret < 0)
+ return ret;
+ }
+
time_start = gettime();
ret = mg_levels_init(ctx);
@@ -530,6 +605,7 @@ MG2DContext *mg2d_solver_alloc(size_t domain_size)
ctx->nb_relax_pre = 1;
ctx->nb_relax_post = 1;
ctx->log_callback = log_default_callback;
+ ctx->nb_threads = 1;
ctx->u = priv->root->solver->u;
ctx->u_stride = priv->root->solver->u_stride;
@@ -567,6 +643,7 @@ void mg2d_solver_free(MG2DContext **pctx)
free(ctx->boundaries[i]);
}
+ tp_free(&ctx->priv->tp);
free(ctx->priv);
free(ctx);
diff --git a/mg2d.h b/mg2d.h
index 2682918..c487215 100644
--- a/mg2d.h
+++ b/mg2d.h
@@ -260,6 +260,13 @@ typedef struct MG2DContext {
* Distance between neighbouring gridpoints along coord1.
*/
ptrdiff_t diff_coeffs_stride;
+
+ /**
+ * Number of threads to use for processing.
+ *
+ * Set to 0 to autodetect. Defaults to 1.
+ */
+ unsigned int nb_threads;
} MG2DContext;
/**