From 70186149b931a49f1eb599d2f207fdc3d3048388 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Thu, 27 Dec 2018 16:26:29 +0100 Subject: Add threading support through libthreadpool. --- mg2d.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 116 insertions(+), 39 deletions(-) (limited to 'mg2d.c') diff --git a/mg2d.c b/mg2d.c index 4ad048c..8968306 100644 --- a/mg2d.c +++ b/mg2d.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "common.h" #include "cpu.h" @@ -53,6 +54,8 @@ typedef struct MG2DLevel { struct MG2DInternal { MG2DLogger logger; + TPContext *tp; + MG2DLevel *root; int cpuflags; @@ -139,38 +142,89 @@ static void mg_prolongate(EllRelaxContext *fine, double *dst, ptrdiff_t dst_stri } } -static void mg_interp_bilinear(EllRelaxContext *ctx_dst, double *dst, ptrdiff_t dst_stride, +typedef struct InterpParamsBilin { + size_t dst_domain_size[2]; + ptrdiff_t src_stride; + ptrdiff_t dst_stride; + const double *src; + double *dst; + double dx_dst; + double dy_dst; + double dx_src; + double dy_src; + double scalex; + double scaley; +} InterpParamsBilin; + +static int mg_interp_bilinear_kernel(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + InterpParamsBilin *ip = arg; + unsigned int idx1_dst = job_idx; + + double *dst = ip->dst; + const double *src = ip->src; + const double dx_dst = ip->dx_dst; + const double dy_dst = ip->dy_dst; + const double dx_src = ip->dx_src; + const double dy_src = ip->dy_src; + const double scalex = ip->scalex; + const double scaley = ip->scaley; + const ptrdiff_t src_stride = ip->src_stride; + const ptrdiff_t dst_stride = ip->dst_stride; + + const double y = idx1_dst * dy_dst; + const size_t idx1_src = idx1_dst * scaley; + const double y0 = idx1_src * dy_src; + const double y1 = y0 + dy_src; + const double fact_y0 = (y1 - y) / dy_src; + const double fact_y1 = (y - y0) / dy_src; + + for (size_t idx0_dst = 0; idx0_dst < ip->dst_domain_size[0]; idx0_dst++) { + const double x = idx0_dst * dx_dst; + const size_t idx0_src = idx0_dst * scalex; + const size_t idx_src = idx1_src * src_stride + idx0_src; + const double x0 = idx0_src * dx_src; + const double x1 = x0 + dx_src; + const double fact_x0 = (x1 - x) / dx_src; + const double fact_x1 = (x - x0) / dx_src; + + const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) + + fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]); + + dst[idx1_dst * dst_stride + idx0_dst] = val; + } + + return 0; +} + +static void mg_interp_bilinear(TPContext *tp, + EllRelaxContext *ctx_dst, double *dst, ptrdiff_t dst_stride, EllRelaxContext *ctx_src, const double *src, ptrdiff_t src_stride) { - const double dx_dst = ctx_dst->step[0]; - const double dy_dst = ctx_dst->step[1]; - const double dx_src = ctx_src->step[0]; - const double dy_src = ctx_src->step[1]; - const double scalex = dx_dst / dx_src; - const double scaley = dy_dst / dy_src; - - for (size_t idx1_dst = 0; idx1_dst < ctx_dst->domain_size[1]; idx1_dst++) { - const double y = idx1_dst * dy_dst; - const size_t idx1_src = idx1_dst * scaley; - const double y0 = idx1_src * dy_src; - const double y1 = y0 + dy_src; - const double fact_y0 = (y1 - y) / dy_src; - const double fact_y1 = (y - y0) / dy_src; - - for (size_t idx0_dst = 0; idx0_dst < ctx_dst->domain_size[0]; idx0_dst++) { - const double x = idx0_dst * dx_dst; - const size_t idx0_src = idx0_dst * scalex; - const size_t idx_src = idx1_src * src_stride + idx0_src; - const double x0 = idx0_src * dx_src; - const double x1 = x0 + dx_src; - const double fact_x0 = (x1 - x) / dx_src; - const double fact_x1 = (x - x0) / dx_src; - - const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) + - fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]); - - dst[idx1_dst * dst_stride + idx0_dst] = val; - } + InterpParamsBilin ip; + ip.dst_domain_size[0] = ctx_dst->domain_size[0]; + ip.dst_domain_size[1] = ctx_dst->domain_size[1]; + ip.src_stride = src_stride; + ip.dst_stride = dst_stride; + ip.dx_dst = ctx_dst->step[0]; + ip.dy_dst = ctx_dst->step[1]; + ip.dx_src = ctx_src->step[0]; + ip.dy_src = ctx_src->step[1]; + ip.scalex = ip.dx_dst / ip.dx_src; + ip.scaley = ip.dy_dst / ip.dy_src; + ip.src = src; + ip.dst = dst; + tp_execute(tp, ctx_dst->domain_size[1], mg_interp_bilinear_kernel, &ip); +} + +static void coarse_correct_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + MG2DLevel *level = arg; + + for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) { + const ptrdiff_t idx_dst = job_idx * level->solver->u_stride + idx0; + const ptrdiff_t idx_src = job_idx * level->prolong_tmp_stride + idx0; + level->solver->u[idx_dst] -= level->prolong_tmp[idx_src]; } } @@ -221,7 +275,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* restrict the residual as to the coarser-level rhs */ start = gettime(); if (!is_power2(level->solver->domain_size[0] - 1)) { - mg_interp_bilinear(level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride, + mg_interp_bilinear(ctx->priv->tp, level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride, level->solver, level->solver->residual, level->solver->residual_stride); } else { mg_restrict_fw(level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride, @@ -237,7 +291,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* prolongate the coarser-level correction */ start = gettime(); if (!is_power2(level->solver->domain_size[0] - 1)) { - mg_interp_bilinear(level->solver, level->prolong_tmp, level->prolong_tmp_stride, + mg_interp_bilinear(ctx->priv->tp, level->solver, level->prolong_tmp, level->prolong_tmp_stride, level->child->solver, level->child->solver->u, level->child->solver->u_stride); } else { mg_prolongate(level->solver, level->prolong_tmp, level->prolong_tmp_stride, @@ -247,12 +301,9 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* apply the correction */ start = gettime(); - for (size_t idx1 = 0; idx1 < level->solver->domain_size[1]; idx1++) - for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) { - const ptrdiff_t idx_dst = idx1 * level->solver->u_stride + idx0; - const ptrdiff_t idx_src = idx1 * level->prolong_tmp_stride + idx0; - level->solver->u[idx_dst] -= level->prolong_tmp[idx_src]; - } + + tp_execute(ctx->priv->tp, level->solver->domain_size[1], coarse_correct_task, level); + level->time_correct += gettime() - start; /* re-init the current-level solver (re-calc the residual) */ @@ -315,7 +366,7 @@ static int mg_levels_init(MG2DContext *ctx) if (prev) { for (int i = 0; i < ARRAY_ELEMS(prev->solver->diff_coeffs); i++) { if (!is_power2(prev->solver->domain_size[0] - 1)) { - mg_interp_bilinear(cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride, + mg_interp_bilinear(priv->tp, cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride, prev->solver, prev->solver->diff_coeffs[i], prev->solver->diff_coeffs_stride); } else { mg_restrict_inject(cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride, @@ -357,6 +408,24 @@ static int mg_levels_init(MG2DContext *ctx) return 0; } +static int threadpool_init(MG2DContext *ctx) +{ + MG2DInternal *priv = ctx->priv; + MG2DLevel *level = priv->root; + int ret; + + ret = tp_init(&priv->tp, ctx->nb_threads); + if (ret < 0) + return ret; + + while (level) { + level->solver->tp = priv->tp; + level = level->child; + } + + return 0; +} + int mg2d_solve(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; @@ -364,6 +433,12 @@ int mg2d_solve(MG2DContext *ctx) double res_prev, res_cur; int ret; + if (!priv->tp) { + ret = threadpool_init(ctx); + if (ret < 0) + return ret; + } + time_start = gettime(); ret = mg_levels_init(ctx); @@ -530,6 +605,7 @@ MG2DContext *mg2d_solver_alloc(size_t domain_size) ctx->nb_relax_pre = 1; ctx->nb_relax_post = 1; ctx->log_callback = log_default_callback; + ctx->nb_threads = 1; ctx->u = priv->root->solver->u; ctx->u_stride = priv->root->solver->u_stride; @@ -567,6 +643,7 @@ void mg2d_solver_free(MG2DContext **pctx) free(ctx->boundaries[i]); } + tp_free(&ctx->priv->tp); free(ctx->priv); free(ctx); -- cgit v1.2.3