diff options
-rw-r--r-- | ell_relax.c | 60 | ||||
-rw-r--r-- | ell_relax.h | 7 | ||||
-rw-r--r-- | libmg2d.v | 2 | ||||
-rw-r--r-- | mg2d.c | 155 | ||||
-rw-r--r-- | mg2d.h | 7 |
5 files changed, 175 insertions, 56 deletions
diff --git a/ell_relax.c b/ell_relax.c index a3f8968..042c4a5 100644 --- a/ell_relax.c +++ b/ell_relax.c @@ -22,6 +22,7 @@ #include <stdint.h> #include <stdlib.h> #include <string.h> +#include <threadpool.h> #include "common.h" #include "cpu.h" @@ -48,6 +49,8 @@ struct EllRelaxInternal { const double *fd_factors); double fd_factors[ELL_RELAX_DIFF_COEFF_NB]; double relax_factor; + + TPContext *tp_internal; }; static const struct { @@ -198,23 +201,29 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, } } +static void residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + EllRelaxContext *ctx = arg; + EllRelaxInternal *priv = ctx->priv; + const ptrdiff_t offset = job_idx * priv->stride; + const double *diff_coeffs[ELL_RELAX_DIFF_COEFF_NB]; + + for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) + diff_coeffs[i] = ctx->diff_coeffs[i] + offset; + + priv->residual_calc_line(ctx->domain_size[0], ctx->residual + offset, + priv->stride, ctx->u + offset, ctx->rhs + offset, + diff_coeffs, priv->fd_factors); +} + static void residual_calc(EllRelaxContext *ctx) { EllRelaxInternal *priv = ctx->priv; int64_t start; start = gettime(); - for (int idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) { - const ptrdiff_t offset = idx1 * priv->stride; - const double *diff_coeffs[ELL_RELAX_DIFF_COEFF_NB]; - for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = ctx->diff_coeffs[i] + offset; - - priv->residual_calc_line(ctx->domain_size[0], ctx->residual + offset, - priv->stride, ctx->u + offset, ctx->rhs + offset, - diff_coeffs, priv->fd_factors); - } + tp_execute(ctx->tp, ctx->domain_size[1], residual_calc_task, ctx); for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) { const ptrdiff_t strides[2] = { 1, priv->stride }; @@ -329,19 +338,29 @@ static void boundaries_apply(EllRelaxContext *ctx) ctx->count_boundaries++; } +static void residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + EllRelaxContext *ctx = arg; + EllRelaxInternal *priv = ctx->priv; + ptrdiff_t offset = job_idx * priv->stride; + + for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { + ptrdiff_t idx = job_idx * ctx->u_stride + idx0; + + ctx->u[idx] += priv->relax_factor * ctx->residual[idx]; + } +} + int mg2di_ell_relax_step(EllRelaxContext *ctx) { EllRelaxInternal *priv = ctx->priv; - const double cfl_fac = priv->relax_factor * ctx->step[0] * ctx->step[1]; + const double cfl_fac = priv->relax_factor; int64_t start; start = gettime(); - for (int idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) - for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { - ptrdiff_t idx = idx1 * priv->stride + idx0; - ctx->u[idx] += cfl_fac * ctx->residual[idx]; - } + tp_execute(ctx->tp, ctx->domain_size[1], residual_add_task, ctx); + ctx->time_correct += gettime() - start; ctx->count_correct++; @@ -386,6 +405,7 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx) priv->relax_factor = relax_factors[ctx->fd_stencil - 1]; else priv->relax_factor = ctx->relax_factor; + priv->relax_factor *= ctx->step[0] * ctx->step[0]; priv->fd_factors[ELL_RELAX_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_00]; priv->fd_factors[ELL_RELAX_DIFF_COEFF_10] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_10] * ctx->step[0]); @@ -394,6 +414,13 @@ int mg2di_ell_relax_init(EllRelaxContext *ctx) priv->fd_factors[ELL_RELAX_DIFF_COEFF_02] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_02] * SQR(ctx->step[1])); priv->fd_factors[ELL_RELAX_DIFF_COEFF_11] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][ELL_RELAX_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]); + if (!ctx->tp) { + ret = tp_init(&priv->tp_internal, 1); + if (ret < 0) + return ret; + ctx->tp = priv->tp_internal; + } + for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) { if (ctx->boundaries[i].type == ELL_RELAX_BC_TYPE_FIXDIFF) { for (int k = 0; k < ctx->domain_size[boundary_def[i].stride_idx]; k++) @@ -517,6 +544,7 @@ void mg2di_ell_relax_free(EllRelaxContext **pctx) for (int i = 0; i < ARRAY_ELEMS(ctx->priv->boundary_val_base); i++) free(ctx->priv->boundary_val_base[i]); + tp_free(&ctx->priv->tp_internal); free(ctx->priv); free(ctx); diff --git a/ell_relax.h b/ell_relax.h index 43546c4..9bee372 100644 --- a/ell_relax.h +++ b/ell_relax.h @@ -38,6 +38,7 @@ #include <stddef.h> #include <stdint.h> +#include <threadpool.h> #include "log.h" @@ -155,6 +156,12 @@ typedef struct EllRelaxContext { MG2DLogger logger; /** + * The thread pool used for execution. May be set by the caller before + * mg2di_ell_relax_init(). + */ + TPContext *tp; + + /** * Flags indicating supported CPU features. */ int cpuflags; @@ -1,4 +1,4 @@ -LIBMG2D_1 { +LIBMG2D_2 { global: mg2d_*; local: *; }; @@ -22,6 +22,7 @@ #include <stdint.h> #include <stdio.h> #include <string.h> +#include <threadpool.h> #include "common.h" #include "cpu.h" @@ -53,6 +54,8 @@ typedef struct MG2DLevel { struct MG2DInternal { MG2DLogger logger; + TPContext *tp; + MG2DLevel *root; int cpuflags; @@ -139,38 +142,89 @@ static void mg_prolongate(EllRelaxContext *fine, double *dst, ptrdiff_t dst_stri } } -static void mg_interp_bilinear(EllRelaxContext *ctx_dst, double *dst, ptrdiff_t dst_stride, +typedef struct InterpParamsBilin { + size_t dst_domain_size[2]; + ptrdiff_t src_stride; + ptrdiff_t dst_stride; + const double *src; + double *dst; + double dx_dst; + double dy_dst; + double dx_src; + double dy_src; + double scalex; + double scaley; +} InterpParamsBilin; + +static int mg_interp_bilinear_kernel(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + InterpParamsBilin *ip = arg; + unsigned int idx1_dst = job_idx; + + double *dst = ip->dst; + const double *src = ip->src; + const double dx_dst = ip->dx_dst; + const double dy_dst = ip->dy_dst; + const double dx_src = ip->dx_src; + const double dy_src = ip->dy_src; + const double scalex = ip->scalex; + const double scaley = ip->scaley; + const ptrdiff_t src_stride = ip->src_stride; + const ptrdiff_t dst_stride = ip->dst_stride; + + const double y = idx1_dst * dy_dst; + const size_t idx1_src = idx1_dst * scaley; + const double y0 = idx1_src * dy_src; + const double y1 = y0 + dy_src; + const double fact_y0 = (y1 - y) / dy_src; + const double fact_y1 = (y - y0) / dy_src; + + for (size_t idx0_dst = 0; idx0_dst < ip->dst_domain_size[0]; idx0_dst++) { + const double x = idx0_dst * dx_dst; + const size_t idx0_src = idx0_dst * scalex; + const size_t idx_src = idx1_src * src_stride + idx0_src; + const double x0 = idx0_src * dx_src; + const double x1 = x0 + dx_src; + const double fact_x0 = (x1 - x) / dx_src; + const double fact_x1 = (x - x0) / dx_src; + + const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) + + fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]); + + dst[idx1_dst * dst_stride + idx0_dst] = val; + } + + return 0; +} + +static void mg_interp_bilinear(TPContext *tp, + EllRelaxContext *ctx_dst, double *dst, ptrdiff_t dst_stride, EllRelaxContext *ctx_src, const double *src, ptrdiff_t src_stride) { - const double dx_dst = ctx_dst->step[0]; - const double dy_dst = ctx_dst->step[1]; - const double dx_src = ctx_src->step[0]; - const double dy_src = ctx_src->step[1]; - const double scalex = dx_dst / dx_src; - const double scaley = dy_dst / dy_src; - - for (size_t idx1_dst = 0; idx1_dst < ctx_dst->domain_size[1]; idx1_dst++) { - const double y = idx1_dst * dy_dst; - const size_t idx1_src = idx1_dst * scaley; - const double y0 = idx1_src * dy_src; - const double y1 = y0 + dy_src; - const double fact_y0 = (y1 - y) / dy_src; - const double fact_y1 = (y - y0) / dy_src; - - for (size_t idx0_dst = 0; idx0_dst < ctx_dst->domain_size[0]; idx0_dst++) { - const double x = idx0_dst * dx_dst; - const size_t idx0_src = idx0_dst * scalex; - const size_t idx_src = idx1_src * src_stride + idx0_src; - const double x0 = idx0_src * dx_src; - const double x1 = x0 + dx_src; - const double fact_x0 = (x1 - x) / dx_src; - const double fact_x1 = (x - x0) / dx_src; - - const double val = fact_x0 * (fact_y0 * src[idx_src] + fact_y1 * src[idx_src + src_stride]) + - fact_x1 * (fact_y0 * src[idx_src + 1] + fact_y1 * src[idx_src + 1 + src_stride]); - - dst[idx1_dst * dst_stride + idx0_dst] = val; - } + InterpParamsBilin ip; + ip.dst_domain_size[0] = ctx_dst->domain_size[0]; + ip.dst_domain_size[1] = ctx_dst->domain_size[1]; + ip.src_stride = src_stride; + ip.dst_stride = dst_stride; + ip.dx_dst = ctx_dst->step[0]; + ip.dy_dst = ctx_dst->step[1]; + ip.dx_src = ctx_src->step[0]; + ip.dy_src = ctx_src->step[1]; + ip.scalex = ip.dx_dst / ip.dx_src; + ip.scaley = ip.dy_dst / ip.dy_src; + ip.src = src; + ip.dst = dst; + tp_execute(tp, ctx_dst->domain_size[1], mg_interp_bilinear_kernel, &ip); +} + +static void coarse_correct_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + MG2DLevel *level = arg; + + for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) { + const ptrdiff_t idx_dst = job_idx * level->solver->u_stride + idx0; + const ptrdiff_t idx_src = job_idx * level->prolong_tmp_stride + idx0; + level->solver->u[idx_dst] -= level->prolong_tmp[idx_src]; } } @@ -221,7 +275,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* restrict the residual as to the coarser-level rhs */ start = gettime(); if (!is_power2(level->solver->domain_size[0] - 1)) { - mg_interp_bilinear(level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride, + mg_interp_bilinear(ctx->priv->tp, level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride, level->solver, level->solver->residual, level->solver->residual_stride); } else { mg_restrict_fw(level->child->solver, level->child->solver->rhs, level->child->solver->rhs_stride, @@ -237,7 +291,7 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* prolongate the coarser-level correction */ start = gettime(); if (!is_power2(level->solver->domain_size[0] - 1)) { - mg_interp_bilinear(level->solver, level->prolong_tmp, level->prolong_tmp_stride, + mg_interp_bilinear(ctx->priv->tp, level->solver, level->prolong_tmp, level->prolong_tmp_stride, level->child->solver, level->child->solver->u, level->child->solver->u_stride); } else { mg_prolongate(level->solver, level->prolong_tmp, level->prolong_tmp_stride, @@ -247,12 +301,9 @@ static int mg_solve_subgrid(MG2DContext *ctx, MG2DLevel *level) /* apply the correction */ start = gettime(); - for (size_t idx1 = 0; idx1 < level->solver->domain_size[1]; idx1++) - for (size_t idx0 = 0; idx0 < level->solver->domain_size[0]; idx0++) { - const ptrdiff_t idx_dst = idx1 * level->solver->u_stride + idx0; - const ptrdiff_t idx_src = idx1 * level->prolong_tmp_stride + idx0; - level->solver->u[idx_dst] -= level->prolong_tmp[idx_src]; - } + + tp_execute(ctx->priv->tp, level->solver->domain_size[1], coarse_correct_task, level); + level->time_correct += gettime() - start; /* re-init the current-level solver (re-calc the residual) */ @@ -315,7 +366,7 @@ static int mg_levels_init(MG2DContext *ctx) if (prev) { for (int i = 0; i < ARRAY_ELEMS(prev->solver->diff_coeffs); i++) { if (!is_power2(prev->solver->domain_size[0] - 1)) { - mg_interp_bilinear(cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride, + mg_interp_bilinear(priv->tp, cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride, prev->solver, prev->solver->diff_coeffs[i], prev->solver->diff_coeffs_stride); } else { mg_restrict_inject(cur->solver, cur->solver->diff_coeffs[i], cur->solver->diff_coeffs_stride, @@ -357,6 +408,24 @@ static int mg_levels_init(MG2DContext *ctx) return 0; } +static int threadpool_init(MG2DContext *ctx) +{ + MG2DInternal *priv = ctx->priv; + MG2DLevel *level = priv->root; + int ret; + + ret = tp_init(&priv->tp, ctx->nb_threads); + if (ret < 0) + return ret; + + while (level) { + level->solver->tp = priv->tp; + level = level->child; + } + + return 0; +} + int mg2d_solve(MG2DContext *ctx) { MG2DInternal *priv = ctx->priv; @@ -364,6 +433,12 @@ int mg2d_solve(MG2DContext *ctx) double res_prev, res_cur; int ret; + if (!priv->tp) { + ret = threadpool_init(ctx); + if (ret < 0) + return ret; + } + time_start = gettime(); ret = mg_levels_init(ctx); @@ -530,6 +605,7 @@ MG2DContext *mg2d_solver_alloc(size_t domain_size) ctx->nb_relax_pre = 1; ctx->nb_relax_post = 1; ctx->log_callback = log_default_callback; + ctx->nb_threads = 1; ctx->u = priv->root->solver->u; ctx->u_stride = priv->root->solver->u_stride; @@ -567,6 +643,7 @@ void mg2d_solver_free(MG2DContext **pctx) free(ctx->boundaries[i]); } + tp_free(&ctx->priv->tp); free(ctx->priv); free(ctx); @@ -260,6 +260,13 @@ typedef struct MG2DContext { * Distance between neighbouring gridpoints along coord1. */ ptrdiff_t diff_coeffs_stride; + + /** + * Number of threads to use for processing. + * + * Set to 0 to autodetect. Defaults to 1. + */ + unsigned int nb_threads; } MG2DContext; /** |