From 7c18c16aeab0476dcc0a7e1d10a3f8ca0db753ef Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 25 Mar 2019 19:33:12 +0100 Subject: egs_exact: parallelize matrix construction --- ell_grid_solve.c | 298 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 165 insertions(+), 133 deletions(-) diff --git a/ell_grid_solve.c b/ell_grid_solve.c index 12863d0..184c9e3 100644 --- a/ell_grid_solve.c +++ b/ell_grid_solve.c @@ -56,9 +56,11 @@ typedef struct EGSExactInternal { double *mat_f; double *rhs; double *x; - double *scratch_line; int *ipiv; + double *scratch_lines; + ptrdiff_t scratch_lines_allocated; + BiCGStabContext *bicgstab; } EGSExactInternal; @@ -375,171 +377,187 @@ static void fill_mat_s2(double *mat_row, NDArray **diff_coeffs, double *fd_facto mat_row[row_offset - 2 - 2 * row_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; } -static int solve_exact(EGSContext *ctx) +static void mat_fill_row(EGSContext *ctx, double *scratch_line, + ptrdiff_t idx1, ptrdiff_t idx0) { EGSInternal *priv = ctx->priv; - EGSExactContext *ec = ctx->solver_data; EGSExactInternal *e = &priv->e; - int64_t start; - int ret; - start = gettime(); + const ptrdiff_t idx_src = idx1 * priv->stride + idx0; + const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0; - memset(e->mat, 0, SQR(e->N) * sizeof(*e->mat)); + int is_bnd[4], boundary; + ptrdiff_t row_stride; + ptrdiff_t row_offset; - for (ptrdiff_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) { - int is_bnd[4], boundary; + double *mat_row; - is_bnd[MG2D_BOUNDARY_1L] = idx1 < ctx->fd_stencil; - is_bnd[MG2D_BOUNDARY_1U] = idx1 >= ctx->domain_size[1] - ctx->fd_stencil; + is_bnd[MG2D_BOUNDARY_0L] = idx0 < ctx->fd_stencil; + is_bnd[MG2D_BOUNDARY_0U] = idx0 >= ctx->domain_size[0] - ctx->fd_stencil; + is_bnd[MG2D_BOUNDARY_1L] = idx1 < ctx->fd_stencil; + is_bnd[MG2D_BOUNDARY_1U] = idx1 >= ctx->domain_size[1] - ctx->fd_stencil; - for (ptrdiff_t idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { - const ptrdiff_t idx_src = idx1 * priv->stride + idx0; - const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0; + boundary = is_bnd[0] || is_bnd[1] || is_bnd[2] || is_bnd[3]; - ptrdiff_t row_stride; - ptrdiff_t row_offset; + if (boundary) { + memset(scratch_line, 0, e->N_ghosts * sizeof(*scratch_line)); + row_stride = ctx->domain_size[0] + 2 * ctx->fd_stencil; + mat_row = scratch_line + row_stride * ctx->fd_stencil + ctx->fd_stencil; + } else { + mat_row = e->mat + e->N * mat_row_idx; + row_stride = ctx->domain_size[0]; + } - double *mat_row; + row_offset = idx1 * row_stride + idx0; - is_bnd[MG2D_BOUNDARY_0L] = idx0 < ctx->fd_stencil; - is_bnd[MG2D_BOUNDARY_0U] = idx0 >= ctx->domain_size[0] - ctx->fd_stencil; + e->fill_mat(mat_row, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src, row_stride, row_offset); - boundary = is_bnd[0] || is_bnd[1] || is_bnd[2] || is_bnd[3]; + e->rhs[mat_row_idx] = ctx->rhs->data[idx_src]; - if (boundary) { - memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line)); - row_stride = ctx->domain_size[0] + 2 * ctx->fd_stencil; - mat_row = e->scratch_line + row_stride * ctx->fd_stencil + ctx->fd_stencil; - } else { - mat_row = e->mat + e->N * mat_row_idx; - row_stride = ctx->domain_size[0]; - } + if (boundary) { + double *mat_dst = e->mat + e->N * mat_row_idx; - row_offset = idx1 * row_stride + idx0; + const MG2DBoundary *bnd_fixval = NULL; - e->fill_mat(mat_row, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src, row_stride, row_offset); + for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { + const int ci = mg2d_bnd_coord_idx(bnd_loc); + const ptrdiff_t idx[] = { idx0, idx1 }; - e->rhs[mat_row_idx] = ctx->rhs->data[idx_src]; + if (!(idx[ci] == (ctx->domain_size[ci] - 1) * mg2d_bnd_is_upper(bnd_loc) && + ctx->boundaries[bnd_loc]->type == MG2D_BC_TYPE_FIXVAL)) + continue; + + bnd_fixval = ctx->boundaries[bnd_loc]; - if (boundary) { - double *mat_dst = e->mat + e->N * mat_row_idx; + mat_dst[mat_row_idx] = 1.0; + e->rhs[mat_row_idx] = bnd_fixval->val[idx[!ci]]; + memset(scratch_line, 0, e->N_ghosts * sizeof(*scratch_line)); + break; + } - const MG2DBoundary *bnd_fixval = NULL; + if (!bnd_fixval) { + /* apply the boundary conditions to eliminate the ghostpoint values */ + // reflect + for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { + MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; - for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { - const int ci = mg2d_bnd_coord_idx(bnd_loc); - const ptrdiff_t idx[] = { idx0, idx1 }; + const int ci = mg2d_bnd_coord_idx(bnd_loc); + const int dir = mg2d_bnd_out_dir(bnd_loc); - if (!(idx[ci] == (ctx->domain_size[ci] - 1) * mg2d_bnd_is_upper(bnd_loc) && - ctx->boundaries[bnd_loc]->type == MG2D_BC_TYPE_FIXVAL)) - continue; + const ptrdiff_t dst_stride[2] = { 1, row_stride }; + const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; + const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; - bnd_fixval = ctx->boundaries[bnd_loc]; + double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); - mat_dst[mat_row_idx] = 1.0; - e->rhs[mat_row_idx] = bnd_fixval->val[idx[!ci]]; - memset(e->scratch_line, 0, e->N_ghosts * sizeof(*e->scratch_line)); - break; - } + if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_REFLECT) + continue; - if (!bnd_fixval) { - /* apply the boundary conditions to eliminate the ghostpoint values */ - // reflect - for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { - MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; + for (ptrdiff_t bnd_layer = 1; bnd_layer <= ctx->fd_stencil; bnd_layer++) + for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil); bnd_idx++) { + dst[-dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] += dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; + dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; + } + } - const int ci = mg2d_bnd_coord_idx(bnd_loc); - const int dir = mg2d_bnd_out_dir(bnd_loc); + // fixval + for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { + MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; - const ptrdiff_t dst_stride[2] = { 1, row_stride }; - const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; - const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; + const int ci = mg2d_bnd_coord_idx(bnd_loc); + const int dir = mg2d_bnd_out_dir(bnd_loc); - double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); + const ptrdiff_t dst_stride[2] = { 1, row_stride }; + const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; + const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; - if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_REFLECT) - continue; + double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); - for (ptrdiff_t bnd_layer = 1; bnd_layer <= ctx->fd_stencil; bnd_layer++) - for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil); bnd_idx++) { - dst[-dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] += dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; - dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; - } + if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_FIXVAL) + continue; + + for (ptrdiff_t bnd_layer = 1; bnd_layer < ctx->fd_stencil; bnd_layer++) + for (ptrdiff_t bnd_idx = -((ptrdiff_t)ctx->fd_stencil - 1); bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil - 1); bnd_idx++) { + e->rhs[mat_row_idx] -= bnd->val[bnd_layer * bnd->val_stride + bnd_idx] * dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; + dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; } + } - // fixval - for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { - MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; + // falloff + for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { + MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; - const int ci = mg2d_bnd_coord_idx(bnd_loc); - const int dir = mg2d_bnd_out_dir(bnd_loc); + const int ci = mg2d_bnd_coord_idx(bnd_loc); + const int dir = mg2d_bnd_out_dir(bnd_loc); - const ptrdiff_t dst_stride[2] = { 1, row_stride }; - const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; - const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; + const ptrdiff_t dst_stride[2] = { 1, row_stride }; + const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; + const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; - double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); + double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); - if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_FIXVAL) - continue; + if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_FALLOFF) + continue; - for (ptrdiff_t bnd_layer = 1; bnd_layer < ctx->fd_stencil; bnd_layer++) - for (ptrdiff_t bnd_idx = -((ptrdiff_t)ctx->fd_stencil - 1); bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil - 1); bnd_idx++) { - e->rhs[mat_row_idx] -= bnd->val[bnd_layer * bnd->val_stride + bnd_idx] * dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; - dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; - } + for (ptrdiff_t bnd_layer = ctx->fd_stencil; bnd_layer > 0; bnd_layer--) + for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil); bnd_idx++) { + const double x = ctx->step[0] * (ctx->domain_size[0] - 1 + bnd_layer - ctx->fd_stencil); + const double y = ctx->step[0] * bnd_idx; + const double r = sqrt(SQR(x) + SQR(y)); + double val = dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; + + for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++) { + dst[dir * (bnd_layer - i) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= + val * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]; + } + dst[dir * (bnd_layer - (ptrdiff_t)ctx->fd_stencil) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= val * (x / SQR(r)) * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]); + + dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; } + } - // falloff - for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { - MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; + /* copy the interior values */ + for (ptrdiff_t idx1_col = 0; idx1_col < ctx->domain_size[1]; idx1_col++) + for (ptrdiff_t idx0_col = 0; idx0_col < ctx->domain_size[0]; idx0_col++) { + mat_dst[idx1_col * ctx->domain_size[0] + idx0_col] = mat_row[idx1_col * row_stride + idx0_col]; + mat_row[idx1_col * row_stride + idx0_col] = 0.0; + } - const int ci = mg2d_bnd_coord_idx(bnd_loc); - const int dir = mg2d_bnd_out_dir(bnd_loc); + } - const ptrdiff_t dst_stride[2] = { 1, row_stride }; - const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; - const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; + /* make sure all the values from the scratch line have been accounted for */ + for (ptrdiff_t idx_scratch = 0; idx_scratch < e->N_ghosts; idx_scratch++) + if (scratch_line[idx_scratch] != 0.0) + abort(); + } +} - double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); +static int mat_fill_row_task(void *arg, unsigned int job_idx, unsigned int thread_idx) +{ + EGSContext *ctx = arg; + EGSExactInternal *e = &ctx->priv->e; - if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_FALLOFF) - continue; + const ptrdiff_t idx1 = job_idx / ctx->domain_size[0]; + const ptrdiff_t idx0 = job_idx % ctx->domain_size[0]; - for (ptrdiff_t bnd_layer = ctx->fd_stencil; bnd_layer > 0; bnd_layer--) - for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil); bnd_idx++) { - const double x = ctx->step[0] * (ctx->domain_size[0] - 1 + bnd_layer - ctx->fd_stencil); - const double y = ctx->step[0] * bnd_idx; - const double r = sqrt(SQR(x) + SQR(y)); - double val = dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; + mat_fill_row(ctx, e->scratch_lines + e->N_ghosts * thread_idx, idx1, idx0); - for (int i = 1; i < ctx->fd_stencil * 2 + 1; i++) { - dst[dir * (bnd_layer - i) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= - val * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]; - } - dst[dir * (bnd_layer - (ptrdiff_t)ctx->fd_stencil) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= val * (x / SQR(r)) * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]); + return 0; +} - dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; - } - } +static int solve_exact(EGSContext *ctx) +{ + EGSInternal *priv = ctx->priv; + EGSExactContext *ec = ctx->solver_data; + EGSExactInternal *e = &priv->e; + int64_t start; + int ret; - /* copy the interior values */ - for (ptrdiff_t idx1_col = 0; idx1_col < ctx->domain_size[1]; idx1_col++) - for (ptrdiff_t idx0_col = 0; idx0_col < ctx->domain_size[0]; idx0_col++) { - mat_dst[idx1_col * ctx->domain_size[0] + idx0_col] = mat_row[idx1_col * row_stride + idx0_col]; - mat_row[idx1_col * row_stride + idx0_col] = 0.0; - } + start = gettime(); - } + memset(e->mat, 0, SQR(e->N) * sizeof(*e->mat)); + tp_execute(ctx->tp, ctx->domain_size[1] * ctx->domain_size[0], mat_fill_row_task, ctx); - /* make sure all the values from the scratch line have been accounted for */ - for (ptrdiff_t idx_scratch = 0; idx_scratch < e->N_ghosts; idx_scratch++) - if (e->scratch_line[idx_scratch] != 0.0) - abort(); - } - } - } ec->time_mat_construct += gettime() - start; ec->count_mat_construct++; @@ -632,17 +650,6 @@ int mg2di_egs_init(EGSContext *ctx) start_init = gettime(); - if (ctx->solver_type == EGS_SOLVER_EXACT) { - switch (ctx->fd_stencil) { - case 1: priv->e.fill_mat = fill_mat_s1; break; - case 2: priv->e.fill_mat = fill_mat_s2; break; - default: - mg2di_log(&ctx->logger, 0, "Invalid finite difference stencil: %zd\n", - ctx->fd_stencil); - return -EINVAL; - } - } - if (ctx->step[0] <= DBL_EPSILON || ctx->step[1] <= DBL_EPSILON) { mg2di_log(&ctx->logger, 0, "Spatial step size too small\n"); return -EINVAL; @@ -674,6 +681,32 @@ int mg2di_egs_init(EGSContext *ctx) ctx->tp = priv->tp_internal; } + if (ctx->solver_type == EGS_SOLVER_EXACT) { + EGSExactInternal *e = &priv->e; + unsigned int nb_threads = tp_get_nb_threads(ctx->tp); + + switch (ctx->fd_stencil) { + case 1: e->fill_mat = fill_mat_s1; break; + case 2: e->fill_mat = fill_mat_s2; break; + default: + mg2di_log(&ctx->logger, 0, "Invalid finite difference stencil: %zd\n", + ctx->fd_stencil); + return -EINVAL; + } + + if (e->scratch_lines_allocated < nb_threads) { + free(e->scratch_lines); + e->scratch_lines = NULL; + e->scratch_lines_allocated = 0; + + e->scratch_lines = calloc(e->N_ghosts * nb_threads, + sizeof(*e->scratch_lines)); + if (!e->scratch_lines) + return -ENOMEM; + e->scratch_lines_allocated = nb_threads; + } + } + priv->residual_calc_offset = 0; priv->residual_calc_size[0] = ctx->domain_size[0]; priv->residual_calc_size[1] = ctx->domain_size[1]; @@ -786,13 +819,12 @@ static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2]) e->N = domain_size[0] * domain_size[1]; e->N_ghosts = (domain_size[0] + 2 * FD_STENCIL_MAX) * (domain_size[1] + 2 * FD_STENCIL_MAX); - e->scratch_line = calloc(e->N_ghosts, sizeof(*e->scratch_line)); e->mat = calloc(SQR(e->N), sizeof(*e->mat)); e->mat_f = calloc(SQR(e->N), sizeof(*e->mat_f)); e->rhs = calloc(e->N, sizeof(*e->rhs)); e->x = calloc(e->N, sizeof(*e->x)); e->ipiv = calloc(e->N, sizeof(*e->ipiv)); - if (!e->scratch_line || !e->mat || !e->rhs || !e->ipiv) + if (!e->mat || !e->mat_f || !e->rhs || !e->x || !e->ipiv) return -ENOMEM; ret = mg2di_bicgstab_context_alloc(&e->bicgstab, e->N, 64); @@ -873,7 +905,7 @@ void mg2di_egs_free(EGSContext **pctx) if (ctx->solver_type == EGS_SOLVER_EXACT) { EGSExactInternal *e = &ctx->priv->e; - free(e->scratch_line); + free(e->scratch_lines); free(e->mat); free(e->mat_f); free(e->rhs); -- cgit v1.2.3