/* * Copyright 2018 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "config.h" #include #include #include #include #include #include #include #include #include "bicgstab.h" #include "common.h" #include "cpu.h" #include "ell_grid_solve.h" #include "log.h" #include "mg2d_boundary.h" #include "mg2d_boundary_internal.h" #include "mg2d_constants.h" #include "ndarray.h" #include "residual_calc.h" static const double relax_factors[FD_STENCIL_MAX] = { [0] = 1.0 / 5, [1] = 1.0 / 7, }; typedef struct EGSRelaxInternal { void (*line_add)(ptrdiff_t, double *, const double *, double); double relax_factor; } EGSRelaxInternal; typedef struct EGSExactInternal { void (*fill_mat)(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src); size_t N; size_t N_ghosts; double *mat; double *mat_f; double *rhs; double *rhs_mat; double *rhs_factor; double *x; int *ipiv; double *scratch_lines; ptrdiff_t scratch_lines_allocated; int mat_filled; BiCGStabContext *bicgstab; } EGSExactInternal; struct EGSInternal { ptrdiff_t stride; NDArray *u_base; NDArray *rhs_base; NDArray *residual_base; NDArray *diff_coeffs_base[MG2D_DIFF_COEFF_NB]; ptrdiff_t residual_calc_offset; size_t residual_calc_size[2]; double fd_factors[MG2D_DIFF_COEFF_NB]; int bnd_zero[4]; ResidualCalcContext *rescalc; union { EGSRelaxInternal r; EGSExactInternal e; }; TPContext *tp_internal; }; static const double fd_denoms[][MG2D_DIFF_COEFF_NB] = { { [MG2D_DIFF_COEFF_00] = 1.0, [MG2D_DIFF_COEFF_10] = 2.0, [MG2D_DIFF_COEFF_01] = 2.0, [MG2D_DIFF_COEFF_20] = 1.0, [MG2D_DIFF_COEFF_02] = 1.0, [MG2D_DIFF_COEFF_11] = 4.0, }, { [MG2D_DIFF_COEFF_00] = 1.0, [MG2D_DIFF_COEFF_10] = 12.0, [MG2D_DIFF_COEFF_01] = 12.0, [MG2D_DIFF_COEFF_20] = 12.0, [MG2D_DIFF_COEFF_02] = 12.0, [MG2D_DIFF_COEFF_11] = 144.0, }, { [MG2D_DIFF_COEFF_00] = 1.0, [MG2D_DIFF_COEFF_10] = 60.0, [MG2D_DIFF_COEFF_01] = 60.0, [MG2D_DIFF_COEFF_20] = 90.0, [MG2D_DIFF_COEFF_02] = 90.0, [MG2D_DIFF_COEFF_11] = 3600.0, }, { [MG2D_DIFF_COEFF_00] = 1.0, [MG2D_DIFF_COEFF_10] = 840.0, [MG2D_DIFF_COEFF_01] = 840.0, [MG2D_DIFF_COEFF_20] = 5040.0, [MG2D_DIFF_COEFF_02] = 5040.0, [MG2D_DIFF_COEFF_11] = SQR(840.0), }, }; static void residual_calc(EGSContext *ctx) { EGSInternal *priv = ctx->priv; const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; int64_t start; start = gettime(); for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) diff_coeffs[i] = ctx->diff_coeffs[i]->data + priv->residual_calc_offset; mg2di_residual_calc(priv->rescalc, priv->residual_calc_size, priv->stride, &ctx->residual_max, ctx->residual->data + priv->residual_calc_offset, ctx->u->data + priv->residual_calc_offset, ctx->rhs->data + priv->residual_calc_offset, diff_coeffs, priv->fd_factors); ctx->time_res_calc += gettime() - start; ctx->count_res++; } static void boundaries_apply_fixval(double *dst, const ptrdiff_t dst_stride[2], const double *src, ptrdiff_t src_stride, size_t boundary_size) { for (int j = 0; j < FD_STENCIL_MAX; j++) { for (ptrdiff_t i = -j; i < (ptrdiff_t)boundary_size + j; i++) dst[i * dst_stride[0]] = src[i]; dst += dst_stride[1]; src += src_stride; } } static void boundaries_apply_reflect(double *dst, const ptrdiff_t dst_stride[2], const double *src, ptrdiff_t src_stride, size_t boundary_size) { if (dst_stride[0] == 1) { for (int j = 1; j <= FD_STENCIL_MAX; j++) memcpy(dst + j * dst_stride[1], dst - j * dst_stride[1], sizeof(*dst) * boundary_size); } else { for (size_t i = 0; i < boundary_size; i++) { for (int j = 1; j <= FD_STENCIL_MAX; j++) dst[dst_stride[1] * j] = dst[-dst_stride[1] * j]; dst += dst_stride[0]; } } } /* FIXME we currently use 1st-order forward differences in all cases, * since centered and/or higher-order FDs seem to be unstable * this may be suboptimal and should be investigated closer */ static const double falloff_bnd_fd_factors[][2] = { { 1.0 / 2.0, -1.0 / 2.0, }, { 1.0 / 2.0, -1.0 / 2.0, }, { 1.0 / 2.0, -1.0 / 2.0, }, { 1.0 / 2.0, -1.0 / 2.0, }, //{ -1.0, 8.0, 0.0, -8.0, 1.0 }, }; static void boundaries_apply_falloff(double *dst, const ptrdiff_t dst_stride[2], const double *src, ptrdiff_t src_stride, size_t boundary_size, EGSContext *ctx) { for (ptrdiff_t bnd_layer = 1; bnd_layer <= ctx->fd_stencil; bnd_layer++) for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(boundary_size + ctx->fd_stencil); bnd_idx++) { const double x = ctx->step[0] * (ctx->domain_size[0] - 1 + bnd_layer - ctx->fd_stencil); const double y = ctx->step[0] * bnd_idx; const double r = sqrt(SQR(x) + SQR(y)); double *row = dst + bnd_layer * dst_stride[1] + bnd_idx * dst_stride[0]; double val = row[-(ptrdiff_t)ctx->fd_stencil * dst_stride[1]] / r * (x / r); for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors); i++) val += dst[(bnd_layer - i) * dst_stride[1] + bnd_idx * dst_stride[0]] * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / ctx->step[0]; *row = -val * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]); } } static void boundaries_apply(EGSContext *ctx, int init) { static const enum MG2DBCType bnd_type_order[] = { MG2D_BC_TYPE_FIXVAL, MG2D_BC_TYPE_REFLECT, MG2D_BC_TYPE_FALLOFF }; EGSInternal *priv = ctx->priv; const ptrdiff_t strides[2] = { 1, priv->stride }; int64_t start, start_bnd; start = gettime(); for (int order_idx = 0; order_idx < ARRAY_ELEMS(bnd_type_order); order_idx++) { for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) { MG2DBoundary *bnd = ctx->boundaries[i]; const int ci = mg2d_bnd_coord_idx(i); const ptrdiff_t stride_boundary = strides[!ci]; const ptrdiff_t stride_offset = strides[ci]; const size_t size_boundary = ctx->domain_size[!ci]; const size_t size_offset = ctx->domain_size[ci]; double *dst = ctx->u->data + mg2d_bnd_is_upper(i) * ((size_offset - 1) * stride_offset); const ptrdiff_t dst_strides[] = { stride_boundary, mg2d_bnd_out_dir(i) * stride_offset }; if (bnd->type != bnd_type_order[order_idx]) continue; switch (bnd->type) { case MG2D_BC_TYPE_FIXVAL: if (init && !priv->bnd_zero[i]) { start_bnd = gettime(); boundaries_apply_fixval(dst, dst_strides, bnd->val, bnd->val_stride, size_boundary); ctx->time_bnd_fixval += gettime() - start_bnd; ctx->count_bnd_fixval++; } break; case MG2D_BC_TYPE_REFLECT: start_bnd = gettime(); boundaries_apply_reflect(dst, dst_strides, bnd->val, bnd->val_stride, size_boundary); ctx->time_bnd_reflect += gettime() - start_bnd; ctx->count_bnd_reflect++; break; case MG2D_BC_TYPE_FALLOFF: start_bnd = gettime(); boundaries_apply_falloff(dst, dst_strides, bnd->val, bnd->val_stride, size_boundary, ctx); ctx->time_bnd_falloff += gettime() - start_bnd; ctx->count_bnd_falloff++; break; } } } /* fill in the corner ghosts */ start_bnd = gettime(); for (int pos_y = 0; pos_y < 2; pos_y++) { enum MG2DBoundaryLoc loc_y = mg2d_bnd_id(1, pos_y); MG2DBoundary *bnd_y = ctx->boundaries[loc_y]; const int dir_y = mg2d_bnd_out_dir(loc_y); for (int pos_x = 0; pos_x < 2; pos_x++) { enum MG2DBoundaryLoc loc_x = mg2d_bnd_id(0, pos_x); MG2DBoundary *bnd_x = ctx->boundaries[loc_x]; const int dir_x = mg2d_bnd_out_dir(loc_x); int fact_x = dir_x, fact_y = dir_y; double *dst = ctx->u->data + mg2d_bnd_is_upper(loc_y) * ((ctx->domain_size[1] - 1) * priv->stride) + mg2d_bnd_is_upper(loc_x) * (ctx->domain_size[0] - 1); if (bnd_y->type == MG2D_BC_TYPE_REFLECT) fact_y *= -1; else if (bnd_x->type == MG2D_BC_TYPE_REFLECT) fact_x *= -1; else continue; for (int j = 1; j <= FD_STENCIL_MAX; j++) for (int i = 1; i <= FD_STENCIL_MAX; i++) { const ptrdiff_t idx_dst = dir_y * j * strides[1] + dir_x * i; const ptrdiff_t idx_src = fact_y * j * strides[1] + fact_x * i; dst[idx_dst] = dst[idx_src]; } } } ctx->time_bnd_corners += gettime() - start_bnd; ctx->count_bnd_corners++; ctx->time_boundaries += gettime() - start; ctx->count_boundaries++; } #if HAVE_EXTERNAL_ASM void mg2di_line_madd_fma3(ptrdiff_t linesize, double *dst, const double *src, double c); #endif static void line_madd_c(ptrdiff_t linesize, double *dst, const double *src, double c) { for (ptrdiff_t i = 0; i < linesize; i++) dst[i] += c * src[i]; } static int residual_add_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { #if 1 EGSContext *ctx = arg; EGSInternal *priv = ctx->priv; priv->r.line_add(ctx->domain_size[0], ctx->u->data + ctx->u->stride[0] * job_idx, ctx->residual->data + ctx->residual->stride[0] * job_idx, priv->r.relax_factor); #else for (int idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { ptrdiff_t idx = job_idx * ctx->u->stride[0] + idx0; ctx->u->data[idx] += priv->r.relax_factor * ctx->residual->data[idx]; } #endif return 0; } static int solve_relax_step(EGSContext *ctx) { EGSRelaxContext *r = ctx->solver_data; int64_t start; start = gettime(); tp_execute(ctx->tp, ctx->domain_size[1], residual_add_task, ctx); r->time_correct += gettime() - start; r->count_correct++; boundaries_apply(ctx, 0); residual_calc(ctx); return 0; } static void fill_mat_s1(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src) { mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-1 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-1 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 0 * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-1 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 0 * row_stride * mat_stride] += -2.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-1 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[( 1 + 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 1 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 1 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; } static void fill_mat_s2(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src) { mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 1 * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-1 * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-2 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 1 * row_stride * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-1 * row_stride * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-2 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 0 * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-1 * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-2 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 0 * row_stride * mat_stride] += -30.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-1 * row_stride * mat_stride] += 16.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-2 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[( 2 + 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 2 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 1 * row_stride) * mat_stride] += -64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 1 * row_stride) * mat_stride] += 64.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 2 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 2 * row_stride) * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 1 * row_stride) * mat_stride] += 8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 1 * row_stride) * mat_stride] += -8.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 2 * row_stride) * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; } static void fill_mat_s3(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src) { mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; mat_row[ 3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 2 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 1 * mat_stride] += 45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-1 * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-2 * mat_stride] += 9.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-3 * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 2 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 1 * row_stride * mat_stride] += 45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-1 * row_stride * mat_stride] += -45.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-2 * row_stride * mat_stride] += 9.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-3 * row_stride * mat_stride] += -1.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 2 * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 1 * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 0 * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-1 * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-2 * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-3 * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 2 * row_stride * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 1 * row_stride * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 0 * row_stride * mat_stride] += -245.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-1 * row_stride * mat_stride] += 135.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-2 * row_stride * mat_stride] += -13.5 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-3 * row_stride * mat_stride] += 1.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[( 3 + 3 * row_stride) * mat_stride] += (1.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 + 2 * row_stride) * mat_stride] += (1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 + 1 * row_stride) * mat_stride] += (1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 1 * row_stride) * mat_stride] += (1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 2 * row_stride) * mat_stride] += (1.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 3 * row_stride) * mat_stride] += (1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 3 * row_stride) * mat_stride] += (-9.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 2 * row_stride) * mat_stride] += (-9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 1 * row_stride) * mat_stride] += (-9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 1 * row_stride) * mat_stride] += (-9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 2 * row_stride) * mat_stride] += (-9.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 3 * row_stride) * mat_stride] += (-9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 3 * row_stride) * mat_stride] += (45.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 2 * row_stride) * mat_stride] += (45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 1 * row_stride) * mat_stride] += (45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 1 * row_stride) * mat_stride] += (45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 2 * row_stride) * mat_stride] += (45.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 3 * row_stride) * mat_stride] += (45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 3 * row_stride) * mat_stride] += (-45.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 2 * row_stride) * mat_stride] += (-45.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 1 * row_stride) * mat_stride] += (-45.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 1 * row_stride) * mat_stride] += (-45.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 2 * row_stride) * mat_stride] += (-45.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 3 * row_stride) * mat_stride] += (-45.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 3 * row_stride) * mat_stride] += (9.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 2 * row_stride) * mat_stride] += (9.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 1 * row_stride) * mat_stride] += (9.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 1 * row_stride) * mat_stride] += (9.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 2 * row_stride) * mat_stride] += (9.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 3 * row_stride) * mat_stride] += (9.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 3 * row_stride) * mat_stride] += (-1.0) * ( 1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 2 * row_stride) * mat_stride] += (-1.0) * ( -9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 1 * row_stride) * mat_stride] += (-1.0) * ( 45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 1 * row_stride) * mat_stride] += (-1.0) * (-45.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 2 * row_stride) * mat_stride] += (-1.0) * ( 9.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 3 * row_stride) * mat_stride] += (-1.0) * ( -1.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; } static void fill_mat_s4(double *mat_row, ptrdiff_t mat_stride, ptrdiff_t row_stride, NDArray **diff_coeffs, double *fd_factors, ptrdiff_t idx_src) { mat_row[0] += diff_coeffs[MG2D_DIFF_COEFF_00]->data[idx_src]; mat_row[ 4 * mat_stride] += -3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 3 * mat_stride] += 32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 2 * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 1 * mat_stride] += 672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-1 * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-2 * mat_stride] += 168.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-3 * mat_stride] += -32.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[-4 * mat_stride] += 3.0 * diff_coeffs[MG2D_DIFF_COEFF_10]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_10]; mat_row[ 4 * row_stride * mat_stride] += -3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 3 * row_stride * mat_stride] += 32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 2 * row_stride * mat_stride] += -168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 1 * row_stride * mat_stride] += 672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-1 * row_stride * mat_stride] += -672.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-2 * row_stride * mat_stride] += 168.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-3 * row_stride * mat_stride] += -32.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[-4 * row_stride * mat_stride] += 3.0 * diff_coeffs[MG2D_DIFF_COEFF_01]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_01]; mat_row[ 4 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 3 * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 2 * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 1 * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 0 * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-1 * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-2 * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-3 * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[-4 * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_20]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_20]; mat_row[ 4 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 3 * row_stride * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 2 * row_stride * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 1 * row_stride * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[ 0 * row_stride * mat_stride] += -14350.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-1 * row_stride * mat_stride] += 8064.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-2 * row_stride * mat_stride] += -1008.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-3 * row_stride * mat_stride] += 128.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[-4 * row_stride * mat_stride] += -9.0 * diff_coeffs[MG2D_DIFF_COEFF_02]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_02]; mat_row[( 4 + 4 * row_stride) * mat_stride] += (-3.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 + 3 * row_stride) * mat_stride] += (-3.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 + 2 * row_stride) * mat_stride] += (-3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 + 1 * row_stride) * mat_stride] += (-3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 - 1 * row_stride) * mat_stride] += (-3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 - 2 * row_stride) * mat_stride] += (-3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 - 3 * row_stride) * mat_stride] += (-3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 4 - 4 * row_stride) * mat_stride] += (-3.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 + 4 * row_stride) * mat_stride] += (32.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 + 3 * row_stride) * mat_stride] += (32.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 + 2 * row_stride) * mat_stride] += (32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 + 1 * row_stride) * mat_stride] += (32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 1 * row_stride) * mat_stride] += (32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 2 * row_stride) * mat_stride] += (32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 3 * row_stride) * mat_stride] += (32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 3 - 4 * row_stride) * mat_stride] += (32.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 4 * row_stride) * mat_stride] += (-168.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 3 * row_stride) * mat_stride] += (-168.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 2 * row_stride) * mat_stride] += (-168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 + 1 * row_stride) * mat_stride] += (-168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 1 * row_stride) * mat_stride] += (-168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 2 * row_stride) * mat_stride] += (-168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 3 * row_stride) * mat_stride] += (-168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 2 - 4 * row_stride) * mat_stride] += (-168.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 4 * row_stride) * mat_stride] += (672.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 3 * row_stride) * mat_stride] += (672.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 2 * row_stride) * mat_stride] += (672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 + 1 * row_stride) * mat_stride] += (672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 1 * row_stride) * mat_stride] += (672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 2 * row_stride) * mat_stride] += (672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 3 * row_stride) * mat_stride] += (672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[( 1 - 4 * row_stride) * mat_stride] += (672.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 4 * row_stride) * mat_stride] += (-672.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 3 * row_stride) * mat_stride] += (-672.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 2 * row_stride) * mat_stride] += (-672.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 + 1 * row_stride) * mat_stride] += (-672.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 1 * row_stride) * mat_stride] += (-672.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 2 * row_stride) * mat_stride] += (-672.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 3 * row_stride) * mat_stride] += (-672.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-1 - 4 * row_stride) * mat_stride] += (-672.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 4 * row_stride) * mat_stride] += (168.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 3 * row_stride) * mat_stride] += (168.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 2 * row_stride) * mat_stride] += (168.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 + 1 * row_stride) * mat_stride] += (168.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 1 * row_stride) * mat_stride] += (168.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 2 * row_stride) * mat_stride] += (168.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 3 * row_stride) * mat_stride] += (168.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-2 - 4 * row_stride) * mat_stride] += (168.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 4 * row_stride) * mat_stride] += (-32.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 3 * row_stride) * mat_stride] += (-32.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 2 * row_stride) * mat_stride] += (-32.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 + 1 * row_stride) * mat_stride] += (-32.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 1 * row_stride) * mat_stride] += (-32.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 2 * row_stride) * mat_stride] += (-32.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 3 * row_stride) * mat_stride] += (-32.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-3 - 4 * row_stride) * mat_stride] += (-32.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 + 4 * row_stride) * mat_stride] += (3.0) * ( -3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 + 3 * row_stride) * mat_stride] += (3.0) * ( 32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 + 2 * row_stride) * mat_stride] += (3.0) * (-168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 + 1 * row_stride) * mat_stride] += (3.0) * ( 672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 - 1 * row_stride) * mat_stride] += (3.0) * (-672.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 - 2 * row_stride) * mat_stride] += (3.0) * ( 168.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 - 3 * row_stride) * mat_stride] += (3.0) * ( -32.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; mat_row[(-4 - 4 * row_stride) * mat_stride] += (3.0) * ( 3.0) * diff_coeffs[MG2D_DIFF_COEFF_11]->data[idx_src] * fd_factors[MG2D_DIFF_COEFF_11]; } static void mat_fill_row(EGSContext *ctx, double *scratch_line, ptrdiff_t idx1, ptrdiff_t idx0) { EGSInternal *priv = ctx->priv; EGSExactInternal *e = &priv->e; const ptrdiff_t idx_src = idx1 * priv->stride + idx0; const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0; int is_bnd[4], boundary; ptrdiff_t row_stride, mat_stride; ptrdiff_t row_offset; double *mat_row; is_bnd[MG2D_BOUNDARY_0L] = idx0 < ctx->fd_stencil; is_bnd[MG2D_BOUNDARY_0U] = idx0 >= ctx->domain_size[0] - ctx->fd_stencil; is_bnd[MG2D_BOUNDARY_1L] = idx1 < ctx->fd_stencil; is_bnd[MG2D_BOUNDARY_1U] = idx1 >= ctx->domain_size[1] - ctx->fd_stencil; boundary = is_bnd[0] || is_bnd[1] || is_bnd[2] || is_bnd[3]; if (boundary) { memset(scratch_line, 0, e->N_ghosts * sizeof(*scratch_line)); row_stride = ctx->domain_size[0] + 2 * ctx->fd_stencil; mat_row = scratch_line + row_stride * ctx->fd_stencil + ctx->fd_stencil; mat_stride = 1; } else { mat_row = e->mat + mat_row_idx; row_stride = ctx->domain_size[0]; mat_stride = e->N; } row_offset = idx1 * row_stride + idx0; e->fill_mat(mat_row + row_offset * mat_stride, mat_stride, row_stride, ctx->diff_coeffs, ctx->priv->fd_factors, idx_src); e->rhs_factor[mat_row_idx] = 1.0; if (boundary) { double *mat_dst = e->mat + mat_row_idx; ptrdiff_t mat_stride_dst = e->N; const MG2DBoundary *bnd_fixval = NULL; for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { const int ci = mg2d_bnd_coord_idx(bnd_loc); const ptrdiff_t idx[] = { idx0, idx1 }; if (!(idx[ci] == (ctx->domain_size[ci] - 1) * mg2d_bnd_is_upper(bnd_loc) && ctx->boundaries[bnd_loc]->type == MG2D_BC_TYPE_FIXVAL)) continue; bnd_fixval = ctx->boundaries[bnd_loc]; mat_dst[mat_row_idx * mat_stride_dst] = 1.0; e->rhs_mat[mat_row_idx] = bnd_fixval->val[idx[!ci]]; e->rhs_factor[mat_row_idx] = 0.0; memset(scratch_line, 0, e->N_ghosts * sizeof(*scratch_line)); break; } if (!bnd_fixval) { /* apply the boundary conditions to eliminate the ghostpoint values */ // reflect for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; const int ci = mg2d_bnd_coord_idx(bnd_loc); const int dir = mg2d_bnd_out_dir(bnd_loc); const ptrdiff_t dst_stride[2] = { 1, row_stride }; const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_REFLECT) continue; for (ptrdiff_t bnd_layer = 1; bnd_layer <= ctx->fd_stencil; bnd_layer++) for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil); bnd_idx++) { dst[-dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] += dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; } } // fixval for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; const int ci = mg2d_bnd_coord_idx(bnd_loc); const int dir = mg2d_bnd_out_dir(bnd_loc); const ptrdiff_t dst_stride[2] = { 1, row_stride }; const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_FIXVAL) continue; for (ptrdiff_t bnd_layer = 1; bnd_layer < ctx->fd_stencil; bnd_layer++) for (ptrdiff_t bnd_idx = -((ptrdiff_t)ctx->fd_stencil - 1); bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil - 1); bnd_idx++) { e->rhs_mat[mat_row_idx] -= bnd->val[bnd_layer * bnd->val_stride + bnd_idx] * dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; } } // falloff for (int bnd_loc = 0; bnd_loc < ARRAY_ELEMS(ctx->boundaries); bnd_loc++) { MG2DBoundary *bnd = ctx->boundaries[bnd_loc]; const int ci = mg2d_bnd_coord_idx(bnd_loc); const int dir = mg2d_bnd_out_dir(bnd_loc); const ptrdiff_t dst_stride[2] = { 1, row_stride }; const ptrdiff_t bnd_stride[2] = { dst_stride[!ci], dst_stride[ci] }; const ptrdiff_t bnd_len[2] = { ctx->domain_size[!ci], ctx->domain_size[ci] }; double *dst = mat_row + mg2d_bnd_is_upper(bnd_loc) * ((bnd_len[1] - 1) * bnd_stride[1]); if (!is_bnd[bnd_loc] || bnd->type != MG2D_BC_TYPE_FALLOFF) continue; for (ptrdiff_t bnd_layer = ctx->fd_stencil; bnd_layer > 0; bnd_layer--) for (ptrdiff_t bnd_idx = -(ptrdiff_t)ctx->fd_stencil; bnd_idx < (ptrdiff_t)(bnd_len[0] + ctx->fd_stencil); bnd_idx++) { const double x = ctx->step[0] * (ctx->domain_size[0] - 1 + bnd_layer - ctx->fd_stencil); const double y = ctx->step[0] * bnd_idx; const double r = sqrt(SQR(x) + SQR(y)); double val = dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]]; for (int i = 1; i < ARRAY_ELEMS(falloff_bnd_fd_factors); i++) { dst[dir * (bnd_layer - i) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= val * falloff_bnd_fd_factors[ctx->fd_stencil - 1][i] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]; } dst[dir * (bnd_layer - (ptrdiff_t)ctx->fd_stencil) * bnd_stride[1] + bnd_idx * bnd_stride[0]] -= val * (x / SQR(r)) * (ctx->step[0] / falloff_bnd_fd_factors[ctx->fd_stencil - 1][0]); dst[dir * bnd_layer * bnd_stride[1] + bnd_idx * bnd_stride[0]] = 0.0; } } /* copy the interior values */ for (ptrdiff_t idx1_col = 0; idx1_col < ctx->domain_size[1]; idx1_col++) for (ptrdiff_t idx0_col = 0; idx0_col < ctx->domain_size[0]; idx0_col++) { mat_dst[(idx1_col * ctx->domain_size[0] + idx0_col) * mat_stride_dst] = mat_row[idx1_col * row_stride + idx0_col]; mat_row[idx1_col * row_stride + idx0_col] = 0.0; } } /* make sure all the values from the scratch line have been accounted for */ for (ptrdiff_t idx_scratch = 0; idx_scratch < e->N_ghosts; idx_scratch++) if (scratch_line[idx_scratch] != 0.0) abort(); } } static int mat_fill_row_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { EGSContext *ctx = arg; EGSExactInternal *e = &ctx->priv->e; const ptrdiff_t idx1 = job_idx / ctx->domain_size[0]; const ptrdiff_t idx0 = job_idx % ctx->domain_size[0]; mat_fill_row(ctx, e->scratch_lines + e->N_ghosts * thread_idx, idx1, idx0); return 0; } static int solve_exact(EGSContext *ctx) { EGSInternal *priv = ctx->priv; EGSExactContext *ec = ctx->solver_data; EGSExactInternal *e = &priv->e; int64_t start; int ret; start = gettime(); if (!e->mat_filled) { memset(e->mat, 0, SQR(e->N) * sizeof(*e->mat)); memset(e->rhs_mat, 0, e->N * sizeof(*e->rhs_mat)); tp_execute(ctx->tp, ctx->domain_size[1] * ctx->domain_size[0], mat_fill_row_task, ctx); e->mat_filled = 1; } for (ptrdiff_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) for (ptrdiff_t idx0 = 0; idx0 < ctx->domain_size[0]; idx0++) { const ptrdiff_t idx_src = idx1 * priv->stride + idx0; const ptrdiff_t mat_row_idx = idx1 * ctx->domain_size[0] + idx0; e->rhs[mat_row_idx] = e->rhs_factor[mat_row_idx] * ctx->rhs->data[idx_src] + e->rhs_mat[mat_row_idx]; } ec->time_mat_construct += gettime() - start; ec->count_mat_construct++; start = gettime(); ret = mg2di_bicgstab_solve(e->bicgstab, e->mat, e->rhs, e->x); if (ret >= 0) ec->bicgstab_iterations += ret; ec->time_bicgstab_solve += gettime() - start; ec->count_bicgstab_solve++; if (ret < 0) { char equed = 'N'; double cond, ferr, berr, rpivot; start = gettime(); ret = LAPACKE_dgesvx(LAPACK_COL_MAJOR, 'N', 'N', e->N, 1, e->mat, e->N, e->mat_f, e->N, e->ipiv, &equed, NULL, NULL, e->rhs, e->N, e->x, e->N, &cond, &ferr, &berr, &rpivot); if (ret == 0) ret = LAPACKE_dgetri(LAPACK_COL_MAJOR, e->N, e->mat_f, e->N, e->ipiv); if (ret != 0) { mg2di_log(&ctx->logger, MG2D_LOG_ERROR, "Error solving the linear system: %d\n", ret); return -EDOM; } mg2di_log(&ctx->logger, MG2D_LOG_DEBUG, "LU factorization solution to a %zdx%zd matrix: " "condition number %16.16g; forward error %16.16g backward error %16.16g\n", e->N, e->N, cond, ferr, berr); ec->time_lu_solve += gettime() - start; ec->count_lu_solve++; ret = mg2di_bicgstab_init(e->bicgstab, e->mat_f, e->x); if (ret < 0) return ret; } start = gettime(); for (size_t idx1 = 0; idx1 < ctx->domain_size[1]; idx1++) memcpy(ctx->u->data + idx1 * ctx->u->stride[0], e->x + idx1 * ctx->domain_size[0], ctx->domain_size[0] * sizeof(*e->x)); ec->time_export += gettime() - start; ec->count_export++; boundaries_apply(ctx, 0); residual_calc(ctx); return 0; } int mg2di_egs_solve(EGSContext *ctx) { int64_t start; int ret; start = gettime(); switch (ctx->solver_type) { case EGS_SOLVER_RELAXATION: ret = solve_relax_step(ctx); break; case EGS_SOLVER_EXACT: ret = solve_exact(ctx); break; default: ret = -EINVAL; } ctx->time_total += gettime() - start; return ret; } int mg2di_egs_init(EGSContext *ctx, int flags) { EGSInternal *priv = ctx->priv; int64_t start, start_init; int ret; start = gettime(); start_init = gettime(); if (ctx->step[0] <= DBL_EPSILON || ctx->step[1] <= DBL_EPSILON) { mg2di_log(&ctx->logger, 0, "Spatial step size too small\n"); return -EINVAL; } if (ctx->solver_type == EGS_SOLVER_RELAXATION) { EGSRelaxContext *r = ctx->solver_data; if (r->relax_factor == 0.0) priv->r.relax_factor = relax_factors[ctx->fd_stencil - 1]; else priv->r.relax_factor = r->relax_factor; priv->r.relax_factor *= ctx->step[0] * ctx->step[0]; if (r->relax_multiplier > 0.0) priv->r.relax_factor *= r->relax_multiplier; priv->r.line_add = line_madd_c; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) priv->r.line_add = mg2di_line_madd_fma3; #endif } priv->fd_factors[MG2D_DIFF_COEFF_00] = 1.0 / fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_00]; priv->fd_factors[MG2D_DIFF_COEFF_10] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_10] * ctx->step[0]); priv->fd_factors[MG2D_DIFF_COEFF_01] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_01] * ctx->step[1]); priv->fd_factors[MG2D_DIFF_COEFF_20] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_20] * SQR(ctx->step[0])); priv->fd_factors[MG2D_DIFF_COEFF_02] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_02] * SQR(ctx->step[1])); priv->fd_factors[MG2D_DIFF_COEFF_11] = 1.0 / (fd_denoms[ctx->fd_stencil - 1][MG2D_DIFF_COEFF_11] * ctx->step[0] * ctx->step[1]); if (!ctx->tp) { ret = tp_init(&priv->tp_internal, 1); if (ret < 0) return ret; ctx->tp = priv->tp_internal; } if (ctx->solver_type == EGS_SOLVER_EXACT) { EGSExactInternal *e = &priv->e; unsigned int nb_threads = tp_get_nb_threads(ctx->tp); if (!(flags & EGS_INIT_FLAG_SAME_DIFF_COEFFS)) e->mat_filled = 0; switch (ctx->fd_stencil) { case 1: e->fill_mat = fill_mat_s1; break; case 2: e->fill_mat = fill_mat_s2; break; case 3: e->fill_mat = fill_mat_s3; break; case 4: e->fill_mat = fill_mat_s4; break; default: mg2di_log(&ctx->logger, 0, "Invalid finite difference stencil: %zd\n", ctx->fd_stencil); return -EINVAL; } if (e->scratch_lines_allocated < nb_threads) { free(e->scratch_lines); e->scratch_lines = NULL; e->scratch_lines_allocated = 0; e->scratch_lines = calloc(e->N_ghosts * nb_threads, sizeof(*e->scratch_lines)); if (!e->scratch_lines) return -ENOMEM; e->scratch_lines_allocated = nb_threads; } } priv->residual_calc_offset = 0; priv->residual_calc_size[0] = ctx->domain_size[0]; priv->residual_calc_size[1] = ctx->domain_size[1]; if (ctx->boundaries[MG2D_BOUNDARY_1L]->type == MG2D_BC_TYPE_FIXVAL) priv->residual_calc_offset += ctx->residual->stride[0]; if (ctx->boundaries[MG2D_BOUNDARY_0L]->type == MG2D_BC_TYPE_FIXVAL) priv->residual_calc_offset++; for (int bnd_idx = 0; bnd_idx < ARRAY_ELEMS(ctx->boundaries); bnd_idx++) { MG2DBoundary *bnd = ctx->boundaries[bnd_idx]; const int ci = mg2d_bnd_coord_idx(bnd_idx); if (bnd->type == MG2D_BC_TYPE_FIXVAL) { double maxval = 0.0; for (int j = 0; j < ctx->fd_stencil; j++) { for (ptrdiff_t i = -j; i < (ptrdiff_t)ctx->domain_size[!ci] + j; i++) { double val = fabs(bnd->val[j * bnd->val_stride + i]); maxval = MAX(maxval, val); } } priv->bnd_zero[bnd_idx] = maxval == 0.0; priv->residual_calc_size[ci]--; } } priv->rescalc->tp = ctx->tp; priv->rescalc->fd_stencil = ctx->fd_stencil; priv->rescalc->cpuflags = ctx->cpuflags; ret = mg2di_residual_calc_init(priv->rescalc); if (ret < 0) return ret; ctx->time_init += gettime() - start_init; ctx->count_init++; boundaries_apply(ctx, 1); residual_calc(ctx); ctx->time_total += gettime() - start; return 0; } static int arrays_alloc(EGSContext *ctx, const size_t domain_size[2]) { EGSInternal *priv = ctx->priv; const size_t ghosts = FD_STENCIL_MAX; const size_t size_padded[2] = { domain_size[1] + 2 * ghosts, domain_size[0] + 2 * ghosts, }; const size_t stride = size_padded[0]; const Slice slice[2] = { SLICE(ghosts, -ghosts, 1), SLICE(ghosts, -ghosts, 1) }; int ret; ret = mg2di_ndarray_alloc(&priv->u_base, 2, size_padded, NDARRAY_ALLOC_ZERO); if (ret < 0) return ret; ret = mg2di_ndarray_slice(&ctx->u, priv->u_base, slice); if (ret < 0) return ret; ret = mg2di_ndarray_alloc(&priv->rhs_base, 2, size_padded, 0); if (ret < 0) return ret; ret = mg2di_ndarray_slice(&ctx->rhs, priv->rhs_base, slice); if (ret < 0) return ret; ret = mg2di_ndarray_alloc(&priv->residual_base, 2, size_padded, NDARRAY_ALLOC_ZERO); if (ret < 0) return ret; ret = mg2di_ndarray_slice(&ctx->residual, priv->residual_base, slice); if (ret < 0) return ret; for (int i = 0; i < ARRAY_ELEMS(ctx->diff_coeffs); i++) { ret = mg2di_ndarray_alloc(&priv->diff_coeffs_base[i], 2, size_padded, NDARRAY_ALLOC_ZERO); if (ret < 0) return ret; ret = mg2di_ndarray_slice(&ctx->diff_coeffs[i], priv->diff_coeffs_base[i], slice); if (ret < 0) return ret; } priv->stride = stride; for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) { const int ci = mg2d_bnd_coord_idx(i); ctx->boundaries[i] = mg2di_bc_alloc(domain_size[!ci]); if (!ctx->boundaries[i]) return -ENOMEM; } if (ctx->solver_type == EGS_SOLVER_EXACT) { EGSExactInternal *e = &priv->e; e->N = domain_size[0] * domain_size[1]; e->N_ghosts = (domain_size[0] + 2 * FD_STENCIL_MAX) * (domain_size[1] + 2 * FD_STENCIL_MAX); e->mat = calloc(SQR(e->N), sizeof(*e->mat)); e->mat_f = calloc(SQR(e->N), sizeof(*e->mat_f)); e->rhs = calloc(e->N, sizeof(*e->rhs)); e->rhs_mat = calloc(e->N, sizeof(*e->rhs_mat)); e->rhs_factor = calloc(e->N, sizeof(*e->rhs_factor)); e->x = calloc(e->N, sizeof(*e->x)); e->ipiv = calloc(e->N, sizeof(*e->ipiv)); if (!e->mat || !e->mat_f || !e->rhs || !e->rhs_mat || !e->rhs_factor || !e->x || !e->ipiv) return -ENOMEM; ret = mg2di_bicgstab_context_alloc(&e->bicgstab, e->N, 24); if (ret < 0) return ret; for (int i = 0; i < e->N; i++) e->mat[i * e->N + i] = 1.0; ret = mg2di_bicgstab_init(e->bicgstab, e->mat, e->x); if (ret < 0) return ret; } return 0; } EGSContext *mg2di_egs_alloc(enum EGSType type, size_t domain_size[2]) { EGSContext *ctx; int ret; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return NULL; ctx->priv = calloc(1, sizeof(*ctx->priv)); if (!ctx->priv) goto fail; switch (type) { case EGS_SOLVER_RELAXATION: ctx->solver_data = calloc(1, sizeof(EGSRelaxContext)); if (!ctx->solver_data) goto fail; break; case EGS_SOLVER_EXACT: ctx->solver_data = calloc(1, sizeof(EGSExactContext)); if (!ctx->solver_data) goto fail; break; default: goto fail; } ctx->solver_type = type; if (!domain_size[0] || !domain_size[1] || domain_size[0] > SIZE_MAX / domain_size[1]) goto fail; ret = arrays_alloc(ctx, domain_size); if (ret < 0) goto fail; ctx->domain_size[0] = domain_size[0]; ctx->domain_size[1] = domain_size[1]; ctx->priv->rescalc = mg2di_residual_calc_alloc(); if (!ctx->priv->rescalc) goto fail; return ctx; fail: mg2di_egs_free(&ctx); return NULL; } void mg2di_egs_free(EGSContext **pctx) { EGSContext *ctx = *pctx; if (!ctx) return; mg2di_residual_calc_free(&ctx->priv->rescalc); free(ctx->solver_data); if (ctx->solver_type == EGS_SOLVER_EXACT) { EGSExactInternal *e = &ctx->priv->e; free(e->scratch_lines); free(e->mat); free(e->mat_f); free(e->rhs); free(e->rhs_mat); free(e->rhs_factor); free(e->x); free(e->ipiv); mg2di_bicgstab_context_free(&e->bicgstab); } mg2di_ndarray_free(&ctx->u); mg2di_ndarray_free(&ctx->priv->u_base); mg2di_ndarray_free(&ctx->rhs); mg2di_ndarray_free(&ctx->priv->rhs_base); mg2di_ndarray_free(&ctx->residual); mg2di_ndarray_free(&ctx->priv->residual_base); for (int i = 0; i < ARRAY_ELEMS(ctx->priv->diff_coeffs_base); i++) { mg2di_ndarray_free(&ctx->diff_coeffs[i]); mg2di_ndarray_free(&ctx->priv->diff_coeffs_base[i]); } for (int i = 0; i < ARRAY_ELEMS(ctx->boundaries); i++) mg2di_bc_free(&ctx->boundaries[i]); tp_free(&ctx->priv->tp_internal); free(ctx->priv); free(ctx); *pctx = NULL; }