diff options
Diffstat (limited to 'src/qms.c')
-rw-r--r-- | src/qms.c | 75 |
1 files changed, 73 insertions, 2 deletions
@@ -8,6 +8,8 @@ #include <stdlib.h> #include <string.h> +#include <immintrin.h> + #include <mg2d.h> #include <mg2d_boundary.h> @@ -528,10 +530,61 @@ enum RHSVar { RHS_VAR_NB, }; +typedef void +(*FECLine)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver, + const double * const vars[], + const int idx_x_start, const int idx_x_end, + const int idx_z, + const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal); + +// computing QMS equation coefficients is resource-intensive, so we generate +// vectorized versions when possible +// +// first, instantiate the plain scalar version + +#define ELEM_SCALAR #include "fill_eq_coeffs_template.c" +#undef ELEM_SCALAR + +#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__) +// ICC miscompiles the vectorized versions of fill_eq_coeffs_line() with higher +// optimization levels +# pragma GCC push_options +# pragma GCC optimize("O1") +#endif + +#ifdef __AVX512F__ + +#define ELEM_AVX512 +#include "fill_eq_coeffs_template.c" +#undef ELEM_AVX512 + +#define FILL_EQ_COEFFS_BLOCKSIZE 8 +#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx512 + +#elif defined(__AVX2__) + +#define ELEM_AVX2 +#include "fill_eq_coeffs_template.c" +#undef ELEM_AVX2 + +#define FILL_EQ_COEFFS_BLOCKSIZE 4 +#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx2 + +#else + +#define FILL_EQ_COEFFS_BLOCKSIZE 1 +#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_scalar + +#endif + +#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__) +# pragma GCC pop_options +#endif static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solver) { + FECLine fill_eq_coeffs_line; cGH *gh = ctx->gh; int ret; @@ -568,12 +621,30 @@ static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solve const double dx_inv = 1.0 / (vars[RHS_VAR_X][1] - vars[RHS_VAR_X][0]); const double dz_inv = 1.0 / (vars[RHS_VAR_Z][stride_z] - vars[RHS_VAR_Z][0]); + const int have_axis = fabs(vars[RHS_VAR_X][cp->offset_left[0]]) < 1e-13; + int idx_x_start, idx_x_end; + solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_DISCONT; solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_POLE; + // do not execute the vectorized version on the axis, + // as it does not support special handling of the values there + idx_x_start = FILL_EQ_COEFFS_BLOCKSIZE > 1 && !!have_axis; + idx_x_end = idx_x_start + (solver->local_size[0] - idx_x_start) / FILL_EQ_COEFFS_BLOCKSIZE * FILL_EQ_COEFFS_BLOCKSIZE; + #pragma omp parallel for - for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) - fill_eq_coeffs_line(gh, cp, solver, vars, idx_z, stride_z, dx_inv, dz_inv); + for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) { + if (idx_x_start) + fill_eq_coeffs_line_scalar(gh, cp, solver, vars, 0, idx_x_start, + idx_z, stride_z, dx_inv, dz_inv); + + FILL_EQ_COEFFS_VECTOR(gh, cp, solver, vars, idx_x_start, idx_x_end, + idx_z, stride_z, dx_inv, dz_inv); + + if (idx_x_end < solver->local_size[0]) + fill_eq_coeffs_line_scalar(gh, cp, solver, vars, idx_x_end, solver->local_size[0], + idx_z, stride_z, dx_inv, dz_inv); + } } static void mirror(CoordPatch *cp, double *dst) |