summaryrefslogtreecommitdiff
path: root/src/qms.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/qms.c')
-rw-r--r--src/qms.c75
1 files changed, 73 insertions, 2 deletions
diff --git a/src/qms.c b/src/qms.c
index 8fae7f0..4a72428 100644
--- a/src/qms.c
+++ b/src/qms.c
@@ -8,6 +8,8 @@
#include <stdlib.h>
#include <string.h>
+#include <immintrin.h>
+
#include <mg2d.h>
#include <mg2d_boundary.h>
@@ -528,10 +530,61 @@ enum RHSVar {
RHS_VAR_NB,
};
+typedef void
+(*FECLine)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver,
+ const double * const vars[],
+ const int idx_x_start, const int idx_x_end,
+ const int idx_z,
+ const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal);
+
+// computing QMS equation coefficients is resource-intensive, so we generate
+// vectorized versions when possible
+//
+// first, instantiate the plain scalar version
+
+#define ELEM_SCALAR
#include "fill_eq_coeffs_template.c"
+#undef ELEM_SCALAR
+
+#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__)
+// ICC miscompiles the vectorized versions of fill_eq_coeffs_line() with higher
+// optimization levels
+# pragma GCC push_options
+# pragma GCC optimize("O1")
+#endif
+
+#ifdef __AVX512F__
+
+#define ELEM_AVX512
+#include "fill_eq_coeffs_template.c"
+#undef ELEM_AVX512
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 8
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx512
+
+#elif defined(__AVX2__)
+
+#define ELEM_AVX2
+#include "fill_eq_coeffs_template.c"
+#undef ELEM_AVX2
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 4
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx2
+
+#else
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 1
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_scalar
+
+#endif
+
+#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__)
+# pragma GCC pop_options
+#endif
static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solver)
{
+ FECLine fill_eq_coeffs_line;
cGH *gh = ctx->gh;
int ret;
@@ -568,12 +621,30 @@ static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solve
const double dx_inv = 1.0 / (vars[RHS_VAR_X][1] - vars[RHS_VAR_X][0]);
const double dz_inv = 1.0 / (vars[RHS_VAR_Z][stride_z] - vars[RHS_VAR_Z][0]);
+ const int have_axis = fabs(vars[RHS_VAR_X][cp->offset_left[0]]) < 1e-13;
+ int idx_x_start, idx_x_end;
+
solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_DISCONT;
solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_POLE;
+ // do not execute the vectorized version on the axis,
+ // as it does not support special handling of the values there
+ idx_x_start = FILL_EQ_COEFFS_BLOCKSIZE > 1 && !!have_axis;
+ idx_x_end = idx_x_start + (solver->local_size[0] - idx_x_start) / FILL_EQ_COEFFS_BLOCKSIZE * FILL_EQ_COEFFS_BLOCKSIZE;
+
#pragma omp parallel for
- for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++)
- fill_eq_coeffs_line(gh, cp, solver, vars, idx_z, stride_z, dx_inv, dz_inv);
+ for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) {
+ if (idx_x_start)
+ fill_eq_coeffs_line_scalar(gh, cp, solver, vars, 0, idx_x_start,
+ idx_z, stride_z, dx_inv, dz_inv);
+
+ FILL_EQ_COEFFS_VECTOR(gh, cp, solver, vars, idx_x_start, idx_x_end,
+ idx_z, stride_z, dx_inv, dz_inv);
+
+ if (idx_x_end < solver->local_size[0])
+ fill_eq_coeffs_line_scalar(gh, cp, solver, vars, idx_x_end, solver->local_size[0],
+ idx_z, stride_z, dx_inv, dz_inv);
+ }
}
static void mirror(CoordPatch *cp, double *dst)