1 files changed, 73 insertions, 2 deletions
diff --git a/src/qms.c b/src/qms.c
index 8fae7f0..4a72428 100644
--- a/src/qms.c
+++ b/src/qms.c
@@ -8,6 +8,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <immintrin.h>
+
 #include <mg2d.h>
 #include <mg2d_boundary.h>
 
@@ -528,10 +530,61 @@ enum RHSVar {
     RHS_VAR_NB,
 };
 
+typedef void
+(*FECLine)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver,
+           const double * const vars[],
+           const int idx_x_start, const int idx_x_end,
+           const int idx_z,
+           const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal);
+
+// computing QMS equation coefficients is resource-intensive, so we generate
+// vectorized versions when possible
+//
+// first, instantiate the plain scalar version
+
+#define ELEM_SCALAR
 #include "fill_eq_coeffs_template.c"
+#undef ELEM_SCALAR
+
+#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__)
+// ICC miscompiles the vectorized versions of fill_eq_coeffs_line() with higher
+// optimization levels
+# pragma GCC push_options
+# pragma GCC optimize("O1")
+#endif
+
+#ifdef __AVX512F__
+
+#define ELEM_AVX512
+#include "fill_eq_coeffs_template.c"
+#undef ELEM_AVX512
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 8
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx512
+
+#elif defined(__AVX2__)
+
+#define ELEM_AVX2
+#include "fill_eq_coeffs_template.c"
+#undef ELEM_AVX2
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 4
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx2
+
+#else
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 1
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_scalar
+
+#endif
+
+#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__)
+# pragma GCC pop_options
+#endif
 
 static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solver)
 {
+    FECLine fill_eq_coeffs_line;
     cGH *gh = ctx->gh;
     int ret;
 
@@ -568,12 +621,30 @@ static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solve
     const double dx_inv = 1.0 / (vars[RHS_VAR_X][1]        - vars[RHS_VAR_X][0]);
     const double dz_inv = 1.0 / (vars[RHS_VAR_Z][stride_z] - vars[RHS_VAR_Z][0]);
 
+    const int have_axis = fabs(vars[RHS_VAR_X][cp->offset_left[0]]) < 1e-13;
+    int idx_x_start, idx_x_end;
+
     solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_DISCONT;
     solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_POLE;
 
+     // do not execute the vectorized version on the axis,
+     // as it does not support special handling of the values there
+     idx_x_start         = FILL_EQ_COEFFS_BLOCKSIZE > 1 && !!have_axis;
+     idx_x_end           = idx_x_start + (solver->local_size[0] - idx_x_start) / FILL_EQ_COEFFS_BLOCKSIZE * FILL_EQ_COEFFS_BLOCKSIZE;
+
 #pragma omp parallel for
-    for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++)
-        fill_eq_coeffs_line(gh, cp, solver, vars, idx_z, stride_z, dx_inv, dz_inv);
+    for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) {
+        if (idx_x_start)
+            fill_eq_coeffs_line_scalar(gh, cp, solver, vars, 0, idx_x_start,
+                                       idx_z, stride_z, dx_inv, dz_inv);
+
+        FILL_EQ_COEFFS_VECTOR(gh, cp, solver, vars, idx_x_start, idx_x_end,
+                              idx_z, stride_z, dx_inv, dz_inv);
+
+        if (idx_x_end < solver->local_size[0])
+            fill_eq_coeffs_line_scalar(gh, cp, solver, vars, idx_x_end, solver->local_size[0],
+                                       idx_z, stride_z, dx_inv, dz_inv);
+    }
 }
 
 static void mirror(CoordPatch *cp, double *dst)