summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2022-09-06 12:22:46 +0200
committerAnton Khirnov <anton@khirnov.net>2022-09-06 12:25:46 +0200
commitc1532eebeab91159eb0ee18552da6a31ab3248ef (patch)
treed12933f3a1da9cae8b728da7bd462d1b05a8e67c
parentff2c324b778836e2b1ec44ec9cfeca3b73f5b942 (diff)
Add AVX2 and AVX512 versions of fill_eq_coeffs_line
-rw-r--r--src/fill_eq_coeffs_template.c139
-rw-r--r--src/qms.c75
2 files changed, 169 insertions, 45 deletions
diff --git a/src/fill_eq_coeffs_template.c b/src/fill_eq_coeffs_template.c
index c5fe884..6e06c01 100644
--- a/src/fill_eq_coeffs_template.c
+++ b/src/fill_eq_coeffs_template.c
@@ -1,11 +1,40 @@
-#define ELEM_TYPE double
-#define ELEM_SIZE sizeof(ELEM_TYPE) / sizeof(double)
+#ifdef ELEM_SCALAR
+
+# define TYPE scalar
+
+# define ELEM_TYPE double
+
+# define ELEM_SPLAT(x) (x)
+
+# define LOAD(arr, idx) (arr[idx])
+# define STORE(arr, idx, val) arr[idx] = val
+
+#elif defined ELEM_AVX512
+
+# define TYPE avx512
+
+# define ELEM_TYPE __m512d
-#define ELEM_SPLAT(x) (x)
+# define ELEM_SPLAT(x) _mm512_set1_pd(x)
+
+#define LOAD(arr, idx) _mm512_loadu_pd((arr) + (idx))
+#define STORE(arr, idx, val) _mm512_storeu_pd((arr) + (idx), val)
+#elif defined ELEM_AVX2
+
+# define TYPE avx2
+
+# define ELEM_TYPE __m256d
+
+# define ELEM_SPLAT(x) _mm256_set1_pd(x)
+
+#define LOAD(arr, idx) _mm256_loadu_pd((arr) + (idx))
+#define STORE(arr, idx, val) _mm256_storeu_pd((arr) + (idx), val)
+#else
+# error "Unknown element type"
+#endif
-#define LOAD(arr, idx) (arr[idx])
-#define STORE(arr, idx, val) arr[idx] = val
+#define ELEM_SIZE sizeof(ELEM_TYPE) / sizeof(double)
// while gcc supports multiplying vectors by scalars, icc does not,
// which forces us to do this
@@ -22,7 +51,11 @@
#define C1_12 ELEM_SPLAT(1. / 12.0)
#define C1_144 ELEM_SPLAT(1.0 / 144.0)
-static inline ELEM_TYPE fd1_4(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv)
+#define JOIN2(a, b) a ## _ ## b
+#define FUNC2(name, type) JOIN2(name, type)
+#define FUNC(name) FUNC2(name, TYPE)
+
+static inline ELEM_TYPE FUNC(fd1_4)(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv)
{
const ELEM_TYPE ap2 = LOAD(arr, 2 * stride);
const ELEM_TYPE ap1 = LOAD(arr, 1 * stride);
@@ -31,7 +64,7 @@ static inline ELEM_TYPE fd1_4(const double *arr, const ptrdiff_t stride, const E
return (-ap2 + EIGHT * ap1 - EIGHT * am1 + am2) * h_inv * C1_12;
}
-static inline ELEM_TYPE fd2_4(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv)
+static inline ELEM_TYPE FUNC(fd2_4)(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv)
{
const ELEM_TYPE ap2 = LOAD(arr, 2 * stride);
const ELEM_TYPE ap1 = LOAD(arr, 1 * stride);
@@ -41,8 +74,8 @@ static inline ELEM_TYPE fd2_4(const double *arr, const ptrdiff_t stride, const E
return (-ap2 + C16 * ap1 - C30 * a0 + C16 * am1 - am2) * SQR(h_inv) * C1_12;
}
-static inline ELEM_TYPE fd11_4(const double *arr, ptrdiff_t stride_x, ptrdiff_t stride_z,
- ELEM_TYPE dx_inv, ELEM_TYPE dz_inv)
+static inline ELEM_TYPE FUNC(fd11_4)(const double *arr, ptrdiff_t stride_x, ptrdiff_t stride_z,
+ ELEM_TYPE dx_inv, ELEM_TYPE dz_inv)
{
const ELEM_TYPE ap2p2 = LOAD(arr, 2 * stride_x + 2 * stride_z);
const ELEM_TYPE ap2p1 = LOAD(arr, 2 * stride_x + 1 * stride_z);
@@ -114,9 +147,9 @@ static double diff8_dxz(const double *arr, ptrdiff_t stride, double dx_inv, doub
#define diff2 fd2_8
#define diff11 fd11_8
#else
-#define diff1 fd1_4
-#define diff2 fd2_4
-#define diff11 fd11_4
+#define diff1 FUNC(fd1_4)
+#define diff2 FUNC(fd2_4)
+#define diff11 FUNC(fd11_4)
#endif
#define diff_dx(arr, stride, dx_inv, dz_inv) (diff1 (arr, 1, dx_inv))
@@ -125,23 +158,21 @@ static double diff8_dxz(const double *arr, ptrdiff_t stride, double dx_inv, doub
#define diff_dzz(arr, stride, dx_inv, dz_inv) (diff2 (arr, stride, dz_inv))
#define diff_dxz(arr, stride, dx_inv, dz_inv) (diff11(arr, 1, stride, dx_inv, dz_inv))
-static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext *solver,
- const double * const vars[], const int idx_z,
- const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal)
+static void
+FUNC(fill_eq_coeffs_line)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver,
+ const double * const vars[],
+ const int idx_x_start, const int idx_x_end,
+ const int idx_z,
+ const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal)
{
const ELEM_TYPE dx_inv = ELEM_SPLAT(dx_inv_scal);
const ELEM_TYPE dz_inv = ELEM_SPLAT(dz_inv_scal);
- for (int idx_x = 0; idx_x < solver->local_size[0]; idx_x++) {
+ for (int idx_x = idx_x_start; idx_x < idx_x_end; idx_x += ELEM_SIZE) {
const int idx_src = CCTK_GFINDEX3D(gh, idx_x + cp->offset_left[0], cp->y_idx, idx_z + cp->offset_left[1]);
const int idx_dc = idx_z * solver->diff_coeffs[0]->stride + idx_x;
const int idx_rhs = idx_z * solver->rhs_stride + idx_x;
- const double x = vars[RHS_VAR_X][idx_src];
- const int on_axis = fabs(x) < 1e-13;
-
-#define AXIS_VAL(axis, normal) (on_axis ? (axis) : (normal))
-
ELEM_TYPE K[3][3], Km[3][3], Ku[3][3], dK[3][3][3];
ELEM_TYPE gtu[3][3], g[3][3], gu[3][3];
ELEM_TYPE dg[3][3][3], dgu[3][3][3], d2g[3][3][3][3], gudot[3][3];
@@ -162,6 +193,19 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
#define DZZ(var) (diff2 (&vars[RHS_VAR_ ## var][idx_src], stride_z, dz_inv))
#define DXZ(var) (diff11(&vars[RHS_VAR_ ## var][idx_src], 1, stride_z, dx_inv, dz_inv))
+ const ELEM_TYPE x = LOAD_VAR(X);
+
+// special treatment for the axis is only implemented in the scalar version
+#ifdef ELEM_SCALAR
+ const int on_axis = fabs(x) < 1e-13;
+
+# define AXIS_VAL(axis, normal) (on_axis ? (axis) : (normal))
+#else
+ const int on_axis = 0;
+
+# define AXIS_VAL(axis, normal) (normal)
+#endif
+
const ELEM_TYPE gtxx = LOAD_VAR(GTXX);
const ELEM_TYPE gtyy = LOAD_VAR(GTYY);
const ELEM_TYPE gtzz = LOAD_VAR(GTZZ);
@@ -224,10 +268,10 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
{ dxx_gt13, ZERO, dxx_gt33 },
},
{
- { ZERO, AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO },
- { AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO,
- AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) },
- { ZERO, AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO },
+ { ZERO, AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO },
+ { AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO,
+ AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) },
+ { ZERO, AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO },
},
{
{ dxz_gt11, ZERO, dxz_gt13 },
@@ -238,16 +282,16 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
},
{
{
- { ZERO, AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO },
- { AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO,
- AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) },
- { ZERO, AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO },
+ { ZERO, AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO },
+ { AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO,
+ AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) },
+ { ZERO, AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO },
},
{
{ AXIS_VAL(dxx_gt22, dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x)), ZERO,
- AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) },
+ AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) },
{ ZERO, AXIS_VAL(dxx_gt11, dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x)), ZERO },
- { AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO,
+ { AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO,
AXIS_VAL(dxx_gt33, dx_gt33 / x) },
},
{
@@ -348,9 +392,9 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
const ELEM_TYPE dzz_alpha = DZZ(ALPHA);
const ELEM_TYPE dxz_alpha = DXZ(ALPHA);
- const ELEM_TYPE d2alpha[3][3] = {{ dxx_alpha, 0, dxz_alpha },
- { 0, AXIS_VAL(dxx_alpha, dx_alpha / x), 0 },
- { dxz_alpha, 0, dzz_alpha }};
+ const ELEM_TYPE d2alpha[3][3] = {{ dxx_alpha, ZERO, dxz_alpha },
+ { ZERO, AXIS_VAL(dxx_alpha, dx_alpha / x), ZERO },
+ { dxz_alpha, ZERO, dzz_alpha }};
const ELEM_TYPE betax = LOAD_VAR(BETAX);
const ELEM_TYPE betaz = LOAD_VAR(BETAX);
@@ -433,9 +477,9 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
for (int j = 0; j < 3; j++)
for (int k = 0; k < 3; k++)
for (int l = 0; l < 3; l++) {
- dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi);
- dK[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi) +
- (1.0 / 3.0) * (dtrK[j] * g[k][l] + trK * dg[j][k][l]);
+ dg[j][k][l] = -TWO * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi);
+ dK[j][k][l] = -TWO * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi) +
+ THIRD * (dtrK[j] * g[k][l] + trK * dg[j][k][l]);
}
// ∂_{jk} g_{lm}
@@ -443,10 +487,10 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
for (int k = 0; k < 3; k++)
for (int l = 0; l < 3; l++)
for (int m = 0; m < 3; m++) {
- d2g[j][k][l][m] = 6.0 * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) -
- 2.0 * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) -
- 2.0 * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) -
- 2.0 * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) +
+ d2g[j][k][l][m] = SIZ * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) -
+ TWO * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) -
+ TWO * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) -
+ TWO * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) +
d2gt[j][k][l][m] / SQR(phi);
}
@@ -516,7 +560,7 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
// K_{ij}
for (int j = 0; j < 3; j++)
for (int k = 0; k < 3; k++)
- K[j][k] = At[j][k] / SQR(phi) + (1.0 / 3.0) * g[j][k] * trK;
+ K[j][k] = At[j][k] / SQR(phi) + THIRD * g[j][k] * trK;
for (int j = 0; j < 3; j++)
for (int k = 0; k < 3; k++) {
@@ -673,7 +717,7 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_20]->data, idx_dc, gu[0][0] + AXIS_VAL(gu[1][1], ZERO));
STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_02]->data, idx_dc, gu[2][2]);
- STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_11]->data, idx_dc, 2.0 * gu[0][2]);
+ STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_11]->data, idx_dc, TWO * gu[0][2]);
STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_10]->data, idx_dc, -X[0] + AXIS_VAL(ZERO, gu[1][1] / x));
STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_01]->data, idx_dc, -X[2]);
STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_00]->data, idx_dc, -k2);
@@ -681,13 +725,21 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
STORE(solver->rhs, idx_rhs, -TWO * alpha * Kij_Dij_alpha + Gammadot_term +
TWO * alpha * (k_kdot + TWO * alpha * k3) + beta_term);
+#ifdef ELEM_SCALAR
if (on_axis) {
solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].val[idx_z] = gu[1][1];
solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].val[idx_z] = gu[1][1];
}
+#endif
}
}
+#undef AXIS_VAL
+
+#undef FUNC
+#undef FUNC2
+#undef JOIN2
+
#undef LOAD
#undef STORE
@@ -707,3 +759,4 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext
#undef ELEM_SPLAT
#undef ELEM_SIZE
#undef ELEM_TYPE
+#undef TYPE
diff --git a/src/qms.c b/src/qms.c
index 8fae7f0..4a72428 100644
--- a/src/qms.c
+++ b/src/qms.c
@@ -8,6 +8,8 @@
#include <stdlib.h>
#include <string.h>
+#include <immintrin.h>
+
#include <mg2d.h>
#include <mg2d_boundary.h>
@@ -528,10 +530,61 @@ enum RHSVar {
RHS_VAR_NB,
};
+typedef void
+(*FECLine)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver,
+ const double * const vars[],
+ const int idx_x_start, const int idx_x_end,
+ const int idx_z,
+ const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal);
+
+// computing QMS equation coefficients is resource-intensive, so we generate
+// vectorized versions when possible
+//
+// first, instantiate the plain scalar version
+
+#define ELEM_SCALAR
#include "fill_eq_coeffs_template.c"
+#undef ELEM_SCALAR
+
+#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__)
+// ICC miscompiles the vectorized versions of fill_eq_coeffs_line() with higher
+// optimization levels
+# pragma GCC push_options
+# pragma GCC optimize("O1")
+#endif
+
+#ifdef __AVX512F__
+
+#define ELEM_AVX512
+#include "fill_eq_coeffs_template.c"
+#undef ELEM_AVX512
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 8
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx512
+
+#elif defined(__AVX2__)
+
+#define ELEM_AVX2
+#include "fill_eq_coeffs_template.c"
+#undef ELEM_AVX2
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 4
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx2
+
+#else
+
+#define FILL_EQ_COEFFS_BLOCKSIZE 1
+#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_scalar
+
+#endif
+
+#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__)
+# pragma GCC pop_options
+#endif
static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solver)
{
+ FECLine fill_eq_coeffs_line;
cGH *gh = ctx->gh;
int ret;
@@ -568,12 +621,30 @@ static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solve
const double dx_inv = 1.0 / (vars[RHS_VAR_X][1] - vars[RHS_VAR_X][0]);
const double dz_inv = 1.0 / (vars[RHS_VAR_Z][stride_z] - vars[RHS_VAR_Z][0]);
+ const int have_axis = fabs(vars[RHS_VAR_X][cp->offset_left[0]]) < 1e-13;
+ int idx_x_start, idx_x_end;
+
solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_DISCONT;
solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_POLE;
+ // do not execute the vectorized version on the axis,
+ // as it does not support special handling of the values there
+ idx_x_start = FILL_EQ_COEFFS_BLOCKSIZE > 1 && !!have_axis;
+ idx_x_end = idx_x_start + (solver->local_size[0] - idx_x_start) / FILL_EQ_COEFFS_BLOCKSIZE * FILL_EQ_COEFFS_BLOCKSIZE;
+
#pragma omp parallel for
- for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++)
- fill_eq_coeffs_line(gh, cp, solver, vars, idx_z, stride_z, dx_inv, dz_inv);
+ for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) {
+ if (idx_x_start)
+ fill_eq_coeffs_line_scalar(gh, cp, solver, vars, 0, idx_x_start,
+ idx_z, stride_z, dx_inv, dz_inv);
+
+ FILL_EQ_COEFFS_VECTOR(gh, cp, solver, vars, idx_x_start, idx_x_end,
+ idx_z, stride_z, dx_inv, dz_inv);
+
+ if (idx_x_end < solver->local_size[0])
+ fill_eq_coeffs_line_scalar(gh, cp, solver, vars, idx_x_end, solver->local_size[0],
+ idx_z, stride_z, dx_inv, dz_inv);
+ }
}
static void mirror(CoordPatch *cp, double *dst)