From c1532eebeab91159eb0ee18552da6a31ab3248ef Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 6 Sep 2022 12:22:46 +0200 Subject: Add AVX2 and AVX512 versions of fill_eq_coeffs_line --- src/fill_eq_coeffs_template.c | 139 +++++++++++++++++++++++++++++------------- src/qms.c | 75 ++++++++++++++++++++++- 2 files changed, 169 insertions(+), 45 deletions(-) diff --git a/src/fill_eq_coeffs_template.c b/src/fill_eq_coeffs_template.c index c5fe884..6e06c01 100644 --- a/src/fill_eq_coeffs_template.c +++ b/src/fill_eq_coeffs_template.c @@ -1,11 +1,40 @@ -#define ELEM_TYPE double -#define ELEM_SIZE sizeof(ELEM_TYPE) / sizeof(double) +#ifdef ELEM_SCALAR + +# define TYPE scalar + +# define ELEM_TYPE double + +# define ELEM_SPLAT(x) (x) + +# define LOAD(arr, idx) (arr[idx]) +# define STORE(arr, idx, val) arr[idx] = val + +#elif defined ELEM_AVX512 + +# define TYPE avx512 + +# define ELEM_TYPE __m512d -#define ELEM_SPLAT(x) (x) +# define ELEM_SPLAT(x) _mm512_set1_pd(x) + +#define LOAD(arr, idx) _mm512_loadu_pd((arr) + (idx)) +#define STORE(arr, idx, val) _mm512_storeu_pd((arr) + (idx), val) +#elif defined ELEM_AVX2 + +# define TYPE avx2 + +# define ELEM_TYPE __m256d + +# define ELEM_SPLAT(x) _mm256_set1_pd(x) + +#define LOAD(arr, idx) _mm256_loadu_pd((arr) + (idx)) +#define STORE(arr, idx, val) _mm256_storeu_pd((arr) + (idx), val) +#else +# error "Unknown element type" +#endif -#define LOAD(arr, idx) (arr[idx]) -#define STORE(arr, idx, val) arr[idx] = val +#define ELEM_SIZE sizeof(ELEM_TYPE) / sizeof(double) // while gcc supports multiplying vectors by scalars, icc does not, // which forces us to do this @@ -22,7 +51,11 @@ #define C1_12 ELEM_SPLAT(1. / 12.0) #define C1_144 ELEM_SPLAT(1.0 / 144.0) -static inline ELEM_TYPE fd1_4(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv) +#define JOIN2(a, b) a ## _ ## b +#define FUNC2(name, type) JOIN2(name, type) +#define FUNC(name) FUNC2(name, TYPE) + +static inline ELEM_TYPE FUNC(fd1_4)(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv) { const ELEM_TYPE ap2 = LOAD(arr, 2 * stride); const ELEM_TYPE ap1 = LOAD(arr, 1 * stride); @@ -31,7 +64,7 @@ static inline ELEM_TYPE fd1_4(const double *arr, const ptrdiff_t stride, const E return (-ap2 + EIGHT * ap1 - EIGHT * am1 + am2) * h_inv * C1_12; } -static inline ELEM_TYPE fd2_4(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv) +static inline ELEM_TYPE FUNC(fd2_4)(const double *arr, const ptrdiff_t stride, const ELEM_TYPE h_inv) { const ELEM_TYPE ap2 = LOAD(arr, 2 * stride); const ELEM_TYPE ap1 = LOAD(arr, 1 * stride); @@ -41,8 +74,8 @@ static inline ELEM_TYPE fd2_4(const double *arr, const ptrdiff_t stride, const E return (-ap2 + C16 * ap1 - C30 * a0 + C16 * am1 - am2) * SQR(h_inv) * C1_12; } -static inline ELEM_TYPE fd11_4(const double *arr, ptrdiff_t stride_x, ptrdiff_t stride_z, - ELEM_TYPE dx_inv, ELEM_TYPE dz_inv) +static inline ELEM_TYPE FUNC(fd11_4)(const double *arr, ptrdiff_t stride_x, ptrdiff_t stride_z, + ELEM_TYPE dx_inv, ELEM_TYPE dz_inv) { const ELEM_TYPE ap2p2 = LOAD(arr, 2 * stride_x + 2 * stride_z); const ELEM_TYPE ap2p1 = LOAD(arr, 2 * stride_x + 1 * stride_z); @@ -114,9 +147,9 @@ static double diff8_dxz(const double *arr, ptrdiff_t stride, double dx_inv, doub #define diff2 fd2_8 #define diff11 fd11_8 #else -#define diff1 fd1_4 -#define diff2 fd2_4 -#define diff11 fd11_4 +#define diff1 FUNC(fd1_4) +#define diff2 FUNC(fd2_4) +#define diff11 FUNC(fd11_4) #endif #define diff_dx(arr, stride, dx_inv, dz_inv) (diff1 (arr, 1, dx_inv)) @@ -125,23 +158,21 @@ static double diff8_dxz(const double *arr, ptrdiff_t stride, double dx_inv, doub #define diff_dzz(arr, stride, dx_inv, dz_inv) (diff2 (arr, stride, dz_inv)) #define diff_dxz(arr, stride, dx_inv, dz_inv) (diff11(arr, 1, stride, dx_inv, dz_inv)) -static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext *solver, - const double * const vars[], const int idx_z, - const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal) +static void +FUNC(fill_eq_coeffs_line)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver, + const double * const vars[], + const int idx_x_start, const int idx_x_end, + const int idx_z, + const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal) { const ELEM_TYPE dx_inv = ELEM_SPLAT(dx_inv_scal); const ELEM_TYPE dz_inv = ELEM_SPLAT(dz_inv_scal); - for (int idx_x = 0; idx_x < solver->local_size[0]; idx_x++) { + for (int idx_x = idx_x_start; idx_x < idx_x_end; idx_x += ELEM_SIZE) { const int idx_src = CCTK_GFINDEX3D(gh, idx_x + cp->offset_left[0], cp->y_idx, idx_z + cp->offset_left[1]); const int idx_dc = idx_z * solver->diff_coeffs[0]->stride + idx_x; const int idx_rhs = idx_z * solver->rhs_stride + idx_x; - const double x = vars[RHS_VAR_X][idx_src]; - const int on_axis = fabs(x) < 1e-13; - -#define AXIS_VAL(axis, normal) (on_axis ? (axis) : (normal)) - ELEM_TYPE K[3][3], Km[3][3], Ku[3][3], dK[3][3][3]; ELEM_TYPE gtu[3][3], g[3][3], gu[3][3]; ELEM_TYPE dg[3][3][3], dgu[3][3][3], d2g[3][3][3][3], gudot[3][3]; @@ -162,6 +193,19 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext #define DZZ(var) (diff2 (&vars[RHS_VAR_ ## var][idx_src], stride_z, dz_inv)) #define DXZ(var) (diff11(&vars[RHS_VAR_ ## var][idx_src], 1, stride_z, dx_inv, dz_inv)) + const ELEM_TYPE x = LOAD_VAR(X); + +// special treatment for the axis is only implemented in the scalar version +#ifdef ELEM_SCALAR + const int on_axis = fabs(x) < 1e-13; + +# define AXIS_VAL(axis, normal) (on_axis ? (axis) : (normal)) +#else + const int on_axis = 0; + +# define AXIS_VAL(axis, normal) (normal) +#endif + const ELEM_TYPE gtxx = LOAD_VAR(GTXX); const ELEM_TYPE gtyy = LOAD_VAR(GTYY); const ELEM_TYPE gtzz = LOAD_VAR(GTZZ); @@ -224,10 +268,10 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext { dxx_gt13, ZERO, dxx_gt33 }, }, { - { ZERO, AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO }, - { AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO, - AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) }, - { ZERO, AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO }, + { ZERO, AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO }, + { AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO, + AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) }, + { ZERO, AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO }, }, { { dxz_gt11, ZERO, dxz_gt13 }, @@ -238,16 +282,16 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext }, { { - { ZERO, AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO }, - { AXIS_VAL(0.5 * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO, - AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) }, - { ZERO, AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO }, + { ZERO, AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO }, + { AXIS_VAL(HALF * (dxx_gt11 - dxx_gt22), (dx_gt11 - dx_gt22) / x - (gtxx - gtyy) / SQR(x)), ZERO, + AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) }, + { ZERO, AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO }, }, { { AXIS_VAL(dxx_gt22, dx_gt11 / x - 2 * (gtxx - gtyy) / SQR(x)), ZERO, - AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) }, + AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)) }, { ZERO, AXIS_VAL(dxx_gt11, dx_gt22 / x + 2.0 * (gtxx - gtyy) / SQR(x)), ZERO }, - { AXIS_VAL(0.5 * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO, + { AXIS_VAL(HALF * dxx_gt13, dx_gt13 / x - gtxz / SQR(x)), ZERO, AXIS_VAL(dxx_gt33, dx_gt33 / x) }, }, { @@ -348,9 +392,9 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext const ELEM_TYPE dzz_alpha = DZZ(ALPHA); const ELEM_TYPE dxz_alpha = DXZ(ALPHA); - const ELEM_TYPE d2alpha[3][3] = {{ dxx_alpha, 0, dxz_alpha }, - { 0, AXIS_VAL(dxx_alpha, dx_alpha / x), 0 }, - { dxz_alpha, 0, dzz_alpha }}; + const ELEM_TYPE d2alpha[3][3] = {{ dxx_alpha, ZERO, dxz_alpha }, + { ZERO, AXIS_VAL(dxx_alpha, dx_alpha / x), ZERO }, + { dxz_alpha, ZERO, dzz_alpha }}; const ELEM_TYPE betax = LOAD_VAR(BETAX); const ELEM_TYPE betaz = LOAD_VAR(BETAX); @@ -433,9 +477,9 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext for (int j = 0; j < 3; j++) for (int k = 0; k < 3; k++) for (int l = 0; l < 3; l++) { - dg[j][k][l] = -2.0 * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi); - dK[j][k][l] = -2.0 * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi) + - (1.0 / 3.0) * (dtrK[j] * g[k][l] + trK * dg[j][k][l]); + dg[j][k][l] = -TWO * dphi[j] * gt[k][l] / (phi * SQR(phi)) + dgt[j][k][l] / SQR(phi); + dK[j][k][l] = -TWO * dphi[j] * At[k][l] / (phi * SQR(phi)) + dAt[j][k][l] / SQR(phi) + + THIRD * (dtrK[j] * g[k][l] + trK * dg[j][k][l]); } // ∂_{jk} g_{lm} @@ -443,10 +487,10 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext for (int k = 0; k < 3; k++) for (int l = 0; l < 3; l++) for (int m = 0; m < 3; m++) { - d2g[j][k][l][m] = 6.0 * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) - - 2.0 * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) - - 2.0 * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) - - 2.0 * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) + + d2g[j][k][l][m] = SIZ * gt [l][m] * dphi[j] * dphi[k] / SQR(SQR(phi)) - + TWO * gt [l][m] * d2phi[j][k] / (phi * SQR(phi)) - + TWO * dgt [j][l][m] * dphi[k] / (phi * SQR(phi)) - + TWO * dgt [k][l][m] * dphi[j] / (phi * SQR(phi)) + d2gt[j][k][l][m] / SQR(phi); } @@ -516,7 +560,7 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext // K_{ij} for (int j = 0; j < 3; j++) for (int k = 0; k < 3; k++) - K[j][k] = At[j][k] / SQR(phi) + (1.0 / 3.0) * g[j][k] * trK; + K[j][k] = At[j][k] / SQR(phi) + THIRD * g[j][k] * trK; for (int j = 0; j < 3; j++) for (int k = 0; k < 3; k++) { @@ -673,7 +717,7 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_20]->data, idx_dc, gu[0][0] + AXIS_VAL(gu[1][1], ZERO)); STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_02]->data, idx_dc, gu[2][2]); - STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_11]->data, idx_dc, 2.0 * gu[0][2]); + STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_11]->data, idx_dc, TWO * gu[0][2]); STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_10]->data, idx_dc, -X[0] + AXIS_VAL(ZERO, gu[1][1] / x)); STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_01]->data, idx_dc, -X[2]); STORE(solver->diff_coeffs[MG2D_DIFF_COEFF_00]->data, idx_dc, -k2); @@ -681,13 +725,21 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext STORE(solver->rhs, idx_rhs, -TWO * alpha * Kij_Dij_alpha + Gammadot_term + TWO * alpha * (k_kdot + TWO * alpha * k3) + beta_term); +#ifdef ELEM_SCALAR if (on_axis) { solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].val[idx_z] = gu[1][1]; solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].val[idx_z] = gu[1][1]; } +#endif } } +#undef AXIS_VAL + +#undef FUNC +#undef FUNC2 +#undef JOIN2 + #undef LOAD #undef STORE @@ -707,3 +759,4 @@ static void fill_eq_coeffs_line(const cGH *gh, const CoordPatch *cp, MG2DContext #undef ELEM_SPLAT #undef ELEM_SIZE #undef ELEM_TYPE +#undef TYPE diff --git a/src/qms.c b/src/qms.c index 8fae7f0..4a72428 100644 --- a/src/qms.c +++ b/src/qms.c @@ -8,6 +8,8 @@ #include #include +#include + #include #include @@ -528,10 +530,61 @@ enum RHSVar { RHS_VAR_NB, }; +typedef void +(*FECLine)(const cGH *gh, const CoordPatch *cp, MG2DContext *solver, + const double * const vars[], + const int idx_x_start, const int idx_x_end, + const int idx_z, + const ptrdiff_t stride_z, const double dx_inv_scal, const double dz_inv_scal); + +// computing QMS equation coefficients is resource-intensive, so we generate +// vectorized versions when possible +// +// first, instantiate the plain scalar version + +#define ELEM_SCALAR #include "fill_eq_coeffs_template.c" +#undef ELEM_SCALAR + +#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__) +// ICC miscompiles the vectorized versions of fill_eq_coeffs_line() with higher +// optimization levels +# pragma GCC push_options +# pragma GCC optimize("O1") +#endif + +#ifdef __AVX512F__ + +#define ELEM_AVX512 +#include "fill_eq_coeffs_template.c" +#undef ELEM_AVX512 + +#define FILL_EQ_COEFFS_BLOCKSIZE 8 +#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx512 + +#elif defined(__AVX2__) + +#define ELEM_AVX2 +#include "fill_eq_coeffs_template.c" +#undef ELEM_AVX2 + +#define FILL_EQ_COEFFS_BLOCKSIZE 4 +#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_avx2 + +#else + +#define FILL_EQ_COEFFS_BLOCKSIZE 1 +#define FILL_EQ_COEFFS_VECTOR fill_eq_coeffs_line_scalar + +#endif + +#if defined(__INTEL_COMPILER) && defined(__OPTIMIZE__) +# pragma GCC pop_options +#endif static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solver) { + FECLine fill_eq_coeffs_line; cGH *gh = ctx->gh; int ret; @@ -568,12 +621,30 @@ static void fill_eq_coeffs(QMSMGContext *ctx, CoordPatch *cp, MG2DContext *solve const double dx_inv = 1.0 / (vars[RHS_VAR_X][1] - vars[RHS_VAR_X][0]); const double dz_inv = 1.0 / (vars[RHS_VAR_Z][stride_z] - vars[RHS_VAR_Z][0]); + const int have_axis = fabs(vars[RHS_VAR_X][cp->offset_left[0]]) < 1e-13; + int idx_x_start, idx_x_end; + solver->diff_coeffs[MG2D_DIFF_COEFF_20]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_DISCONT; solver->diff_coeffs[MG2D_DIFF_COEFF_10]->boundaries[MG2D_BOUNDARY_0L].flags = MG2D_DC_FLAG_POLE; + // do not execute the vectorized version on the axis, + // as it does not support special handling of the values there + idx_x_start = FILL_EQ_COEFFS_BLOCKSIZE > 1 && !!have_axis; + idx_x_end = idx_x_start + (solver->local_size[0] - idx_x_start) / FILL_EQ_COEFFS_BLOCKSIZE * FILL_EQ_COEFFS_BLOCKSIZE; + #pragma omp parallel for - for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) - fill_eq_coeffs_line(gh, cp, solver, vars, idx_z, stride_z, dx_inv, dz_inv); + for (int idx_z = 0; idx_z < solver->local_size[1]; idx_z++) { + if (idx_x_start) + fill_eq_coeffs_line_scalar(gh, cp, solver, vars, 0, idx_x_start, + idx_z, stride_z, dx_inv, dz_inv); + + FILL_EQ_COEFFS_VECTOR(gh, cp, solver, vars, idx_x_start, idx_x_end, + idx_z, stride_z, dx_inv, dz_inv); + + if (idx_x_end < solver->local_size[0]) + fill_eq_coeffs_line_scalar(gh, cp, solver, vars, idx_x_end, solver->local_size[0], + idx_z, stride_z, dx_inv, dz_inv); + } } static void mirror(CoordPatch *cp, double *dst) -- cgit v1.2.3