aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2024-04-15 21:44:14 +0200
committerAnton Khirnov <anton@khirnov.net>2024-04-15 21:44:14 +0200
commite30cfde7614be7062249954eab6c3f56eeabbb51 (patch)
tree1a27f188ed94b9ae4d566150ca951a8ac7f0fad1
parent982d71cb08f6ccf564c0558c659ae2756bb39ba1 (diff)
residual_calc: accept all diff coefficients in a single array
Plus an offset parameter that signals the distance between different coefficients. This allows to avoid passing so many pointers around, which reduces register pressure and simplifies writing SIMD. Seems also to be a little faster.
-rw-r--r--ell_grid_solve.c8
-rw-r--r--residual_calc.asm84
-rw-r--r--residual_calc.c53
-rw-r--r--residual_calc.h12
4 files changed, 80 insertions, 77 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index ecd9d8f..1cf2798 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -160,17 +160,15 @@ static void boundaries_sync(EGSContext *ctx, NDArray *a_dst)
static void residual_calc(EGSContext *ctx, int export_res)
{
EGSInternal *priv = ctx->priv;
- const double *diff_coeffs[MG2D_DIFF_COEFF_NB];
ptrdiff_t *offset = priv->residual_calc_offset[priv->steps_since_sync];
size_t *size = priv->residual_calc_size[priv->steps_since_sync];
+ const double *diff_coeffs0 = NDA_PTR2D(priv->diff_coeffs[0], offset[0], offset[1]);
+ const double *diff_coeffs1 = NDA_PTR2D(priv->diff_coeffs[1], offset[0], offset[1]);
NDArray *dst = export_res ? ctx->residual : priv->u_next;
int reflect_flags = 0;
mg2di_timer_start(&ctx->timer_res_calc);
- for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = NDA_PTR2D(priv->diff_coeffs[i], offset[0], offset[1]);
-
for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++)
if (ctx->boundaries[bnd_idx]->type == MG2D_BC_TYPE_REFLECT &&
priv->dg->components[priv->local_component].bnd_is_outer[bnd_idx])
@@ -180,7 +178,7 @@ static void residual_calc(EGSContext *ctx, int export_res)
NDA_PTR2D(dst, offset[0], offset[1]), dst->stride[0],
NDA_PTR2D(ctx->u, offset[0], offset[1]), ctx->u->stride[0],
NDA_PTR2D(ctx->rhs, offset[0], offset[1]), ctx->rhs->stride[0],
- diff_coeffs, priv->diff_coeffs[0]->stride[0],
+ diff_coeffs0, priv->diff_coeffs[0]->stride[0], diff_coeffs1 - diff_coeffs0,
export_res ? 0.0 : 1.0, export_res ? 1.0 : priv->r.relax_factor,
reflect_flags, FD_STENCIL_MAX);
diff --git a/residual_calc.asm b/residual_calc.asm
index 9cd530d..42eb50b 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -22,14 +22,6 @@
; double precision
%define ELEM_SIZE 8
-; offsets to FD coefficients for given derivative
-%define OFF_DIFF_COEFF_00 0 * gprsize
-%define OFF_DIFF_COEFF_01 1 * gprsize
-%define OFF_DIFF_COEFF_10 2 * gprsize
-%define OFF_DIFF_COEFF_11 3 * gprsize
-%define OFF_DIFF_COEFF_02 4 * gprsize
-%define OFF_DIFF_COEFF_20 5 * gprsize
-
SECTION .rodata
const8: times 8 dq 8.0
@@ -65,15 +57,15 @@ SECTION .text
%define up2q uq + ELEM_SIZE * 2
%define um1q uq - ELEM_SIZE
%define um2q uq - ELEM_SIZE * 2
- %define coeffs1q diff_coeffs10q
- %define coeffs2q diff_coeffs20q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_10
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_20
%else
%define up1q u_upq
%define up2q u_up2q
%define um1q u_downq
%define um2q u_down2q
- %define coeffs1q diff_coeffs01q
- %define coeffs2q diff_coeffs02q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_01
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_02
%endif
; load the function values
@@ -91,7 +83,7 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
+ vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10
; second derivative
addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1]
@@ -102,7 +94,7 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
+ vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20
%endmacro
; calculate and add residual contributions from the second mixed derivative
@@ -138,13 +130,30 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
%endmacro
; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
- %define stencil %1
+
+%define stencil %1
+
+%if %2
+%define opname add
+%else
+%define opname calc
+%endif
+
+; typedef void ResidualLineCalc/Add(
+; size_t linesize, double *dst, double *dst_max,
+; ptrdiff_t u_stride, const double *u, const double *rhs,
+; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
+; double res_mult, [double u_mult (add only)])
+cglobal residual_line_ %+ opname %+ _s %+ stencil, \
+ 8, 13, 14 + stencil * 2, \
+ linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
%if %2
vpermq m2, m1, 0
@@ -156,27 +165,13 @@ SECTION .text
psrlq m13, 1
movu m12, [res_maxq]
- ; load pointers to the equation coefficients
- %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
- mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
- mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
- mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
- mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
- mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
- mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
; setup the data pointers and the loop counter
shl u_strideq, 3
+ shl diff_coeffs_offsetq, 3
shl linesizeq, 3
add dstq, linesizeq
add uq, linesizeq
add rhsq, linesizeq
- add diff_coeffs00q, linesizeq
- add diff_coeffs01q, linesizeq
- add diff_coeffs10q, linesizeq
- add diff_coeffs11q, linesizeq
- add diff_coeffs02q, linesizeq
- add diff_coeffs20q, linesizeq
neg linesizeq
; from now on, the register that held linesize is used as the offset into data arrays
%define offsetq linesizeq
@@ -195,13 +190,26 @@ SECTION .text
movu m14, [const8]
%endif
+ ; offsets to FD coefficients for given derivative
+ %define diff_coeff_offset_01 1 * diff_coeffs_offsetq
+ %define diff_coeff_offset_10 2 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_11 diff_coeffs_off3q
+
+ %define diff_coeff_offset_02 4 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_20 diff_coeffs_off5q
+
+
.loop:
xorpd m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
- vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00
%if %2
mulpd m3, m6, m2
%endif
@@ -223,6 +231,7 @@ SECTION .text
%endif
; store the result
+ add diff_coeffsq, mmsize
add offsetq, mmsize
jg .store_partial
@@ -267,18 +276,7 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 0
-cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 1
-
-INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 0
-
-cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 1
diff --git a/residual_calc.c b/residual_calc.c
index c06c966..948655e 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -33,11 +33,11 @@
typedef void ResidualLineCalc(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult);
typedef void ResidualLineAdd (size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult);
typedef struct ResidualCalcTask {
@@ -52,8 +52,9 @@ typedef struct ResidualCalcTask {
const double *rhs;
ptrdiff_t rhs_stride;
- const double * const *diff_coeffs;
+ const double *diff_coeffs;
ptrdiff_t diff_coeffs_stride;
+ ptrdiff_t diff_coeffs_offset;
double u_mult;
double res_mult;
@@ -74,10 +75,10 @@ struct ResidualCalcInternal {
};
#if HAVE_NASM
-ResidualLineCalc mg2di_residual_calc_line_s1_fma3;
-ResidualLineCalc mg2di_residual_calc_line_s2_fma3;
-ResidualLineAdd mg2di_residual_add_line_s1_fma3;
-ResidualLineAdd mg2di_residual_add_line_s2_fma3;
+ResidualLineCalc mg2di_residual_line_calc_s1_fma3;
+ResidualLineCalc mg2di_residual_line_calc_s2_fma3;
+ResidualLineAdd mg2di_residual_line_add_s1_fma3;
+ResidualLineAdd mg2di_residual_line_add_s2_fma3;
#endif
static void
@@ -142,7 +143,7 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride)
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult)
{
double res_max = 0.0, res_abs;
@@ -154,7 +155,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = res_mult * res;
res_abs = fabs(res);
@@ -166,7 +167,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult)
{
double res_max = 0.0, res_abs;
@@ -178,7 +179,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = u_mult * u[i] + res_mult * res;
res_abs = fabs(res);
@@ -190,7 +191,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max
static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult)
{
double res_max = 0.0, res_abs;
@@ -202,7 +203,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = res_mult * res;
res_abs = fabs(res);
@@ -214,7 +215,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult)
{
double res_max = 0.0, res_abs;
@@ -226,7 +227,7 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = u_mult * u[i] + res_mult * res;
res_abs = fabs(res);
@@ -241,24 +242,21 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre
ResidualCalcInternal *priv = arg;
ResidualCalcTask *task = &priv->task;
- const double *diff_coeffs[MG2D_DIFF_COEFF_NB];
+ const double *diff_coeffs = task->diff_coeffs + job_idx * task->diff_coeffs_stride;
double *dst = task->dst + job_idx * task->dst_stride;
- for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride;
-
if (task->u_mult == 0.0) {
priv->residual_line_calc(task->size[0], dst,
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->res_mult);
+ diff_coeffs, task->diff_coeffs_offset, task->res_mult);
} else {
priv->residual_line_add(task->size[0], dst,
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->res_mult, task->u_mult);
+ diff_coeffs, task->diff_coeffs_offset, task->res_mult, task->u_mult);
}
if (task->reflect & (1 << MG2D_BOUNDARY_0L)) {
@@ -286,8 +284,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
double *dst, ptrdiff_t dst_stride,
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_stride,
+ ptrdiff_t diff_coeffs_offset,
double u_mult, double res_mult,
int reflect, size_t reflect_dist)
{
@@ -307,6 +305,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
task->rhs_stride = rhs_stride;
task->diff_coeffs = diff_coeffs;
task->diff_coeffs_stride = diff_coeffs_stride;
+ task->diff_coeffs_offset = diff_coeffs_offset;
task->u_mult = u_mult;
task->res_mult = res_mult;
task->reflect = reflect;
@@ -333,8 +332,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = residual_add_line_s1_c;
#if HAVE_NASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->residual_line_calc = mg2di_residual_calc_line_s1_fma3;
- priv->residual_line_add = mg2di_residual_add_line_s1_fma3;
+ priv->residual_line_calc = mg2di_residual_line_calc_s1_fma3;
+ priv->residual_line_add = mg2di_residual_line_add_s1_fma3;
priv->calc_blocksize = 4;
}
#endif
@@ -344,8 +343,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = residual_add_line_s2_c;
#if HAVE_NASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->residual_line_calc = mg2di_residual_calc_line_s2_fma3;
- priv->residual_line_add = mg2di_residual_add_line_s2_fma3;
+ priv->residual_line_calc = mg2di_residual_line_calc_s2_fma3;
+ priv->residual_line_add = mg2di_residual_line_add_s2_fma3;
priv->calc_blocksize = 4;
}
#endif
diff --git a/residual_calc.h b/residual_calc.h
index 1498bc9..01ad602 100644
--- a/residual_calc.h
+++ b/residual_calc.h
@@ -68,6 +68,14 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx);
/**
* Calculate the residual.
*
+ * @param diff_coeffs Array containing all PDE coefficients.
+ * @param diff_coeffs_stride Distance between adjacent lines for a given PDE
+ * coefficient.
+ * @param diff_coeffs_offset Distance between blocks containing different PDE
+ * coefficients. I.e. diff_coeffs[i * diff_coeffs_offset + j * diff_coeffs_stride + k]
+ * is the value of the i-th PDE coefficient (MG2D_DIFF_COEFF_*) at the
+ * (j, k)th gridpoint.
+ *
* @param reflect Flags indicating if dst should be extended by reflection.
* If the 'MG2D_BOUNDARY_x'th bit is set in reflect, then dst should be
* reflected reflect_dist points beyond its bounds in the corresponding
@@ -79,8 +87,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
double *dst, ptrdiff_t dst_stride,
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_stride,
+ ptrdiff_t diff_coeffs_offset,
double u_mult, double res_mult,
int reflect, size_t reflect_dist);