aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ell_grid_solve.c8
-rw-r--r--residual_calc.asm84
-rw-r--r--residual_calc.c53
-rw-r--r--residual_calc.h12
4 files changed, 80 insertions, 77 deletions
diff --git a/ell_grid_solve.c b/ell_grid_solve.c
index ecd9d8f..1cf2798 100644
--- a/ell_grid_solve.c
+++ b/ell_grid_solve.c
@@ -160,17 +160,15 @@ static void boundaries_sync(EGSContext *ctx, NDArray *a_dst)
static void residual_calc(EGSContext *ctx, int export_res)
{
EGSInternal *priv = ctx->priv;
- const double *diff_coeffs[MG2D_DIFF_COEFF_NB];
ptrdiff_t *offset = priv->residual_calc_offset[priv->steps_since_sync];
size_t *size = priv->residual_calc_size[priv->steps_since_sync];
+ const double *diff_coeffs0 = NDA_PTR2D(priv->diff_coeffs[0], offset[0], offset[1]);
+ const double *diff_coeffs1 = NDA_PTR2D(priv->diff_coeffs[1], offset[0], offset[1]);
NDArray *dst = export_res ? ctx->residual : priv->u_next;
int reflect_flags = 0;
mg2di_timer_start(&ctx->timer_res_calc);
- for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = NDA_PTR2D(priv->diff_coeffs[i], offset[0], offset[1]);
-
for (int bnd_idx = 0; bnd_idx < 4; bnd_idx++)
if (ctx->boundaries[bnd_idx]->type == MG2D_BC_TYPE_REFLECT &&
priv->dg->components[priv->local_component].bnd_is_outer[bnd_idx])
@@ -180,7 +178,7 @@ static void residual_calc(EGSContext *ctx, int export_res)
NDA_PTR2D(dst, offset[0], offset[1]), dst->stride[0],
NDA_PTR2D(ctx->u, offset[0], offset[1]), ctx->u->stride[0],
NDA_PTR2D(ctx->rhs, offset[0], offset[1]), ctx->rhs->stride[0],
- diff_coeffs, priv->diff_coeffs[0]->stride[0],
+ diff_coeffs0, priv->diff_coeffs[0]->stride[0], diff_coeffs1 - diff_coeffs0,
export_res ? 0.0 : 1.0, export_res ? 1.0 : priv->r.relax_factor,
reflect_flags, FD_STENCIL_MAX);
diff --git a/residual_calc.asm b/residual_calc.asm
index 9cd530d..42eb50b 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -22,14 +22,6 @@
; double precision
%define ELEM_SIZE 8
-; offsets to FD coefficients for given derivative
-%define OFF_DIFF_COEFF_00 0 * gprsize
-%define OFF_DIFF_COEFF_01 1 * gprsize
-%define OFF_DIFF_COEFF_10 2 * gprsize
-%define OFF_DIFF_COEFF_11 3 * gprsize
-%define OFF_DIFF_COEFF_02 4 * gprsize
-%define OFF_DIFF_COEFF_20 5 * gprsize
-
SECTION .rodata
const8: times 8 dq 8.0
@@ -65,15 +57,15 @@ SECTION .text
%define up2q uq + ELEM_SIZE * 2
%define um1q uq - ELEM_SIZE
%define um2q uq - ELEM_SIZE * 2
- %define coeffs1q diff_coeffs10q
- %define coeffs2q diff_coeffs20q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_10
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_20
%else
%define up1q u_upq
%define up2q u_up2q
%define um1q u_downq
%define um2q u_down2q
- %define coeffs1q diff_coeffs01q
- %define coeffs2q diff_coeffs02q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_01
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_02
%endif
; load the function values
@@ -91,7 +83,7 @@ SECTION .text
subpd m11, m8 ; m11 -= u[x+2]
addpd m11, m10 ; m11 += u[x-2]
%endif
- vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
+ vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10
; second derivative
addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1]
@@ -102,7 +94,7 @@ SECTION .text
subpd m11, m10 ; m11 -= u[x-2]
%endif
subpd m11, m6 ; m11 -= fd0 u[x]
- vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
+ vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20
%endmacro
; calculate and add residual contributions from the second mixed derivative
@@ -138,13 +130,30 @@ SECTION .text
vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
%endif
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
%endmacro
; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
- %define stencil %1
+
+%define stencil %1
+
+%if %2
+%define opname add
+%else
+%define opname calc
+%endif
+
+; typedef void ResidualLineCalc/Add(
+; size_t linesize, double *dst, double *dst_max,
+; ptrdiff_t u_stride, const double *u, const double *rhs,
+; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
+; double res_mult, [double u_mult (add only)])
+cglobal residual_line_ %+ opname %+ _s %+ stencil, \
+ 8, 13, 14 + stencil * 2, \
+ linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
%if %2
vpermq m2, m1, 0
@@ -156,27 +165,13 @@ SECTION .text
psrlq m13, 1
movu m12, [res_maxq]
- ; load pointers to the equation coefficients
- %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
- mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
- mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
- mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
- mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
- mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
- mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
; setup the data pointers and the loop counter
shl u_strideq, 3
+ shl diff_coeffs_offsetq, 3
shl linesizeq, 3
add dstq, linesizeq
add uq, linesizeq
add rhsq, linesizeq
- add diff_coeffs00q, linesizeq
- add diff_coeffs01q, linesizeq
- add diff_coeffs10q, linesizeq
- add diff_coeffs11q, linesizeq
- add diff_coeffs02q, linesizeq
- add diff_coeffs20q, linesizeq
neg linesizeq
; from now on, the register that held linesize is used as the offset into data arrays
%define offsetq linesizeq
@@ -195,13 +190,26 @@ SECTION .text
movu m14, [const8]
%endif
+ ; offsets to FD coefficients for given derivative
+ %define diff_coeff_offset_01 1 * diff_coeffs_offsetq
+ %define diff_coeff_offset_10 2 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_11 diff_coeffs_off3q
+
+ %define diff_coeff_offset_02 4 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_20 diff_coeffs_off5q
+
+
.loop:
xorpd m0, m0
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
movu m6, [uq + offsetq] ; m6 = u[x]
- vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00
%if %2
mulpd m3, m6, m2
%endif
@@ -223,6 +231,7 @@ SECTION .text
%endif
; store the result
+ add diff_coeffsq, mmsize
add offsetq, mmsize
jg .store_partial
@@ -267,18 +276,7 @@ SECTION .text
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 0
-cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1, 1
-
-INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 0
-
-cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 1
diff --git a/residual_calc.c b/residual_calc.c
index c06c966..948655e 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -33,11 +33,11 @@
typedef void ResidualLineCalc(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult);
typedef void ResidualLineAdd (size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult);
typedef struct ResidualCalcTask {
@@ -52,8 +52,9 @@ typedef struct ResidualCalcTask {
const double *rhs;
ptrdiff_t rhs_stride;
- const double * const *diff_coeffs;
+ const double *diff_coeffs;
ptrdiff_t diff_coeffs_stride;
+ ptrdiff_t diff_coeffs_offset;
double u_mult;
double res_mult;
@@ -74,10 +75,10 @@ struct ResidualCalcInternal {
};
#if HAVE_NASM
-ResidualLineCalc mg2di_residual_calc_line_s1_fma3;
-ResidualLineCalc mg2di_residual_calc_line_s2_fma3;
-ResidualLineAdd mg2di_residual_add_line_s1_fma3;
-ResidualLineAdd mg2di_residual_add_line_s2_fma3;
+ResidualLineCalc mg2di_residual_line_calc_s1_fma3;
+ResidualLineCalc mg2di_residual_line_calc_s2_fma3;
+ResidualLineAdd mg2di_residual_line_add_s1_fma3;
+ResidualLineAdd mg2di_residual_line_add_s2_fma3;
#endif
static void
@@ -142,7 +143,7 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride)
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult)
{
double res_max = 0.0, res_abs;
@@ -154,7 +155,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = res_mult * res;
res_abs = fabs(res);
@@ -166,7 +167,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult)
{
double res_max = 0.0, res_abs;
@@ -178,7 +179,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = u_mult * u[i] + res_mult * res;
res_abs = fabs(res);
@@ -190,7 +191,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max
static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult)
{
double res_max = 0.0, res_abs;
@@ -202,7 +203,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = res_mult * res;
res_abs = fabs(res);
@@ -214,7 +215,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult)
{
double res_max = 0.0, res_abs;
@@ -226,7 +227,7 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = u_mult * u[i] + res_mult * res;
res_abs = fabs(res);
@@ -241,24 +242,21 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre
ResidualCalcInternal *priv = arg;
ResidualCalcTask *task = &priv->task;
- const double *diff_coeffs[MG2D_DIFF_COEFF_NB];
+ const double *diff_coeffs = task->diff_coeffs + job_idx * task->diff_coeffs_stride;
double *dst = task->dst + job_idx * task->dst_stride;
- for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride;
-
if (task->u_mult == 0.0) {
priv->residual_line_calc(task->size[0], dst,
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->res_mult);
+ diff_coeffs, task->diff_coeffs_offset, task->res_mult);
} else {
priv->residual_line_add(task->size[0], dst,
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->res_mult, task->u_mult);
+ diff_coeffs, task->diff_coeffs_offset, task->res_mult, task->u_mult);
}
if (task->reflect & (1 << MG2D_BOUNDARY_0L)) {
@@ -286,8 +284,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
double *dst, ptrdiff_t dst_stride,
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_stride,
+ ptrdiff_t diff_coeffs_offset,
double u_mult, double res_mult,
int reflect, size_t reflect_dist)
{
@@ -307,6 +305,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
task->rhs_stride = rhs_stride;
task->diff_coeffs = diff_coeffs;
task->diff_coeffs_stride = diff_coeffs_stride;
+ task->diff_coeffs_offset = diff_coeffs_offset;
task->u_mult = u_mult;
task->res_mult = res_mult;
task->reflect = reflect;
@@ -333,8 +332,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = residual_add_line_s1_c;
#if HAVE_NASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->residual_line_calc = mg2di_residual_calc_line_s1_fma3;
- priv->residual_line_add = mg2di_residual_add_line_s1_fma3;
+ priv->residual_line_calc = mg2di_residual_line_calc_s1_fma3;
+ priv->residual_line_add = mg2di_residual_line_add_s1_fma3;
priv->calc_blocksize = 4;
}
#endif
@@ -344,8 +343,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = residual_add_line_s2_c;
#if HAVE_NASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->residual_line_calc = mg2di_residual_calc_line_s2_fma3;
- priv->residual_line_add = mg2di_residual_add_line_s2_fma3;
+ priv->residual_line_calc = mg2di_residual_line_calc_s2_fma3;
+ priv->residual_line_add = mg2di_residual_line_add_s2_fma3;
priv->calc_blocksize = 4;
}
#endif
diff --git a/residual_calc.h b/residual_calc.h
index 1498bc9..01ad602 100644
--- a/residual_calc.h
+++ b/residual_calc.h
@@ -68,6 +68,14 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx);
/**
* Calculate the residual.
*
+ * @param diff_coeffs Array containing all PDE coefficients.
+ * @param diff_coeffs_stride Distance between adjacent lines for a given PDE
+ * coefficient.
+ * @param diff_coeffs_offset Distance between blocks containing different PDE
+ * coefficients. I.e. diff_coeffs[i * diff_coeffs_offset + j * diff_coeffs_stride + k]
+ * is the value of the i-th PDE coefficient (MG2D_DIFF_COEFF_*) at the
+ * (j, k)th gridpoint.
+ *
* @param reflect Flags indicating if dst should be extended by reflection.
* If the 'MG2D_BOUNDARY_x'th bit is set in reflect, then dst should be
* reflected reflect_dist points beyond its bounds in the corresponding
@@ -79,8 +87,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
double *dst, ptrdiff_t dst_stride,
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_stride,
+ ptrdiff_t diff_coeffs_offset,
double u_mult, double res_mult,
int reflect, size_t reflect_dist);