aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.c
diff options
context:
space:
mode:
authorAnton Khirnov <anton@khirnov.net>2024-04-15 21:44:14 +0200
committerAnton Khirnov <anton@khirnov.net>2024-04-15 21:44:14 +0200
commite30cfde7614be7062249954eab6c3f56eeabbb51 (patch)
tree1a27f188ed94b9ae4d566150ca951a8ac7f0fad1 /residual_calc.c
parent982d71cb08f6ccf564c0558c659ae2756bb39ba1 (diff)
residual_calc: accept all diff coefficients in a single array
Plus an offset parameter that signals the distance between different coefficients. This avoids passing so many pointers around, which reduces register pressure and simplifies writing SIMD. It also seems to be a little faster.
Diffstat (limited to 'residual_calc.c')
-rw-r--r--residual_calc.c53
1 files changed, 26 insertions, 27 deletions
diff --git a/residual_calc.c b/residual_calc.c
index c06c966..948655e 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -33,11 +33,11 @@
typedef void ResidualLineCalc(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult);
typedef void ResidualLineAdd (size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult);
typedef struct ResidualCalcTask {
@@ -52,8 +52,9 @@ typedef struct ResidualCalcTask {
const double *rhs;
ptrdiff_t rhs_stride;
- const double * const *diff_coeffs;
+ const double *diff_coeffs;
ptrdiff_t diff_coeffs_stride;
+ ptrdiff_t diff_coeffs_offset;
double u_mult;
double res_mult;
@@ -74,10 +75,10 @@ struct ResidualCalcInternal {
};
#if HAVE_NASM
-ResidualLineCalc mg2di_residual_calc_line_s1_fma3;
-ResidualLineCalc mg2di_residual_calc_line_s2_fma3;
-ResidualLineAdd mg2di_residual_add_line_s1_fma3;
-ResidualLineAdd mg2di_residual_add_line_s2_fma3;
+ResidualLineCalc mg2di_residual_line_calc_s1_fma3;
+ResidualLineCalc mg2di_residual_line_calc_s2_fma3;
+ResidualLineAdd mg2di_residual_line_add_s1_fma3;
+ResidualLineAdd mg2di_residual_line_add_s2_fma3;
#endif
static void
@@ -142,7 +143,7 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride)
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult)
{
double res_max = 0.0, res_abs;
@@ -154,7 +155,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = res_mult * res;
res_abs = fabs(res);
@@ -166,7 +167,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma
static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult)
{
double res_max = 0.0, res_abs;
@@ -178,7 +179,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = u_mult * u[i] + res_mult * res;
res_abs = fabs(res);
@@ -190,7 +191,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max
static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult)
{
double res_max = 0.0, res_abs;
@@ -202,7 +203,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = res_mult * res;
res_abs = fabs(res);
@@ -214,7 +215,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t u_stride, const double *u, const double *rhs,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
double res_mult, double u_mult)
{
double res_max = 0.0, res_abs;
@@ -226,7 +227,7 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max
res = -rhs[i];
for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
- res += u_vals[j] * diff_coeffs[j][i];
+ res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i];
dst[i] = u_mult * u[i] + res_mult * res;
res_abs = fabs(res);
@@ -241,24 +242,21 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre
ResidualCalcInternal *priv = arg;
ResidualCalcTask *task = &priv->task;
- const double *diff_coeffs[MG2D_DIFF_COEFF_NB];
+ const double *diff_coeffs = task->diff_coeffs + job_idx * task->diff_coeffs_stride;
double *dst = task->dst + job_idx * task->dst_stride;
- for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++)
- diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride;
-
if (task->u_mult == 0.0) {
priv->residual_line_calc(task->size[0], dst,
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->res_mult);
+ diff_coeffs, task->diff_coeffs_offset, task->res_mult);
} else {
priv->residual_line_add(task->size[0], dst,
priv->residual_max + thread_idx * priv->calc_blocksize,
task->u_stride, task->u + job_idx * task->u_stride,
task->rhs + job_idx * task->rhs_stride,
- diff_coeffs, task->res_mult, task->u_mult);
+ diff_coeffs, task->diff_coeffs_offset, task->res_mult, task->u_mult);
}
if (task->reflect & (1 << MG2D_BOUNDARY_0L)) {
@@ -286,8 +284,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
double *dst, ptrdiff_t dst_stride,
const double *u, ptrdiff_t u_stride,
const double *rhs, ptrdiff_t rhs_stride,
- const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
- ptrdiff_t diff_coeffs_stride,
+ const double *diff_coeffs, ptrdiff_t diff_coeffs_stride,
+ ptrdiff_t diff_coeffs_offset,
double u_mult, double res_mult,
int reflect, size_t reflect_dist)
{
@@ -307,6 +305,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2],
task->rhs_stride = rhs_stride;
task->diff_coeffs = diff_coeffs;
task->diff_coeffs_stride = diff_coeffs_stride;
+ task->diff_coeffs_offset = diff_coeffs_offset;
task->u_mult = u_mult;
task->res_mult = res_mult;
task->reflect = reflect;
@@ -333,8 +332,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = residual_add_line_s1_c;
#if HAVE_NASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->residual_line_calc = mg2di_residual_calc_line_s1_fma3;
- priv->residual_line_add = mg2di_residual_add_line_s1_fma3;
+ priv->residual_line_calc = mg2di_residual_line_calc_s1_fma3;
+ priv->residual_line_add = mg2di_residual_line_add_s1_fma3;
priv->calc_blocksize = 4;
}
#endif
@@ -344,8 +343,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
priv->residual_line_add = residual_add_line_s2_c;
#if HAVE_NASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
- priv->residual_line_calc = mg2di_residual_calc_line_s2_fma3;
- priv->residual_line_add = mg2di_residual_add_line_s2_fma3;
+ priv->residual_line_calc = mg2di_residual_line_calc_s2_fma3;
+ priv->residual_line_add = mg2di_residual_line_add_s2_fma3;
priv->calc_blocksize = 4;
}
#endif