From e30cfde7614be7062249954eab6c3f56eeabbb51 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Mon, 15 Apr 2024 21:44:14 +0200 Subject: residual_calc: accept all diff coefficients in a single array Plus an offset parameter that signals the distance between different coefficients. This avoids passing so many pointers around, which reduces register pressure and simplifies writing SIMD. It also seems to be a little faster. --- residual_calc.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'residual_calc.c') diff --git a/residual_calc.c b/residual_calc.c index c06c966..948655e 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -33,11 +33,11 @@ typedef void ResidualLineCalc(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult); typedef void ResidualLineAdd (size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult); typedef struct ResidualCalcTask { @@ -52,8 +52,9 @@ typedef struct ResidualCalcTask { const double *rhs; ptrdiff_t rhs_stride; - const double * const *diff_coeffs; + const double *diff_coeffs; ptrdiff_t diff_coeffs_stride; + ptrdiff_t diff_coeffs_offset; double u_mult; double res_mult; @@ -74,10 +75,10 @@ struct ResidualCalcInternal { }; #if HAVE_NASM -ResidualLineCalc mg2di_residual_calc_line_s1_fma3; -ResidualLineCalc mg2di_residual_calc_line_s2_fma3; -ResidualLineAdd mg2di_residual_add_line_s1_fma3; -ResidualLineAdd mg2di_residual_add_line_s2_fma3; +ResidualLineCalc mg2di_residual_line_calc_s1_fma3; +ResidualLineCalc mg2di_residual_line_calc_s2_fma3; +ResidualLineAdd mg2di_residual_line_add_s1_fma3; 
+ResidualLineAdd mg2di_residual_line_add_s2_fma3; #endif static void @@ -142,7 +143,7 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride) static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult) { double res_max = 0.0, res_abs; @@ -154,7 +155,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = res_mult * res; res_abs = fabs(res); @@ -166,7 +167,7 @@ static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_ma static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult) { double res_max = 0.0, res_abs; @@ -178,7 +179,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); @@ -190,7 +191,7 @@ static void residual_add_line_s1_c(size_t linesize, double *dst, double *dst_max static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult) { double res_max = 0.0, res_abs; @@ -202,7 +203,7 @@ static void residual_calc_line_s2_c(size_t linesize, 
double *dst, double *dst_ma res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = res_mult * res; res_abs = fabs(res); @@ -214,7 +215,7 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t u_stride, const double *u, const double *rhs, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + const double *diff_coeffs, ptrdiff_t diff_coeffs_offset, double res_mult, double u_mult) { double res_max = 0.0, res_abs; @@ -226,7 +227,7 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) - res += u_vals[j] * diff_coeffs[j][i]; + res += u_vals[j] * diff_coeffs[j * diff_coeffs_offset + i]; dst[i] = u_mult * u[i] + res_mult * res; res_abs = fabs(res); @@ -241,24 +242,21 @@ static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thre ResidualCalcInternal *priv = arg; ResidualCalcTask *task = &priv->task; - const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; + const double *diff_coeffs = task->diff_coeffs + job_idx * task->diff_coeffs_stride; double *dst = task->dst + job_idx * task->dst_stride; - for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) - diff_coeffs[i] = task->diff_coeffs[i] + job_idx * task->diff_coeffs_stride; - if (task->u_mult == 0.0) { priv->residual_line_calc(task->size[0], dst, priv->residual_max + thread_idx * priv->calc_blocksize, task->u_stride, task->u + job_idx * task->u_stride, task->rhs + job_idx * task->rhs_stride, - diff_coeffs, task->res_mult); + diff_coeffs, task->diff_coeffs_offset, task->res_mult); } else { priv->residual_line_add(task->size[0], dst, priv->residual_max + thread_idx * priv->calc_blocksize, task->u_stride, task->u + job_idx * task->u_stride, task->rhs + job_idx * task->rhs_stride, - 
diff_coeffs, task->res_mult, task->u_mult); + diff_coeffs, task->diff_coeffs_offset, task->res_mult, task->u_mult); } if (task->reflect & (1 << MG2D_BOUNDARY_0L)) { @@ -286,8 +284,8 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], double *dst, ptrdiff_t dst_stride, const double *u, ptrdiff_t u_stride, const double *rhs, ptrdiff_t rhs_stride, - const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], - ptrdiff_t diff_coeffs_stride, + const double *diff_coeffs, ptrdiff_t diff_coeffs_stride, + ptrdiff_t diff_coeffs_offset, double u_mult, double res_mult, int reflect, size_t reflect_dist) { @@ -307,6 +305,7 @@ int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], task->rhs_stride = rhs_stride; task->diff_coeffs = diff_coeffs; task->diff_coeffs_stride = diff_coeffs_stride; + task->diff_coeffs_offset = diff_coeffs_offset; task->u_mult = u_mult; task->res_mult = res_mult; task->reflect = reflect; @@ -333,8 +332,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = residual_add_line_s1_c; #if HAVE_NASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->residual_line_calc = mg2di_residual_calc_line_s1_fma3; - priv->residual_line_add = mg2di_residual_add_line_s1_fma3; + priv->residual_line_calc = mg2di_residual_line_calc_s1_fma3; + priv->residual_line_add = mg2di_residual_line_add_s1_fma3; priv->calc_blocksize = 4; } #endif @@ -344,8 +343,8 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) priv->residual_line_add = residual_add_line_s2_c; #if HAVE_NASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { - priv->residual_line_calc = mg2di_residual_calc_line_s2_fma3; - priv->residual_line_add = mg2di_residual_add_line_s2_fma3; + priv->residual_line_calc = mg2di_residual_line_calc_s2_fma3; + priv->residual_line_add = mg2di_residual_line_add_s2_fma3; priv->calc_blocksize = 4; } #endif -- cgit v1.2.3