diff options
Diffstat (limited to 'residual_calc.c')
-rw-r--r-- | residual_calc.c | 196 |
1 files changed, 196 insertions, 0 deletions
diff --git a/residual_calc.c b/residual_calc.c index 6d58c10..d92986b 100644 --- a/residual_calc.c +++ b/residual_calc.c @@ -87,8 +87,62 @@ void mg2di_residual_add_line_s2_fma3(size_t linesize, double *dst, double *dst_m ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], double res_mult, double u_mult); +void mg2di_residual_calc_line_s3_fma3(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult); +void mg2di_residual_add_line_s3_fma3(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult); #endif +static const double fd_coeffs[][2][FD_STENCIL_MAX * 2 + 1] = { + { + { -1.0, 0.0, 1.0 }, + { 1.0, -2.0, 1.0 }, + }, + { + { 1.0, -8.0, 0.0, 8.0, -1.0 }, + { -1.0, 16.0, -30.0, 16.0, -1.0 }, + }, + { + { -1.0, 9.0, -45.0, 0.0, 45.0, -9.0, 1.0 }, + { 1.0, -13.5, 135.0, -245.0, 135.0, -13.5, 1.0 }, + }, + { + { 3.0, -32.0, 168.0, -672.0, 0.0, 672.0, -168.0, 32.0, -3.0 }, + { -9.0, 128.0, -1008.0, 8064.0, -14350.0, 8064.0, -1008.0, 128.0, -9.0 }, + }, +}; + +#define STENCIL 1 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + +#define STENCIL 2 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + +#define STENCIL 3 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + +#define STENCIL 4 +#define ADD 0 +#include "residual_calc_template.c" +#undef ADD +#define ADD 1 +#include "residual_calc_template.c" +#undef ADD +#undef STENCIL + static void derivatives_calc_s1(double *dst, const double *u, ptrdiff_t stride) { @@ -149,6 +203,81 @@ derivatives_calc_s2(double *dst, const double *u, ptrdiff_t stride) -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2); } +static void +derivatives_calc_s3(double *dst, const double *u, ptrdiff_t stride) +{ + const double val = u[0]; + + const double valxp1 = u[ 1]; + const double valxp2 = u[ 2]; + const double valxp3 = u[ 3]; + const double valxm1 = u[-1]; + const double valxm2 = u[-2]; + const double valxm3 = u[-3]; + const double valyp1 = u[ 1 * stride]; + const double valyp2 = u[ 2 * stride]; + const double valyp3 = u[ 3 * stride]; + const double valym1 = u[-1 * stride]; + const double valym2 = u[-2 * stride]; + const double valym3 = u[-3 * stride]; + + const double valxp3yp1 = u[ 3 + 1 * stride]; + const double valxp3yp2 = u[ 3 + 2 * stride]; + const double valxp3yp3 = u[ 3 + 3 * stride]; + const double valxp3ym1 = u[ 3 - 1 * stride]; + const double valxp3ym2 = u[ 3 - 2 * stride]; + const double valxp3ym3 = u[ 3 - 3 * stride]; + + const double valxp2yp1 = u[ 2 + 1 * stride]; + const double valxp2yp2 = u[ 2 + 2 * stride]; + const double valxp2yp3 = u[ 2 + 3 * stride]; + const double valxp2ym1 = u[ 2 - 1 * stride]; + const double valxp2ym2 = u[ 2 - 2 * stride]; + const double valxp2ym3 = u[ 2 - 3 * stride]; + + const double valxp1yp1 = u[ 1 + 1 * stride]; + const double valxp1yp2 = u[ 1 + 2 * stride]; + const double valxp1yp3 = u[ 1 + 3 * stride]; + const double valxp1ym1 = u[ 1 - 1 * stride]; + const double valxp1ym2 = u[ 1 - 2 * stride]; + const double valxp1ym3 = u[ 1 - 3 * stride]; + + const double valxm1yp1 = u[-1 + 1 * stride]; + const double valxm1yp2 = u[-1 + 2 * stride]; + const double valxm1yp3 = u[-1 + 3 * stride]; + const double valxm1ym1 = u[-1 - 1 * stride]; + const double valxm1ym2 = u[-1 - 2 * stride]; + const double valxm1ym3 = u[-1 - 3 * stride]; + + const double valxm2yp1 = u[-2 + 1 * stride]; + const double valxm2yp2 = u[-2 + 2 * stride]; + const double valxm2yp3 = u[-2 + 3 * stride]; + const double valxm2ym1 = u[-2 - 1 * stride]; + const double valxm2ym2 = u[-2 - 2 * stride]; + const double valxm2ym3 = u[-2 - 3 * stride]; + + const double valxm3yp1 = u[-3 + 1 * stride]; + const double valxm3yp2 = u[-3 + 2 * stride]; + const double valxm3yp3 = u[-3 + 3 * stride]; + const double valxm3ym1 = u[-3 - 1 * stride]; + const double valxm3ym2 = u[-3 - 2 * stride]; + const double valxm3ym3 = u[-3 - 3 * stride]; + + dst[MG2D_DIFF_COEFF_00] = val; + dst[MG2D_DIFF_COEFF_10] = (1.0 * valxp3 - 9.0 * valxp2 + 45.0 * valxp1 - 45.0 * valxm1 + 9.0 * valxm2 - 1.0 * valxm3); + dst[MG2D_DIFF_COEFF_01] = (1.0 * valyp3 - 9.0 * valyp2 + 45.0 * valyp1 - 45.0 * valym1 + 9.0 * valym2 - 1.0 * valym3); + + dst[MG2D_DIFF_COEFF_20] = (1.0 * valxp3 - 13.5 * valxp2 + 135.0 * valxp1 - 245.0 * val + 135.0 * valxm1 - 13.5 * valxm2 + 1.0 * valxm3); + dst[MG2D_DIFF_COEFF_02] = (1.0 * valyp3 - 13.5 * valyp2 + 135.0 * valyp1 - 245.0 * val + 135.0 * valym1 - 13.5 * valym2 + 1.0 * valym3); + + dst[MG2D_DIFF_COEFF_11] = ( 1.0 * (1.0 * valxp3yp3 - 9.0 * valxp3yp2 + 45.0 * valxp3yp1 - 45.0 * valxp3ym1 + 9.0 * valxp3ym2 - 1.0 * valxp3ym3) + - 9.0 * (1.0 * valxp2yp3 - 9.0 * valxp2yp2 + 45.0 * valxp2yp1 - 45.0 * valxp2ym1 + 9.0 * valxp2ym2 - 1.0 * valxp2ym3) + + 45.0 * (1.0 * valxp1yp3 - 9.0 * valxp1yp2 + 45.0 * valxp1yp1 - 45.0 * valxp1ym1 + 9.0 * valxp1ym2 - 1.0 * valxp1ym3) + - 45.0 * (1.0 * valxm1yp3 - 9.0 * valxm1yp2 + 45.0 * valxm1yp1 - 45.0 * valxm1ym1 + 9.0 * valxm1ym2 - 1.0 * valxm1ym3) + + 9.0 * (1.0 * valxm2yp3 - 9.0 * valxm2yp2 + 45.0 * valxm2yp1 - 45.0 * valxm2ym1 + 9.0 * valxm2ym2 - 1.0 * valxm2ym3) + - 1.0 * (1.0 * valxm3yp3 - 9.0 * valxm3yp2 + 45.0 * valxm3yp1 - 45.0 * valxm3ym1 + 9.0 * valxm3ym2 - 1.0 * valxm3ym3)); +} + static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], @@ -245,6 +374,54 @@ static void residual_add_line_s2_c(size_t linesize, double *dst, double *dst_max *dst_max = MAX(*dst_max, res_max); } +static void residual_calc_line_s3_c(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + derivatives_calc_s3(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = res_mult * res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + +static void residual_add_line_s3_c(size_t linesize, double *dst, double *dst_max, + ptrdiff_t stride, const double *u, const double *rhs, + const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], + double res_mult, double u_mult) +{ + double res_max = 0.0, res_abs; + for (size_t i = 0; i < linesize; i++) { + double u_vals[MG2D_DIFF_COEFF_NB]; + double res; + + derivatives_calc_s3(u_vals, u + i, stride); + + res = -rhs[i]; + for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) + res += u_vals[j] * diff_coeffs[j][i]; + dst[i] = u_mult * u[i] + res_mult * res; + + res_abs = fabs(res); + res_max = MAX(res_max, res_abs); + } + + *dst_max = MAX(*dst_max, res_max); +} + static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { ResidualCalcInternal *priv = arg; @@ -340,6 +517,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) case 1: priv->residual_calc_line = residual_calc_line_s1_c; priv->residual_add_line = residual_add_line_s1_c; + //priv->residual_calc_line = residual_calc_line_1; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3; @@ -351,6 +529,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) case 2: priv->residual_calc_line = residual_calc_line_s2_c; priv->residual_add_line = residual_add_line_s2_c; + //priv->residual_calc_line = residual_calc_line_2; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3; @@ -359,6 +538,23 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx) } #endif break; + case 3: + priv->residual_calc_line = residual_calc_line_s3_c; + priv->residual_add_line = residual_add_line_s3_c; +#if HAVE_EXTERNAL_ASM + if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) { + priv->residual_calc_line = mg2di_residual_calc_line_s3_fma3; + priv->residual_add_line = mg2di_residual_add_line_s3_fma3; + priv->calc_blocksize = 4; + } +#endif + break; + case 4: + priv->residual_calc_line = residual_line_calc_4; + priv->residual_add_line = residual_line_add_4; + break; + default: + return -ENOSYS; } priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize; |