summaryrefslogtreecommitdiff
path: root/residual_calc.c
diff options
context:
space:
mode:
Diffstat (limited to 'residual_calc.c')
-rw-r--r--residual_calc.c145
1 files changed, 145 insertions, 0 deletions
diff --git a/residual_calc.c b/residual_calc.c
index 24ff088..3f1a91b 100644
--- a/residual_calc.c
+++ b/residual_calc.c
@@ -65,6 +65,41 @@ void mg2di_residual_calc_line_s2_avx2(size_t linesize, double *dst, double *dst_
const double *fd_factors);
#endif
+static const double fd_coeffs[][2][FD_STENCIL_MAX * 2 + 1] = {
+ {
+ { -1.0, 0.0, 1.0 },
+ { 1.0, -2.0, 1.0 },
+ },
+ {
+ { 1.0, -8.0, 0.0, 8.0, -1.0 },
+ { -1.0, 16.0, -30.0, 16.0, -1.0 },
+ },
+ {
+ { -1.0, 9.0, -45.0, 0.0, 45.0, -9.0, 1.0 },
+ { 1.0, -13.5, 135.0, -245.0, 135.0, -13.5, 1.0 },
+ },
+ {
+ { 3.0, -32.0, 168.0, -672.0, 0.0, 672.0, -168.0, 32.0, -3.0 },
+ { -9.0, 128.0, -1008.0, 8064.0, -14350.0, 8064.0, -1008.0, 128.0, -9.0 },
+ },
+};
+
+#define STENCIL 1
+#include "residual_calc_template.c"
+#undef STENCIL
+
+#define STENCIL 2
+#include "residual_calc_template.c"
+#undef STENCIL
+
+#define STENCIL 3
+#include "residual_calc_template.c"
+#undef STENCIL
+
+#define STENCIL 4
+#include "residual_calc_template.c"
+#undef STENCIL
+
static void
derivatives_calc_s1(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
{
@@ -125,6 +160,82 @@ derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrd
-1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[MG2D_DIFF_COEFF_11];
}
+static void
+derivatives_calc_s3(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+{
+ const double val = u[0];
+
+ const double valxp1 = u[ 1];
+ const double valxp2 = u[ 2];
+ const double valxp3 = u[ 3];
+ const double valxm1 = u[-1];
+ const double valxm2 = u[-2];
+ const double valxm3 = u[-3];
+ const double valyp1 = u[ 1 * stride];
+ const double valyp2 = u[ 2 * stride];
+ const double valyp3 = u[ 3 * stride];
+ const double valym1 = u[-1 * stride];
+ const double valym2 = u[-2 * stride];
+ const double valym3 = u[-3 * stride];
+
+ const double valxp3yp1 = u[ 3 + 1 * stride];
+ const double valxp3yp2 = u[ 3 + 2 * stride];
+ const double valxp3yp3 = u[ 3 + 3 * stride];
+ const double valxp3ym1 = u[ 3 - 1 * stride];
+ const double valxp3ym2 = u[ 3 - 2 * stride];
+ const double valxp3ym3 = u[ 3 - 3 * stride];
+
+ const double valxp2yp1 = u[ 2 + 1 * stride];
+ const double valxp2yp2 = u[ 2 + 2 * stride];
+ const double valxp2yp3 = u[ 2 + 3 * stride];
+ const double valxp2ym1 = u[ 2 - 1 * stride];
+ const double valxp2ym2 = u[ 2 - 2 * stride];
+ const double valxp2ym3 = u[ 2 - 3 * stride];
+
+ const double valxp1yp1 = u[ 1 + 1 * stride];
+ const double valxp1yp2 = u[ 1 + 2 * stride];
+ const double valxp1yp3 = u[ 1 + 3 * stride];
+ const double valxp1ym1 = u[ 1 - 1 * stride];
+ const double valxp1ym2 = u[ 1 - 2 * stride];
+ const double valxp1ym3 = u[ 1 - 3 * stride];
+
+ const double valxm1yp1 = u[-1 + 1 * stride];
+ const double valxm1yp2 = u[-1 + 2 * stride];
+ const double valxm1yp3 = u[-1 + 3 * stride];
+ const double valxm1ym1 = u[-1 - 1 * stride];
+ const double valxm1ym2 = u[-1 - 2 * stride];
+ const double valxm1ym3 = u[-1 - 3 * stride];
+
+ const double valxm2yp1 = u[-2 + 1 * stride];
+ const double valxm2yp2 = u[-2 + 2 * stride];
+ const double valxm2yp3 = u[-2 + 3 * stride];
+ const double valxm2ym1 = u[-2 - 1 * stride];
+ const double valxm2ym2 = u[-2 - 2 * stride];
+ const double valxm2ym3 = u[-2 - 3 * stride];
+
+ const double valxm3yp1 = u[-3 + 1 * stride];
+ const double valxm3yp2 = u[-3 + 2 * stride];
+ const double valxm3yp3 = u[-3 + 3 * stride];
+ const double valxm3ym1 = u[-3 - 1 * stride];
+ const double valxm3ym2 = u[-3 - 2 * stride];
+ const double valxm3ym3 = u[-3 - 3 * stride];
+
+ dst[MG2D_DIFF_COEFF_00] = val;
+ dst[MG2D_DIFF_COEFF_10] = (1.0 * valxp3 - 9.0 * valxp2 + 45.0 * valxp1 - 45.0 * valxm1 + 9.0 * valxm2 - 1.0 * valxm3) * fd_factors[MG2D_DIFF_COEFF_10];
+ dst[MG2D_DIFF_COEFF_01] = (1.0 * valyp3 - 9.0 * valyp2 + 45.0 * valyp1 - 45.0 * valym1 + 9.0 * valym2 - 1.0 * valym3) * fd_factors[MG2D_DIFF_COEFF_01];
+
+ dst[MG2D_DIFF_COEFF_20] = (1.0 * valxp3 - 13.5 * valxp2 + 135.0 * valxp1 - 245.0 * val + 135.0 * valxm1 - 13.5 * valxm2 + 1.0 * valxm3) * fd_factors[MG2D_DIFF_COEFF_20];
+ dst[MG2D_DIFF_COEFF_02] = (1.0 * valyp3 - 13.5 * valyp2 + 135.0 * valyp1 - 245.0 * val + 135.0 * valym1 - 13.5 * valym2 + 1.0 * valym3) * fd_factors[MG2D_DIFF_COEFF_02];
+
+ dst[MG2D_DIFF_COEFF_11] = fd_factors[MG2D_DIFF_COEFF_11] * (
+ 1.0 * (1.0 * valxp3yp3 - 9.0 * valxp3yp2 + 45.0 * valxp3yp1 - 45.0 * valxp3ym1 + 9.0 * valxp3ym2 - 1.0 * valxp3ym3)
+ - 9.0 * (1.0 * valxp2yp3 - 9.0 * valxp2yp2 + 45.0 * valxp2yp1 - 45.0 * valxp2ym1 + 9.0 * valxp2ym2 - 1.0 * valxp2ym3)
+ + 45.0 * (1.0 * valxp1yp3 - 9.0 * valxp1yp2 + 45.0 * valxp1yp1 - 45.0 * valxp1ym1 + 9.0 * valxp1ym2 - 1.0 * valxp1ym3)
+ - 45.0 * (1.0 * valxm1yp3 - 9.0 * valxm1yp2 + 45.0 * valxm1yp1 - 45.0 * valxm1ym1 + 9.0 * valxm1ym2 - 1.0 * valxm1ym3)
+ + 9.0 * (1.0 * valxm2yp3 - 9.0 * valxm2yp2 + 45.0 * valxm2yp1 - 45.0 * valxm2ym1 + 9.0 * valxm2ym2 - 1.0 * valxm2ym3)
+ - 1.0 * (1.0 * valxm3yp3 - 9.0 * valxm3yp2 + 45.0 * valxm3yp1 - 45.0 * valxm3ym1 + 9.0 * valxm3ym2 - 1.0 * valxm3ym3));
+}
+
static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
ptrdiff_t stride, const double *u, const double *rhs,
const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
@@ -173,6 +284,30 @@ static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_ma
*dst_max = MAX(*dst_max, res_max);
}
+static void residual_calc_line_s3_c(size_t linesize, double *dst, double *dst_max,
+ ptrdiff_t stride, const double *u, const double *rhs,
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *fd_factors)
+{
+ double res_max = 0.0, res_abs;
+ for (size_t i = 0; i < linesize; i++) {
+ double u_vals[MG2D_DIFF_COEFF_NB];
+ double res;
+
+ derivatives_calc_s3(u_vals, u + i, fd_factors, stride);
+
+ res = -rhs[i];
+ for (int j = 0; j < ARRAY_ELEMS(u_vals); j++)
+ res += u_vals[j] * diff_coeffs[j][i];
+ dst[i] = res;
+
+ res_abs = fabs(res);
+ res_max = MAX(res_max, res_abs);
+ }
+
+ *dst_max = MAX(*dst_max, res_max);
+}
+
static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
{
ResidualCalcInternal *priv = arg;
@@ -230,6 +365,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
switch (ctx->fd_stencil) {
case 1:
priv->residual_calc_line = residual_calc_line_s1_c;
+ //priv->residual_calc_line = residual_calc_line_1;
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
priv->residual_calc_line = mg2di_residual_calc_line_s1_avx2;
@@ -239,6 +375,7 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
break;
case 2:
priv->residual_calc_line = residual_calc_line_s2_c;
+ priv->residual_calc_line = residual_calc_line_2;
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
priv->residual_calc_line = mg2di_residual_calc_line_s2_avx2;
@@ -246,6 +383,14 @@ int mg2di_residual_calc_init(ResidualCalcContext *ctx)
}
#endif
break;
+ case 3:
+ priv->residual_calc_line = residual_calc_line_s3_c;
+ break;
+ case 4:
+ priv->residual_calc_line = residual_calc_line_4;
+ break;
+ default:
+ return -ENOSYS;
}
priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize;