aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.c
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-01-30 11:36:34 +0100
committer Anton Khirnov <anton@khirnov.net> 2019-01-30 11:36:34 +0100
commit b584bfe20168ac6208154b1eef395b3805b35e77 (patch)
tree 1882c7708e474adfc864bc3c1a1bd90a4b83d7fd /residual_calc.c
parent 783d260e0d47d6adb4388fea9ed8e35122d4f6c2 (diff)
ell_grid_solve: split residual computation into its own file
Diffstat (limited to 'residual_calc.c')
-rw-r--r-- residual_calc.c 292
1 files changed, 292 insertions, 0 deletions
diff --git a/residual_calc.c b/residual_calc.c
new file mode 100644
index 0000000..2fc1a66
--- /dev/null
+++ b/residual_calc.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright 2019 Anton Khirnov <anton@khirnov.net>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#include <errno.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <threadpool.h>
+
+#include "common.h"
+#include "cpu.h"
+#include "mg2d_constants.h"
+#include "residual_calc.h"
+
+typedef struct ResidualCalcTask { /* Arguments of one mg2di_residual_calc() call,
+                                    * stored so that each per-line job executed by
+                                    * residual_calc_task() can read them. */
+ size_t line_size; /* number of points processed per line; set from size[0] */
+ ptrdiff_t stride; /* distance, in elements, between the starts of consecutive lines */
+
+ double *dst; /* output array receiving the residual values */
+ const double *u; /* input array whose finite differences are evaluated */
+ const double *rhs; /* input array subtracted from the weighted sum of derivatives */
+ const double * const *diff_coeffs; /* MG2D_DIFF_COEFF_NB coefficient arrays, one per derivative term */
+ const double *fd_factors; /* multipliers applied to the finite differences, indexed by MG2D_DIFF_COEFF_* */
+} ResidualCalcTask;
+
+struct ResidualCalcInternal { /* Private state reached through ResidualCalcContext.priv. */
+ double *residual_max; /* per-thread partial maxima of |residual|; zeroed at the start of every mg2di_residual_calc() call */
+ size_t residual_max_size; /* element count of residual_max: number of threads times calc_blocksize */
+ /* Kernel computing the residual for one line of points; selected by
+  * mg2di_residual_calc_init() from ctx->fd_stencil and the CPU flags. */
+ void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max,
+ ptrdiff_t stride, const double *u, const double *rhs,
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *fd_factors);
+ size_t calc_blocksize; /* residual_max slots reserved per thread: 1 for the C kernels, 4 for the FMA3 ones */
+
+ ResidualCalcTask task; /* parameters of the mg2di_residual_calc() call being executed */
+};
+
+#if HAVE_EXTERNAL_ASM /* FMA3 variants of the line kernels; only declared here (presumably
+                        * implemented in external assembly). mg2di_residual_calc_init()
+                        * picks them when MG2DI_CPU_FLAG_FMA3 is set. */
+void mg2di_residual_calc_line_s1_fma3(size_t linesize, double *dst, double *dst_max,
+ ptrdiff_t stride, const double *u, const double *rhs,
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *fd_factors);
+void mg2di_residual_calc_line_s2_fma3(size_t linesize, double *dst, double *dst_max,
+ ptrdiff_t stride, const double *u, const double *rhs,
+ const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+ const double *fd_factors);
+#endif
+
+/*
+ * Evaluate the 3-point centered finite differences of u around the point u[0].
+ *
+ * dst        - receives one value per MG2D_DIFF_COEFF_* index: the function
+ *              value (00), the first differences along the line (10) and
+ *              across lines (01), the second differences (20, 02) and the
+ *              mixed difference (11)
+ * u          - points at the central grid point; elements at offsets of +-1
+ *              and +-stride (and their combinations) are read
+ * fd_factors - multiplier applied to each difference, indexed like dst
+ * stride     - distance in elements between neighbouring lines
+ */
+static void
+derivatives_calc_s1(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+{
+    /* neighbouring lines, both pointing at the central column */
+    const double *const line_p1 = u + stride;
+    const double *const line_m1 = u - stride;
+
+    dst[MG2D_DIFF_COEFF_00] = u[0];
+
+    /* first differences */
+    dst[MG2D_DIFF_COEFF_10] = (u[1]       - u[-1])      * fd_factors[MG2D_DIFF_COEFF_10];
+    dst[MG2D_DIFF_COEFF_01] = (line_p1[0] - line_m1[0]) * fd_factors[MG2D_DIFF_COEFF_01];
+
+    /* second differences */
+    dst[MG2D_DIFF_COEFF_20] = (u[1]       - 2.0 * u[0] + u[-1])      * fd_factors[MG2D_DIFF_COEFF_20];
+    dst[MG2D_DIFF_COEFF_02] = (line_p1[0] - 2.0 * u[0] + line_m1[0]) * fd_factors[MG2D_DIFF_COEFF_02];
+
+    /* mixed difference built from the four diagonal neighbours */
+    dst[MG2D_DIFF_COEFF_11] = (line_p1[1] - line_p1[-1] - line_m1[1] + line_m1[-1]) *
+                              fd_factors[MG2D_DIFF_COEFF_11];
+}
+
+/*
+ * Evaluate the 5-point centered finite differences of u around the point u[0].
+ *
+ * dst        - receives one value per MG2D_DIFF_COEFF_* index: the function
+ *              value (00), the first differences along the line (10) and
+ *              across lines (01), the second differences (20, 02) and the
+ *              mixed difference (11)
+ * u          - points at the central grid point; the full 5x5 window of
+ *              offsets {-2..2} + {-2..2} * stride is read
+ * fd_factors - multiplier applied to each difference, indexed like dst
+ * stride     - distance in elements between neighbouring lines
+ *
+ * All differences are computed before anything is written to dst, and the
+ * terms of every sum are kept in a fixed order so that rounding is
+ * reproducible.
+ */
+static void
+derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride)
+{
+    /* the five lines covered by the stencil, each pointing at the central column */
+    const double *const ln_m2 = u - 2 * stride;
+    const double *const ln_m1 = u - 1 * stride;
+    const double *const ln_0  = u;
+    const double *const ln_p1 = u + 1 * stride;
+    const double *const ln_p2 = u + 2 * stride;
+
+    const double d00 = ln_0[0];
+
+    /* first differences: weights (-1, 8, -8, 1) */
+    const double d10 = (-1.0 * ln_0[2]  + 8.0 * ln_0[1]  - 8.0 * ln_0[-1] + 1.0 * ln_0[-2]) *
+                       fd_factors[MG2D_DIFF_COEFF_10];
+    const double d01 = (-1.0 * ln_p2[0] + 8.0 * ln_p1[0] - 8.0 * ln_m1[0] + 1.0 * ln_m2[0]) *
+                       fd_factors[MG2D_DIFF_COEFF_01];
+
+    /* second differences: weights (-1, 16, -30, 16, -1) */
+    const double d20 = (-1.0 * ln_0[2]  + 16.0 * ln_0[1]  - 30.0 * d00 + 16.0 * ln_0[-1] - 1.0 * ln_0[-2]) *
+                       fd_factors[MG2D_DIFF_COEFF_20];
+    const double d02 = (-1.0 * ln_p2[0] + 16.0 * ln_p1[0] - 30.0 * d00 + 16.0 * ln_m1[0] - 1.0 * ln_m2[0]) *
+                       fd_factors[MG2D_DIFF_COEFF_02];
+
+    /* mixed difference: one row of terms per column offset (+2, +1, -1, -2),
+     * within a row the lines are visited in the order +2, +1, -1, -2 */
+    const double d11 = ( 1.0 * ln_p2[ 2] -  8.0 * ln_p1[ 2] +  8.0 * ln_m1[ 2] - 1.0 * ln_m2[ 2]
+                       - 8.0 * ln_p2[ 1] + 64.0 * ln_p1[ 1] - 64.0 * ln_m1[ 1] + 8.0 * ln_m2[ 1]
+                       + 8.0 * ln_p2[-1] - 64.0 * ln_p1[-1] + 64.0 * ln_m1[-1] - 8.0 * ln_m2[-1]
+                       - 1.0 * ln_p2[-2] +  8.0 * ln_p1[-2] -  8.0 * ln_m1[-2] + 1.0 * ln_m2[-2]) *
+                       fd_factors[MG2D_DIFF_COEFF_11];
+
+    dst[MG2D_DIFF_COEFF_00] = d00;
+    dst[MG2D_DIFF_COEFF_10] = d10;
+    dst[MG2D_DIFF_COEFF_01] = d01;
+
+    dst[MG2D_DIFF_COEFF_20] = d20;
+    dst[MG2D_DIFF_COEFF_02] = d02;
+
+    dst[MG2D_DIFF_COEFF_11] = d11;
+}
+
+/*
+ * Plain C line kernel built on derivatives_calc_s1().
+ *
+ * For each of the linesize points the residual is
+ *     sum_j derivative_j(u) * diff_coeffs[j][point] - rhs[point]
+ * It is stored in dst; *dst_max is raised to the largest absolute residual
+ * found on the line if that exceeds its current value.
+ *
+ * linesize    - number of points to process
+ * dst         - output residuals, linesize elements
+ * dst_max     - in/out running maximum of |residual|
+ * stride      - distance in elements between neighbouring lines of u
+ * u, rhs      - inputs, pointing at the first point of the line
+ * diff_coeffs - MG2D_DIFF_COEFF_NB coefficient lines, one per derivative term
+ * fd_factors  - finite difference multipliers, see derivatives_calc_s1()
+ */
+static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max,
+                                    ptrdiff_t stride, const double *u, const double *rhs,
+                                    const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                                    const double *fd_factors)
+{
+    double line_max = 0.0;
+
+    for (size_t idx = 0; idx < linesize; idx++) {
+        double derivs[MG2D_DIFF_COEFF_NB];
+        double acc, acc_abs;
+
+        derivatives_calc_s1(derivs, u + idx, fd_factors, stride);
+
+        /* start from -rhs and add the terms in index order */
+        acc = -rhs[idx];
+        for (int term = 0; term < ARRAY_ELEMS(derivs); term++)
+            acc += derivs[term] * diff_coeffs[term][idx];
+        dst[idx] = acc;
+
+        acc_abs  = fabs(acc);
+        line_max = MAX(line_max, acc_abs);
+    }
+
+    *dst_max = MAX(*dst_max, line_max);
+}
+
+/*
+ * Plain C line kernel built on derivatives_calc_s2().
+ *
+ * For each of the linesize points the residual is
+ *     sum_j derivative_j(u) * diff_coeffs[j][point] - rhs[point]
+ * It is stored in dst; *dst_max is raised to the largest absolute residual
+ * found on the line if that exceeds its current value.
+ *
+ * linesize    - number of points to process
+ * dst         - output residuals, linesize elements
+ * dst_max     - in/out running maximum of |residual|
+ * stride      - distance in elements between neighbouring lines of u
+ * u, rhs      - inputs, pointing at the first point of the line
+ * diff_coeffs - MG2D_DIFF_COEFF_NB coefficient lines, one per derivative term
+ * fd_factors  - finite difference multipliers, see derivatives_calc_s2()
+ */
+static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max,
+                                    ptrdiff_t stride, const double *u, const double *rhs,
+                                    const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                                    const double *fd_factors)
+{
+    double line_max = 0.0;
+
+    for (size_t idx = 0; idx < linesize; idx++) {
+        double derivs[MG2D_DIFF_COEFF_NB];
+        double acc, acc_abs;
+
+        derivatives_calc_s2(derivs, u + idx, fd_factors, stride);
+
+        /* start from -rhs and add the terms in index order */
+        acc = -rhs[idx];
+        for (int term = 0; term < ARRAY_ELEMS(derivs); term++)
+            acc += derivs[term] * diff_coeffs[term][idx];
+        dst[idx] = acc;
+
+        acc_abs  = fabs(acc);
+        line_max = MAX(line_max, acc_abs);
+    }
+
+    *dst_max = MAX(*dst_max, line_max);
+}
+
+/*
+ * Thread pool job: compute the residual for line number job_idx.
+ *
+ * arg        - the ResidualCalcInternal holding the current task
+ * job_idx    - index of the line to process; all array pointers are advanced
+ *              by job_idx * stride elements
+ * thread_idx - index of the executing thread, selects the block of
+ *              residual_max this job accumulates its maximum into
+ *
+ * Always returns 0.
+ */
+static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx)
+{
+    ResidualCalcInternal *priv = arg;
+    ResidualCalcTask     *task = &priv->task;
+
+    const ptrdiff_t line_off = job_idx * task->stride;
+    double         *max_slot = priv->residual_max + thread_idx * priv->calc_blocksize;
+
+    /* per-line view of the coefficient arrays */
+    const double *line_coeffs[MG2D_DIFF_COEFF_NB];
+    for (int k = 0; k < ARRAY_ELEMS(line_coeffs); k++)
+        line_coeffs[k] = task->diff_coeffs[k] + line_off;
+
+    priv->residual_calc_line(task->line_size, task->dst + line_off, max_slot,
+                             task->stride, task->u + line_off, task->rhs + line_off,
+                             line_coeffs, task->fd_factors);
+
+    return 0;
+}
+
+/*
+ * Compute the residual on a 2D block of points, one thread pool job per line.
+ *
+ * ctx          - context previously set up with mg2di_residual_calc_init()
+ * size         - size[0] is the number of points per line, size[1] the number
+ *                of lines (jobs)
+ * stride       - distance in elements between consecutive lines in all arrays
+ * residual_max - receives the largest absolute residual over the block
+ * dst          - output residuals
+ * u, rhs       - inputs
+ * diff_coeffs  - MG2D_DIFF_COEFF_NB coefficient arrays
+ * fd_factors   - finite difference multipliers
+ *
+ * Always returns 0.
+ */
+int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], ptrdiff_t stride,
+                        double *residual_max,
+                        double *dst, const double *u, const double *rhs,
+                        const double * const diff_coeffs[MG2D_DIFF_COEFF_NB],
+                        const double *fd_factors)
+{
+    ResidualCalcInternal *priv = ctx->priv;
+    double peak = 0.0;
+
+    /* reset the per-thread partial maxima */
+    memset(priv->residual_max, 0, sizeof(*priv->residual_max) * priv->residual_max_size);
+
+    /* publish the call arguments for the jobs */
+    priv->task = (ResidualCalcTask) {
+        .line_size   = size[0],
+        .stride      = stride,
+        .dst         = dst,
+        .u           = u,
+        .rhs         = rhs,
+        .diff_coeffs = diff_coeffs,
+        .fd_factors  = fd_factors,
+    };
+
+    tp_execute(ctx->tp, size[1], residual_calc_task, priv);
+
+    /* reduce the partial maxima to a single value */
+    for (size_t slot = 0; slot < priv->residual_max_size; slot++)
+        peak = MAX(peak, priv->residual_max[slot]);
+    *residual_max = peak;
+
+    return 0;
+}
+
+/*
+ * Select the line kernel matching ctx->fd_stencil and ctx->cpuflags and
+ * (re)allocate the per-thread maximum buffer for ctx->tp.
+ *
+ * Returns 0 on success, -EINVAL when ctx->fd_stencil is not a supported value
+ * (1 or 2), -ENOMEM when the buffer cannot be allocated.
+ *
+ * An unsupported stencil is rejected before any state is modified; without
+ * this check the kernel pointer would stay unset (or stale from an earlier
+ * init) while the function reported success.
+ */
+int mg2di_residual_calc_init(ResidualCalcContext *ctx)
+{
+    ResidualCalcInternal *priv = ctx->priv;
+    double *tmp;
+
+    switch (ctx->fd_stencil) {
+    case 1:
+        priv->residual_calc_line = residual_calc_line_s1_c;
+        priv->calc_blocksize     = 1;
+#if HAVE_EXTERNAL_ASM
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
+            priv->residual_calc_line = mg2di_residual_calc_line_s1_fma3;
+            priv->calc_blocksize     = 4;
+        }
+#endif
+        break;
+    case 2:
+        priv->residual_calc_line = residual_calc_line_s2_c;
+        priv->calc_blocksize     = 1;
+#if HAVE_EXTERNAL_ASM
+        if (ctx->cpuflags & MG2DI_CPU_FLAG_FMA3) {
+            priv->residual_calc_line = mg2di_residual_calc_line_s2_fma3;
+            priv->calc_blocksize     = 4;
+        }
+#endif
+        break;
+    default:
+        /* no kernel exists for this stencil */
+        return -EINVAL;
+    }
+
+    /* every thread gets calc_blocksize slots to accumulate its maximum in */
+    priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize;
+    tmp = realloc(priv->residual_max,
+                  sizeof(*priv->residual_max) * priv->residual_max_size);
+    if (!tmp) {
+        /* the old buffer, if any, stays owned by priv and is released by
+         * mg2di_residual_calc_free() */
+        priv->residual_max_size = 0;
+        return -ENOMEM;
+    }
+    priv->residual_max = tmp;
+
+    return 0;
+}
+
+/*
+ * Allocate a zero-initialized context together with its private data.
+ *
+ * Returns the new context, or NULL if either allocation fails. Release it with
+ * mg2di_residual_calc_free().
+ */
+ResidualCalcContext *mg2di_residual_calc_alloc(void)
+{
+    ResidualCalcContext *ctx = calloc(1, sizeof(*ctx));
+
+    if (ctx) {
+        ctx->priv = calloc(1, sizeof(*ctx->priv));
+        if (!ctx->priv) {
+            /* do not hand out a context without private data */
+            free(ctx);
+            ctx = NULL;
+        }
+    }
+
+    return ctx;
+}
+
+/*
+ * Release a context created by mg2di_residual_calc_alloc(), including its
+ * private data and the per-thread maximum buffer, and reset *pctx to NULL.
+ * Does nothing when *pctx is already NULL.
+ */
+void mg2di_residual_calc_free(ResidualCalcContext **pctx)
+{
+    ResidualCalcContext *ctx = *pctx;
+
+    if (ctx) {
+        ResidualCalcInternal *priv = ctx->priv;
+
+        free(priv->residual_max);
+        free(priv);
+        free(ctx);
+
+        *pctx = NULL;
+    }
+}