/* * Copyright 2019 Anton Khirnov * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "config.h" #include #include #include #include #include #include #include "common.h" #include "cpu.h" #include "mg2d_constants.h" #include "residual_calc.h" typedef struct ResidualCalcTask { size_t line_size; ptrdiff_t stride; double *dst; const double *u; const double *rhs; const double * const *diff_coeffs; const double *fd_factors; } ResidualCalcTask; struct ResidualCalcInternal { double *residual_max; size_t residual_max_size; void (*residual_calc_line)(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors); size_t calc_blocksize; ResidualCalcTask task; }; #if HAVE_EXTERNAL_ASM void mg2di_residual_calc_line_s1_avx2(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors); void mg2di_residual_calc_line_s2_avx2(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors); #endif static const double fd_coeffs[][2][FD_STENCIL_MAX * 2 + 1] = { { { -1.0, 0.0, 1.0 }, { 1.0, -2.0, 1.0 }, }, { { 1.0, -8.0, 0.0, 8.0, -1.0 }, { -1.0, 16.0, -30.0, 16.0, -1.0 }, }, { { -1.0, 9.0, -45.0, 0.0, 45.0, -9.0, 1.0 }, { 1.0, -13.5, 135.0, -245.0, 135.0, -13.5, 1.0 }, }, { { 3.0, -32.0, 168.0, -672.0, 0.0, 672.0, -168.0, 32.0, -3.0 }, { -9.0, 128.0, -1008.0, 8064.0, -14350.0, 8064.0, -1008.0, 128.0, -9.0 }, }, }; #define STENCIL 1 #include "residual_calc_template.c" #undef STENCIL #define STENCIL 2 #include "residual_calc_template.c" #undef STENCIL #define STENCIL 3 #include "residual_calc_template.c" #undef STENCIL #define STENCIL 4 #include "residual_calc_template.c" #undef STENCIL static void derivatives_calc_s1(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride) { dst[MG2D_DIFF_COEFF_00] = u[0]; dst[MG2D_DIFF_COEFF_10] = (u[1] - u[-1]) * fd_factors[MG2D_DIFF_COEFF_10]; dst[MG2D_DIFF_COEFF_01] = (u[stride] - u[-stride]) * fd_factors[MG2D_DIFF_COEFF_01]; dst[MG2D_DIFF_COEFF_20] = (u[1] - 2.0 * u[0] + u[-1]) * fd_factors[MG2D_DIFF_COEFF_20]; dst[MG2D_DIFF_COEFF_02] = (u[stride] - 2.0 * u[0] + u[-stride]) * fd_factors[MG2D_DIFF_COEFF_02]; dst[MG2D_DIFF_COEFF_11] = (u[1 + stride] - u[stride - 1] - u[-stride + 1] + u[-stride - 1]) * fd_factors[MG2D_DIFF_COEFF_11]; } static void derivatives_calc_s2(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride) { const double val = u[0]; const double valxp1 = u[ 1]; const double valxp2 = u[ 2]; const double valxm1 = u[-1]; const double valxm2 = u[-2]; const double valyp1 = u[ 1 * stride]; const double valyp2 = u[ 2 * stride]; const double valym1 = u[-1 * stride]; const double valym2 = u[-2 * stride]; const double valxp1yp1 = u[ 1 + 1 * stride]; const double valxp1yp2 = u[ 1 + 2 * stride]; const double valxp1ym1 = u[ 1 - 1 * stride]; const double valxp1ym2 = u[ 1 - 2 * stride]; const double valxp2yp1 = u[ 2 + 1 * stride]; const double valxp2yp2 = u[ 2 + 2 * stride]; const double valxp2ym1 = u[ 2 - 1 * stride]; const double valxp2ym2 = u[ 2 - 2 * stride]; const double valxm1yp1 = u[-1 + 1 * stride]; const double valxm1yp2 = u[-1 + 2 * stride]; const double valxm1ym1 = u[-1 - 1 * stride]; const double valxm1ym2 = u[-1 - 2 * stride]; const double valxm2yp1 = u[-2 + 1 * stride]; const double valxm2yp2 = u[-2 + 2 * stride]; const double valxm2ym1 = u[-2 - 1 * stride]; const double valxm2ym2 = u[-2 - 2 * stride]; dst[MG2D_DIFF_COEFF_00] = val; dst[MG2D_DIFF_COEFF_10] = (-1.0 * valxp2 + 8.0 * valxp1 - 8.0 * valxm1 + 1.0 * valxm2) * fd_factors[MG2D_DIFF_COEFF_10]; dst[MG2D_DIFF_COEFF_01] = (-1.0 * valyp2 + 8.0 * valyp1 - 8.0 * valym1 + 1.0 * valym2) * fd_factors[MG2D_DIFF_COEFF_01]; dst[MG2D_DIFF_COEFF_20] = (-1.0 * valxp2 + 16.0 * valxp1 - 30.0 * val + 16.0 * valxm1 - 1.0 * valxm2) * fd_factors[MG2D_DIFF_COEFF_20]; dst[MG2D_DIFF_COEFF_02] = (-1.0 * valyp2 + 16.0 * valyp1 - 30.0 * val + 16.0 * valym1 - 1.0 * valym2) * fd_factors[MG2D_DIFF_COEFF_02]; dst[MG2D_DIFF_COEFF_11] = ( 1.0 * valxp2yp2 - 8.0 * valxp2yp1 + 8.0 * valxp2ym1 - 1.0 * valxp2ym2 -8.0 * valxp1yp2 + 64.0 * valxp1yp1 - 64.0 * valxp1ym1 + 8.0 * valxp1ym2 +8.0 * valxm1yp2 - 64.0 * valxm1yp1 + 64.0 * valxm1ym1 - 8.0 * valxm1ym2 -1.0 * valxm2yp2 + 8.0 * valxm2yp1 - 8.0 * valxm2ym1 + 1.0 * valxm2ym2) * fd_factors[MG2D_DIFF_COEFF_11]; } static void derivatives_calc_s3(double *dst, const double *u, const double *fd_factors, ptrdiff_t stride) { const double val = u[0]; const double valxp1 = u[ 1]; const double valxp2 = u[ 2]; const double valxp3 = u[ 3]; const double valxm1 = u[-1]; const double valxm2 = u[-2]; const double valxm3 = u[-3]; const double valyp1 = u[ 1 * stride]; const double valyp2 = u[ 2 * stride]; const double valyp3 = u[ 3 * stride]; const double valym1 = u[-1 * stride]; const double valym2 = u[-2 * stride]; const double valym3 = u[-3 * stride]; const double valxp3yp1 = u[ 3 + 1 * stride]; const double valxp3yp2 = u[ 3 + 2 * stride]; const double valxp3yp3 = u[ 3 + 3 * stride]; const double valxp3ym1 = u[ 3 - 1 * stride]; const double valxp3ym2 = u[ 3 - 2 * stride]; const double valxp3ym3 = u[ 3 - 3 * stride]; const double valxp2yp1 = u[ 2 + 1 * stride]; const double valxp2yp2 = u[ 2 + 2 * stride]; const double valxp2yp3 = u[ 2 + 3 * stride]; const double valxp2ym1 = u[ 2 - 1 * stride]; const double valxp2ym2 = u[ 2 - 2 * stride]; const double valxp2ym3 = u[ 2 - 3 * stride]; const double valxp1yp1 = u[ 1 + 1 * stride]; const double valxp1yp2 = u[ 1 + 2 * stride]; const double valxp1yp3 = u[ 1 + 3 * stride]; const double valxp1ym1 = u[ 1 - 1 * stride]; const double valxp1ym2 = u[ 1 - 2 * stride]; const double valxp1ym3 = u[ 1 - 3 * stride]; const double valxm1yp1 = u[-1 + 1 * stride]; const double valxm1yp2 = u[-1 + 2 * stride]; const double valxm1yp3 = u[-1 + 3 * stride]; const double valxm1ym1 = u[-1 - 1 * stride]; const double valxm1ym2 = u[-1 - 2 * stride]; const double valxm1ym3 = u[-1 - 3 * stride]; const double valxm2yp1 = u[-2 + 1 * stride]; const double valxm2yp2 = u[-2 + 2 * stride]; const double valxm2yp3 = u[-2 + 3 * stride]; const double valxm2ym1 = u[-2 - 1 * stride]; const double valxm2ym2 = u[-2 - 2 * stride]; const double valxm2ym3 = u[-2 - 3 * stride]; const double valxm3yp1 = u[-3 + 1 * stride]; const double valxm3yp2 = u[-3 + 2 * stride]; const double valxm3yp3 = u[-3 + 3 * stride]; const double valxm3ym1 = u[-3 - 1 * stride]; const double valxm3ym2 = u[-3 - 2 * stride]; const double valxm3ym3 = u[-3 - 3 * stride]; dst[MG2D_DIFF_COEFF_00] = val; dst[MG2D_DIFF_COEFF_10] = (1.0 * valxp3 - 9.0 * valxp2 + 45.0 * valxp1 - 45.0 * valxm1 + 9.0 * valxm2 - 1.0 * valxm3) * fd_factors[MG2D_DIFF_COEFF_10]; dst[MG2D_DIFF_COEFF_01] = (1.0 * valyp3 - 9.0 * valyp2 + 45.0 * valyp1 - 45.0 * valym1 + 9.0 * valym2 - 1.0 * valym3) * fd_factors[MG2D_DIFF_COEFF_01]; dst[MG2D_DIFF_COEFF_20] = (1.0 * valxp3 - 13.5 * valxp2 + 135.0 * valxp1 - 245.0 * val + 135.0 * valxm1 - 13.5 * valxm2 + 1.0 * valxm3) * fd_factors[MG2D_DIFF_COEFF_20]; dst[MG2D_DIFF_COEFF_02] = (1.0 * valyp3 - 13.5 * valyp2 + 135.0 * valyp1 - 245.0 * val + 135.0 * valym1 - 13.5 * valym2 + 1.0 * valym3) * fd_factors[MG2D_DIFF_COEFF_02]; dst[MG2D_DIFF_COEFF_11] = fd_factors[MG2D_DIFF_COEFF_11] * ( 1.0 * (1.0 * valxp3yp3 - 9.0 * valxp3yp2 + 45.0 * valxp3yp1 - 45.0 * valxp3ym1 + 9.0 * valxp3ym2 - 1.0 * valxp3ym3) - 9.0 * (1.0 * valxp2yp3 - 9.0 * valxp2yp2 + 45.0 * valxp2yp1 - 45.0 * valxp2ym1 + 9.0 * valxp2ym2 - 1.0 * valxp2ym3) + 45.0 * (1.0 * valxp1yp3 - 9.0 * valxp1yp2 + 45.0 * valxp1yp1 - 45.0 * valxp1ym1 + 9.0 * valxp1ym2 - 1.0 * valxp1ym3) - 45.0 * (1.0 * valxm1yp3 - 9.0 * valxm1yp2 + 45.0 * valxm1yp1 - 45.0 * valxm1ym1 + 9.0 * valxm1ym2 - 1.0 * valxm1ym3) + 9.0 * (1.0 * valxm2yp3 - 9.0 * valxm2yp2 + 45.0 * valxm2yp1 - 45.0 * valxm2ym1 + 9.0 * valxm2ym2 - 1.0 * valxm2ym3) - 1.0 * (1.0 * valxm3yp3 - 9.0 * valxm3yp2 + 45.0 * valxm3yp1 - 45.0 * valxm3ym1 + 9.0 * valxm3ym2 - 1.0 * valxm3ym3)); } static void residual_calc_line_s1_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors) { double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[MG2D_DIFF_COEFF_NB]; double res; derivatives_calc_s1(u_vals, u + i, fd_factors, stride); res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; res_abs = fabs(res); res_max = MAX(res_max, res_abs); } *dst_max = MAX(*dst_max, res_max); } static void residual_calc_line_s2_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors) { double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[MG2D_DIFF_COEFF_NB]; double res; derivatives_calc_s2(u_vals, u + i, fd_factors, stride); res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; res_abs = fabs(res); res_max = MAX(res_max, res_abs); } *dst_max = MAX(*dst_max, res_max); } static void residual_calc_line_s3_c(size_t linesize, double *dst, double *dst_max, ptrdiff_t stride, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors) { double res_max = 0.0, res_abs; for (size_t i = 0; i < linesize; i++) { double u_vals[MG2D_DIFF_COEFF_NB]; double res; derivatives_calc_s3(u_vals, u + i, fd_factors, stride); res = -rhs[i]; for (int j = 0; j < ARRAY_ELEMS(u_vals); j++) res += u_vals[j] * diff_coeffs[j][i]; dst[i] = res; res_abs = fabs(res); res_max = MAX(res_max, res_abs); } *dst_max = MAX(*dst_max, res_max); } static int residual_calc_task(void *arg, unsigned int job_idx, unsigned int thread_idx) { ResidualCalcInternal *priv = arg; ResidualCalcTask *task = &priv->task; const ptrdiff_t offset = job_idx * task->stride; const double *diff_coeffs[MG2D_DIFF_COEFF_NB]; for (int i = 0; i < ARRAY_ELEMS(diff_coeffs); i++) diff_coeffs[i] = task->diff_coeffs[i] + offset; priv->residual_calc_line(task->line_size, task->dst + offset, priv->residual_max + thread_idx * priv->calc_blocksize, task->stride, task->u + offset, task->rhs + offset, diff_coeffs, task->fd_factors); return 0; } int mg2di_residual_calc(ResidualCalcContext *ctx, size_t size[2], ptrdiff_t stride, double *residual_max, double *dst, const double *u, const double *rhs, const double * const diff_coeffs[MG2D_DIFF_COEFF_NB], const double *fd_factors) { ResidualCalcInternal *priv = ctx->priv; ResidualCalcTask *task = &priv->task; double res_max = 0.0; memset(priv->residual_max, 0, sizeof(*priv->residual_max) * priv->residual_max_size); task->line_size = size[0]; task->stride = stride; task->dst = dst; task->u = u; task->rhs = rhs; task->diff_coeffs = diff_coeffs; task->fd_factors = fd_factors; tp_execute(ctx->tp, size[1], residual_calc_task, priv); for (size_t i = 0; i < priv->residual_max_size; i++) res_max = MAX(res_max, priv->residual_max[i]); *residual_max = res_max; return 0; } int mg2di_residual_calc_init(ResidualCalcContext *ctx) { ResidualCalcInternal *priv = ctx->priv; double *tmp; priv->calc_blocksize = 1; switch (ctx->fd_stencil) { case 1: priv->residual_calc_line = residual_calc_line_s1_c; //priv->residual_calc_line = residual_calc_line_1; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { priv->residual_calc_line = mg2di_residual_calc_line_s1_avx2; priv->calc_blocksize = 4; } #endif break; case 2: priv->residual_calc_line = residual_calc_line_s2_c; priv->residual_calc_line = residual_calc_line_2; #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { priv->residual_calc_line = mg2di_residual_calc_line_s2_avx2; priv->calc_blocksize = 4; } #endif break; case 3: priv->residual_calc_line = residual_calc_line_s3_c; break; case 4: priv->residual_calc_line = residual_calc_line_4; break; default: return -ENOSYS; } priv->residual_max_size = tp_get_nb_threads(ctx->tp) * priv->calc_blocksize; tmp = realloc(priv->residual_max, sizeof(*priv->residual_max) * priv->residual_max_size); if (!tmp) { priv->residual_max_size = 0; return -ENOMEM; } priv->residual_max = tmp; return 0; } ResidualCalcContext *mg2di_residual_calc_alloc(void) { ResidualCalcContext *ctx; ctx = calloc(1, sizeof(*ctx)); if (!ctx) return NULL; ctx->priv = calloc(1, sizeof(*ctx->priv)); if (!ctx->priv) { free(ctx); return NULL; } return ctx; } void mg2di_residual_calc_free(ResidualCalcContext **pctx) { ResidualCalcContext *ctx = *pctx; if (!ctx) return; free(ctx->priv->residual_max); free(ctx->priv); free(ctx); *pctx = NULL; }