diff options
Diffstat (limited to 'transfer.c')
-rw-r--r-- | transfer.c | 45 |
1 files changed, 32 insertions, 13 deletions
@@ -43,10 +43,14 @@ typedef struct GridTransferLagrange { void (*transfer_cont) (double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, - const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y); + const ptrdiff_t *idx_x, + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride); void (*transfer_generic)(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, - const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y, + const ptrdiff_t *idx_x, + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride, ptrdiff_t dst_stride0, ptrdiff_t src_stride0); const NDArray *src; @@ -54,13 +58,15 @@ typedef struct GridTransferLagrange { ptrdiff_t *idx[2]; double *fact[2]; + ptrdiff_t fact_stride[2]; } GridTransferLagrange; #if HAVE_EXTERNAL_ASM void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, const ptrdiff_t *idx_x, - const double *fact_x, const double *fact_y); + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride); #endif #define STENCIL 2 @@ -84,7 +90,7 @@ void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len, #undef STENCIL // generate the interpolation source indices and weights -static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) +static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim, unsigned int pad) { GridTransferLagrange *priv = ctx->priv; const double step_dst = ctx->dst.step[dim]; @@ -101,24 +107,31 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) const ptrdiff_t idx_src = (ptrdiff_t)(scale * idx_dst) - (priv->stencil / 2 - 1); const double coord_dst = idx_dst * step_dst; - double *fact = priv->fact[dim] + priv->stencil * idx_dst; + double *fact = priv->fact[dim] + idx_dst; for (int i = 0; i < priv->stencil; i++) coord_src[i] = (idx_src + i) * step_src; for (int i = 0; i < priv->stencil; i++) { - fact[i] = 1.0; + double val = 1.0; for (int j = 0; j < priv->stencil; j++) { if (i == j) continue; - fact[i] *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]); + val *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]); } + fact[i * priv->fact_stride[dim]] = val; } priv->idx[dim][idx_dst] = idx_src; } + for (int i = 0; i < pad; i++) { + for (int j = 0; j < priv->stencil; j++) + priv->fact[dim][priv->fact_stride[dim] * j + ctx->dst.size[dim] + i] = 0.0; + priv->idx[dim][ctx->dst.size[dim] + i] = priv->idx[dim][ctx->dst.size[dim] - 1]; + } + free(coord_src); return 0; @@ -127,6 +140,7 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) static int transfer_lagrange_init(GridTransferContext *ctx) { GridTransferLagrange *priv = ctx->priv; + unsigned int pad = 0; int ret; switch (ctx->op) { @@ -143,6 +157,7 @@ static int transfer_lagrange_init(GridTransferContext *ctx) #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { priv->transfer_cont = mg2di_transfer_interp_line_cont_4_avx2; + pad = 3; } #endif break; @@ -151,15 +166,16 @@ static int transfer_lagrange_init(GridTransferContext *ctx) } for (int dim = 0; dim < 2; dim++) { - priv->idx[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->idx[dim])); + priv->idx[dim] = calloc(ctx->dst.size[dim] + pad, sizeof(*priv->idx[dim])); if (!priv->idx[dim]) return -ENOMEM; - priv->fact[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->fact[dim]) * priv->stencil); + priv->fact[dim] = calloc(ctx->dst.size[dim] + pad, sizeof(*priv->fact[dim]) * priv->stencil); if (!priv->fact[dim]) return -ENOMEM; + priv->fact_stride[dim] = ctx->dst.size[dim] + pad; - ret = transfer_factors_lagrange(ctx, dim); + ret = transfer_factors_lagrange(ctx, dim, pad); if (ret < 0) return ret; } @@ -184,16 +200,19 @@ static int transfer_interp_task(void *arg, unsigned int job_idx, unsigned int th NDArray *dst = priv->dst; const ptrdiff_t idx_y = priv->idx[0][job_idx]; - const double *fact_y = priv->fact[0] + priv->stencil * job_idx; + const double *fact_y = priv->fact[0] + job_idx; if (dst->stride[1] == 1 && src->stride[1] == 1) { priv->transfer_cont(dst->data + job_idx * dst->stride[0], dst->shape[1], src->data + idx_y * src->stride[0], src->stride[0], - priv->idx[1], priv->fact[1], fact_y); + priv->idx[1], priv->fact[1], priv->fact_stride[1], + fact_y, priv->fact_stride[0]); } else { priv->transfer_generic(dst->data + job_idx * dst->stride[0], dst->shape[1], src->data + idx_y * src->stride[0], src->stride[0], - priv->idx[1], priv->fact[1], fact_y, dst->stride[1], src->stride[1]); + priv->idx[1], priv->fact[1], priv->fact_stride[1], + fact_y, priv->fact_stride[0], + dst->stride[1], src->stride[1]); } return 0; |