From df2eddaf8d1f999021cd097a0716e7a8b23751ee Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 9 Apr 2019 19:20:34 +0200 Subject: tmp --- transfer.c | 45 ++++++++--- transfer_interp.asm | 194 ++++++++++++++++++++++++++++++++++++++------- transfer_interp_template.c | 8 +- 3 files changed, 203 insertions(+), 44 deletions(-) diff --git a/transfer.c b/transfer.c index 98051de..9ea511e 100644 --- a/transfer.c +++ b/transfer.c @@ -43,10 +43,14 @@ typedef struct GridTransferLagrange { void (*transfer_cont) (double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, - const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y); + const ptrdiff_t *idx_x, + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride); void (*transfer_generic)(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, - const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y, + const ptrdiff_t *idx_x, + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride, ptrdiff_t dst_stride0, ptrdiff_t src_stride0); const NDArray *src; @@ -54,13 +58,15 @@ typedef struct GridTransferLagrange { ptrdiff_t *idx[2]; double *fact[2]; + ptrdiff_t fact_stride[2]; } GridTransferLagrange; #if HAVE_EXTERNAL_ASM void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, const ptrdiff_t *idx_x, - const double *fact_x, const double *fact_y); + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride); #endif #define STENCIL 2 @@ -84,7 +90,7 @@ void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len, #undef STENCIL // generate the interpolation source indices and weights -static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) +static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim, unsigned int pad) { GridTransferLagrange *priv = ctx->priv; const double step_dst = ctx->dst.step[dim]; @@ -101,24 +107,31 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) const ptrdiff_t idx_src = (ptrdiff_t)(scale * idx_dst) - (priv->stencil / 2 - 1); const double coord_dst = idx_dst * step_dst; - double *fact = priv->fact[dim] + priv->stencil * idx_dst; + double *fact = priv->fact[dim] + idx_dst; for (int i = 0; i < priv->stencil; i++) coord_src[i] = (idx_src + i) * step_src; for (int i = 0; i < priv->stencil; i++) { - fact[i] = 1.0; + double val = 1.0; for (int j = 0; j < priv->stencil; j++) { if (i == j) continue; - fact[i] *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]); + val *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]); } + fact[i * priv->fact_stride[dim]] = val; } priv->idx[dim][idx_dst] = idx_src; } + for (int i = 0; i < pad; i++) { + for (int j = 0; j < priv->stencil; j++) + priv->fact[dim][priv->fact_stride[dim] * j + ctx->dst.size[dim] + i] = 0.0; + priv->idx[dim][ctx->dst.size[dim] + i] = priv->idx[dim][ctx->dst.size[dim] - 1]; + } + free(coord_src); return 0; @@ -127,6 +140,7 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim) static int transfer_lagrange_init(GridTransferContext *ctx) { GridTransferLagrange *priv = ctx->priv; + unsigned int pad = 0; int ret; switch (ctx->op) { @@ -143,6 +157,7 @@ static int transfer_lagrange_init(GridTransferContext *ctx) #if HAVE_EXTERNAL_ASM if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) { priv->transfer_cont = mg2di_transfer_interp_line_cont_4_avx2; + pad = 3; } #endif break; @@ -151,15 +166,16 @@ static int transfer_lagrange_init(GridTransferContext *ctx) } for (int dim = 0; dim < 2; dim++) { - priv->idx[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->idx[dim])); + priv->idx[dim] = calloc(ctx->dst.size[dim] + pad, sizeof(*priv->idx[dim])); if (!priv->idx[dim]) return -ENOMEM; - priv->fact[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->fact[dim]) * priv->stencil); + priv->fact[dim] = calloc(ctx->dst.size[dim] + pad, sizeof(*priv->fact[dim]) * priv->stencil); if (!priv->fact[dim]) return -ENOMEM; + priv->fact_stride[dim] = ctx->dst.size[dim] + pad; - ret = transfer_factors_lagrange(ctx, dim); + ret = transfer_factors_lagrange(ctx, dim, pad); if (ret < 0) return ret; } @@ -184,16 +200,19 @@ static int transfer_interp_task(void *arg, unsigned int job_idx, unsigned int th NDArray *dst = priv->dst; const ptrdiff_t idx_y = priv->idx[0][job_idx]; - const double *fact_y = priv->fact[0] + priv->stencil * job_idx; + const double *fact_y = priv->fact[0] + job_idx; if (dst->stride[1] == 1 && src->stride[1] == 1) { priv->transfer_cont(dst->data + job_idx * dst->stride[0], dst->shape[1], src->data + idx_y * src->stride[0], src->stride[0], - priv->idx[1], priv->fact[1], fact_y); + priv->idx[1], priv->fact[1], priv->fact_stride[1], + fact_y, priv->fact_stride[0]); } else { priv->transfer_generic(dst->data + job_idx * dst->stride[0], dst->shape[1], src->data + idx_y * src->stride[0], src->stride[0], - priv->idx[1], priv->fact[1], fact_y, dst->stride[1], src->stride[1]); + priv->idx[1], priv->fact[1], priv->fact_stride[1], + fact_y, priv->fact_stride[0], + dst->stride[1], src->stride[1]); } return 0; diff --git a/transfer_interp.asm b/transfer_interp.asm index b7c9af5..110203e 100644 --- a/transfer_interp.asm +++ b/transfer_interp.asm @@ -22,54 +22,192 @@ SECTION .text +; double +%define ELEM_SIZE 8 + +; transpose a 4x4 matrix of doubles, supplied in %1-%4 +; %5, %6: temporary registers (clobbered) +%macro TRANSPOSE4x4 6 + unpcklpd %5, %1, %2 ; %5 = 00 10 02 12 + unpckhpd %6, %1, %2 ; %6 = 01 11 03 13 + unpcklpd %1, %3, %4 ; %1 = 20 30 22 32 + unpckhpd %4, %3, %4 ; %4 = 21 31 23 33 + + vperm2f128 %3, %5, %1, 00110001b ; %3 = 02 12 22 32 + vinsertf128 %1, %5, xmm%1, 1 ; %1 = 00 10 20 30 + vinsertf128 %2, %6, xmm%4, 1 ; %2 = 01 11 21 31 + vperm2f128 %4, %6, %4, 00110001b ; %4 = 03 13 23 33 +%endmacro + INIT_YMM avx2 -cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\ - idx_x_val +cglobal transfer_interp_line_cont_4, 9, 11, 16, dst, dst_len, src, src_stride, idx_x, fact_x, fact_x_stride, fact_y, fact_y_stride,\ + idx_x_val, fact_x3 + shl fact_y_strideq, 3 + shl fact_x_strideq, 3 shl src_strideq, 3 shl dst_lenq, 3 add dstq, dst_lenq add idx_xq, dst_lenq - lea fact_xq, [fact_xq + 4 * dst_lenq] + add fact_xq, dst_lenq neg dst_lenq ; from now on, the register that held the line size is used as the offset into data arrays %define offsetq dst_lenq - movu m0, [fact_yq] - SPLATPD m1, m0, 1 ; fact y + 1 -> m1 - SPLATPD m2, m0, 2 ; fact y + 2 -> m2 - SPLATPD m3, m0, 3 ; fact y + 3 -> m3 - SPLATPD m0, m0, 0 ; fact y + 0 -> m0 + ; load the y interpolation factors + SPLATPD m0, [fact_yq + fact_y_strideq * 0], 0 ; fact y + 0 -> m0 + SPLATPD m1, [fact_yq + fact_y_strideq * 1], 0 ; fact y + 1 -> m1 + SPLATPD m2, [fact_yq + fact_y_strideq * 2], 0 ; fact y + 2 -> m2 + add fact_yq, fact_y_strideq + SPLATPD m3, [fact_yq + fact_y_strideq * 2], 0 ; fact y + 3 -> m3 + + ; reuse the now unneded fact_y[_stride] registers + %define fact_x1q fact_yq + %define fact_x2q fact_y_strideq + lea fact_x1q, [fact_xq + fact_x_strideq] + lea fact_x2q, [fact_xq + fact_x_strideq * 2] + lea fact_x3q, [fact_x1q + fact_x_strideq * 2] .loop: - mov idx_x_valq, [idx_xq + offsetq] - shl idx_x_valq, 3 + ; load the x interpolation factors + movu m4, [fact_xq + offsetq] ; fact x + 0 -> m4 + movu m5, [fact_x1q + offsetq] ; fact x + 1 -> m5 + movu m6, [fact_x2q + offsetq] ; fact x + 2 -> m6 + movu m7, [fact_x3q + offsetq] ; fact x + 3 -> m7 + + ; load the x indices of the source values + movu m15, [idx_xq + offsetq] + psllq m15, m15, 3 + + movq idx_x_valq, xm15 + movu m8, [srcq + idx_x_valq] + + vpermq m13, m15, 1 + movq idx_x_valq, xm13 + movu m9, [srcq + idx_x_valq] + + vpermq m13, m15, 2 + movq idx_x_valq, xm13 + movu m10, [srcq + idx_x_valq] + + vpermq m13, m15, 3 + movq idx_x_valq, xm13 + movu m11, [srcq + idx_x_valq] + + TRANSPOSE4x4 m8, m9, m10, m11, m12, m13 + + mulpd m14, m8, m4 + vfmadd231pd m14, m9, m5 + vfmadd231pd m14, m10, m6 + vfmadd231pd m14, m11, m7 + mulpd m14, m0 + + movq xm13, src_strideq + vbroadcastsd m13, xm13 + paddq m15, m13 + + movq idx_x_valq, xm15 + movu m8, [srcq + idx_x_valq] + + vpermq m13, m15, 1 + movq idx_x_valq, xm13 + movu m9, [srcq + idx_x_valq] - xorpd m4, m4 + vpermq m13, m15, 2 + movq idx_x_valq, xm13 + movu m10, [srcq + idx_x_valq] - movu m5, [fact_xq + 4 * offsetq] + vpermq m13, m15, 3 + movq idx_x_valq, xm13 + movu m11, [srcq + idx_x_valq] - mulpd m6, m5, [srcq + idx_x_valq] - mulpd m6, m0 + TRANSPOSE4x4 m8, m9, m10, m11, m12, m13 - add idx_x_valq, src_strideq - mulpd m7, m5, [srcq + idx_x_valq] - vfmadd231pd m6, m7, m1 + mulpd m13, m8, m4 + vfmadd231pd m13, m9, m5 + vfmadd231pd m13, m10, m6 + vfmadd231pd m13, m11, m7 + vfmadd231pd m14, m13, m1 - add idx_x_valq, src_strideq - mulpd m7, m5, [srcq + idx_x_valq] - vfmadd231pd m6, m7, m2 + movq xm13, src_strideq + vbroadcastsd m13, xm13 + paddq m15, m13 - add idx_x_valq, src_strideq - mulpd m7, m5, [srcq + idx_x_valq] - vfmadd231pd m6, m7, m3 + movq idx_x_valq, xm15 + movu m8, [srcq + idx_x_valq] - haddpd m6, m6 - vpermq m6, m6, 00001000b - haddpd m6, m6 + vpermq m13, m15, 1 + movq idx_x_valq, xm13 + movu m9, [srcq + idx_x_valq] - movq [dstq + offsetq], xm6 - add offsetq, 8 + vpermq m13, m15, 2 + movq idx_x_valq, xm13 + movu m10, [srcq + idx_x_valq] + + vpermq m13, m15, 3 + movq idx_x_valq, xm13 + movu m11, [srcq + idx_x_valq] + + TRANSPOSE4x4 m8, m9, m10, m11, m12, m13 + + mulpd m13, m8, m4 + vfmadd231pd m13, m9, m5 + vfmadd231pd m13, m10, m6 + vfmadd231pd m13, m11, m7 + vfmadd231pd m14, m13, m2 + + movq xm13, src_strideq + vbroadcastsd m13, xm13 + paddq m15, m13 + + movq idx_x_valq, xm15 + movu m8, [srcq + idx_x_valq] + + vpermq m13, m15, 1 + movq idx_x_valq, xm13 + movu m9, [srcq + idx_x_valq] + + vpermq m13, m15, 2 + movq idx_x_valq, xm13 + movu m10, [srcq + idx_x_valq] + + vpermq m13, m15, 3 + movq idx_x_valq, xm13 + movu m11, [srcq + idx_x_valq] + + TRANSPOSE4x4 m8, m9, m10, m11, m12, m13 + + mulpd m13, m8, m4 + vfmadd231pd m13, m9, m5 + vfmadd231pd m13, m10, m6 + vfmadd231pd m13, m11, m7 + vfmadd231pd m14, m13, m3 + + add offsetq, mmsize + jg .store_partial + movu [dstq + offsetq - mmsize], m14 js .loop + jmp .finish + +.store_partial: + sub offsetq, ELEM_SIZE + jz .store3 + sub offsetq, ELEM_SIZE + jz .store2 + +.store1: + ; offsetq is now mmsize-2 after the write position + movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm14 + jmp .finish +.store2: + ; offsetq is now mmsize-2 after the write position + movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm14 + jmp .finish +.store3: + ; offsetq is now mmsize-1 after the write position + movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm14 + vextractf128 xm14, m14, 1 + movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm14 +.finish: RET diff --git a/transfer_interp_template.c b/transfer_interp_template.c index 65ae98b..ddd270f 100644 --- a/transfer_interp_template.c +++ b/transfer_interp_template.c @@ -29,7 +29,9 @@ static void FUNC(interp_transfer_line)(double *dst, ptrdiff_t dst_len, const double *src, ptrdiff_t src_stride, - const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y + const ptrdiff_t *idx_x, + const double *fact_x, ptrdiff_t fact_x_stride, + const double *fact_y, ptrdiff_t fact_y_stride #if !CONTIGUOUS , ptrdiff_t dst_stride0, ptrdiff_t src_stride0 # define SSTRIDE1 src_stride0 @@ -48,8 +50,8 @@ FUNC(interp_transfer_line)(double *dst, ptrdiff_t dst_len, for (ptrdiff_t idx0 = 0; idx0 < STENCIL; idx0++) { double tmp = 0.0; for (ptrdiff_t idx1 = 0; idx1 < STENCIL; idx1++) - tmp += src_data[idx0 * src_stride + idx1 ] * fact_x[STENCIL * x + idx1]; - val += tmp * fact_y[idx0]; + tmp += src_data[idx0 * src_stride + idx1 ] * fact_x[idx1 * fact_x_stride + x]; + val += tmp * fact_y[idx0 * fact_y_stride]; } dst[x * DSTRIDE1] = val; -- cgit v1.2.3