author     Anton Khirnov <anton@khirnov.net>    2019-04-09 19:20:34 +0200
committer  Anton Khirnov <anton@khirnov.net>    2019-04-09 19:20:34 +0200
commit     df2eddaf8d1f999021cd097a0716e7a8b23751ee (patch)
tree       325028205605f15f750b89520d2f34d501ade8c8
parent     264d8ce5e39676582f2e6a65cf517924846070b9 (diff)

-rw-r--r--  transfer.c                   45
-rw-r--r--  transfer_interp.asm         194
-rw-r--r--  transfer_interp_template.c    8

3 files changed, 203 insertions, 44 deletions
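
Read as a whole, the diff switches the precomputed Lagrange interpolation factors from an interleaved layout, where all stencil taps of one destination point are stored next to each other, to a planar strided layout with one plane per tap, padded at the end of each line, and rewrites the AVX2 kernel to produce four destination values per iteration from those contiguous weights. A minimal sketch of the indexing change, using the names from the diff (fact_old()/fact_new() are illustrative helpers, not code from the repository):

#include <stddef.h>

/* Illustrative helpers only: weight of tap `tap` for destination point `x`. */
static inline double fact_old(const double *fact, int stencil,
                              ptrdiff_t x, int tap)
{
    return fact[x * stencil + tap];      /* old: taps interleaved per point */
}

static inline double fact_new(const double *fact, ptrdiff_t fact_stride,
                              ptrdiff_t x, int tap)
{
    return fact[tap * fact_stride + x];  /* new: one plane per tap, points contiguous */
}

With the new layout, the weights of one tap for four consecutive destination points are adjacent in memory, which is exactly what a single unaligned 256-bit load fetches in the rewritten kernel; fact_stride is ctx->dst.size[dim] plus the padding requested by the SIMD path.
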
diff --git a/transfer.c b/transfer.c
index 98051de..9ea511e 100644
--- a/transfer.c
+++ b/transfer.c
@@ -43,10 +43,14 @@ typedef struct GridTransferLagrange {
void (*transfer_cont) (double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
- const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y);
+ const ptrdiff_t *idx_x,
+ const double *fact_x, ptrdiff_t fact_x_stride,
+ const double *fact_y, ptrdiff_t fact_y_stride);
void (*transfer_generic)(double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
- const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y,
+ const ptrdiff_t *idx_x,
+ const double *fact_x, ptrdiff_t fact_x_stride,
+ const double *fact_y, ptrdiff_t fact_y_stride,
ptrdiff_t dst_stride0, ptrdiff_t src_stride0);
const NDArray *src;
@@ -54,13 +58,15 @@ typedef struct GridTransferLagrange {
ptrdiff_t *idx[2];
double *fact[2];
+ ptrdiff_t fact_stride[2];
} GridTransferLagrange;
#if HAVE_EXTERNAL_ASM
void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
const ptrdiff_t *idx_x,
- const double *fact_x, const double *fact_y);
+ const double *fact_x, ptrdiff_t fact_x_stride,
+ const double *fact_y, ptrdiff_t fact_y_stride);
#endif
#define STENCIL 2
@@ -84,7 +90,7 @@ void mg2di_transfer_interp_line_cont_4_avx2(double *dst, ptrdiff_t dst_len,
#undef STENCIL
// generate the interpolation source indices and weights
-static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim)
+static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim, unsigned int pad)
{
GridTransferLagrange *priv = ctx->priv;
const double step_dst = ctx->dst.step[dim];
@@ -101,24 +107,31 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim)
const ptrdiff_t idx_src = (ptrdiff_t)(scale * idx_dst) - (priv->stencil / 2 - 1);
const double coord_dst = idx_dst * step_dst;
- double *fact = priv->fact[dim] + priv->stencil * idx_dst;
+ double *fact = priv->fact[dim] + idx_dst;
for (int i = 0; i < priv->stencil; i++)
coord_src[i] = (idx_src + i) * step_src;
for (int i = 0; i < priv->stencil; i++) {
- fact[i] = 1.0;
+ double val = 1.0;
for (int j = 0; j < priv->stencil; j++) {
if (i == j)
continue;
- fact[i] *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]);
+ val *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]);
}
+ fact[i * priv->fact_stride[dim]] = val;
}
priv->idx[dim][idx_dst] = idx_src;
}
+ for (int i = 0; i < pad; i++) {
+ for (int j = 0; j < priv->stencil; j++)
+ priv->fact[dim][priv->fact_stride[dim] * j + ctx->dst.size[dim] + i] = 0.0;
+ priv->idx[dim][ctx->dst.size[dim] + i] = priv->idx[dim][ctx->dst.size[dim] - 1];
+ }
+
free(coord_src);
return 0;
@@ -127,6 +140,7 @@ static int transfer_factors_lagrange(GridTransferContext *ctx, unsigned int dim)
static int transfer_lagrange_init(GridTransferContext *ctx)
{
GridTransferLagrange *priv = ctx->priv;
+ unsigned int pad = 0;
int ret;
switch (ctx->op) {
@@ -143,6 +157,7 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
#if HAVE_EXTERNAL_ASM
if (ctx->cpuflags & MG2DI_CPU_FLAG_AVX2) {
priv->transfer_cont = mg2di_transfer_interp_line_cont_4_avx2;
+ pad = 3;
}
#endif
break;
@@ -151,15 +166,16 @@ static int transfer_lagrange_init(GridTransferContext *ctx)
}
for (int dim = 0; dim < 2; dim++) {
- priv->idx[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->idx[dim]));
+ priv->idx[dim] = calloc(ctx->dst.size[dim] + pad, sizeof(*priv->idx[dim]));
if (!priv->idx[dim])
return -ENOMEM;
- priv->fact[dim] = calloc(ctx->dst.size[dim], sizeof(*priv->fact[dim]) * priv->stencil);
+ priv->fact[dim] = calloc(ctx->dst.size[dim] + pad, sizeof(*priv->fact[dim]) * priv->stencil);
if (!priv->fact[dim])
return -ENOMEM;
+ priv->fact_stride[dim] = ctx->dst.size[dim] + pad;
- ret = transfer_factors_lagrange(ctx, dim);
+ ret = transfer_factors_lagrange(ctx, dim, pad);
if (ret < 0)
return ret;
}
@@ -184,16 +200,19 @@ static int transfer_interp_task(void *arg, unsigned int job_idx, unsigned int th
NDArray *dst = priv->dst;
const ptrdiff_t idx_y = priv->idx[0][job_idx];
- const double *fact_y = priv->fact[0] + priv->stencil * job_idx;
+ const double *fact_y = priv->fact[0] + job_idx;
if (dst->stride[1] == 1 && src->stride[1] == 1) {
priv->transfer_cont(dst->data + job_idx * dst->stride[0], dst->shape[1],
src->data + idx_y * src->stride[0], src->stride[0],
- priv->idx[1], priv->fact[1], fact_y);
+ priv->idx[1], priv->fact[1], priv->fact_stride[1],
+ fact_y, priv->fact_stride[0]);
} else {
priv->transfer_generic(dst->data + job_idx * dst->stride[0], dst->shape[1],
src->data + idx_y * src->stride[0], src->stride[0],
- priv->idx[1], priv->fact[1], fact_y, dst->stride[1], src->stride[1]);
+ priv->idx[1], priv->fact[1], priv->fact_stride[1],
+ fact_y, priv->fact_stride[0],
+ dst->stride[1], src->stride[1]);
}
return 0;
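
The weights themselves are unchanged by this commit; only their storage layout is. For reference, transfer_factors_lagrange() evaluates the standard Lagrange basis polynomials of the stencil's source coordinates at the destination coordinate. A standalone sketch of that inner computation, mirroring the variable names in the hunk above (the wrapper function is illustrative, not part of the code base):

/* w[i] = prod_{j != i} (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]) */
static void lagrange_weights(double *w, const double *coord_src,
                             int stencil, double coord_dst)
{
    for (int i = 0; i < stencil; i++) {
        double val = 1.0;
        for (int j = 0; j < stencil; j++) {
            if (i == j)
                continue;
            val *= (coord_dst - coord_src[j]) / (coord_src[i] - coord_src[j]);
        }
        w[i] = val;  /* the real code stores this strided: fact[i * fact_stride[dim]] */
    }
}
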
diff --git a/transfer_interp.asm b/transfer_interp.asm
index b7c9af5..110203e 100644
--- a/transfer_interp.asm
+++ b/transfer_interp.asm
@@ -22,54 +22,192 @@
SECTION .text
+; double
+%define ELEM_SIZE 8
+
+; transpose a 4x4 matrix of doubles, supplied in %1-%4
+; %5, %6: temporary registers (clobbered)
+%macro TRANSPOSE4x4 6
+ unpcklpd %5, %1, %2 ; %5 = 00 10 02 12
+ unpckhpd %6, %1, %2 ; %6 = 01 11 03 13
+ unpcklpd %1, %3, %4 ; %1 = 20 30 22 32
+ unpckhpd %4, %3, %4 ; %4 = 21 31 23 33
+
+ vperm2f128 %3, %5, %1, 00110001b ; %3 = 02 12 22 32
+ vinsertf128 %1, %5, xmm%1, 1 ; %1 = 00 10 20 30
+ vinsertf128 %2, %6, xmm%4, 1 ; %2 = 01 11 21 31
+ vperm2f128 %4, %6, %4, 00110001b ; %4 = 03 13 23 33
+%endmacro
+
INIT_YMM avx2
-cglobal transfer_interp_line_cont_4, 7, 8, 6, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
- idx_x_val
+cglobal transfer_interp_line_cont_4, 9, 11, 16, dst, dst_len, src, src_stride, idx_x, fact_x, fact_x_stride, fact_y, fact_y_stride,\
+ idx_x_val, fact_x3
+ shl fact_y_strideq, 3
+ shl fact_x_strideq, 3
shl src_strideq, 3
shl dst_lenq, 3
add dstq, dst_lenq
add idx_xq, dst_lenq
- lea fact_xq, [fact_xq + 4 * dst_lenq]
+ add fact_xq, dst_lenq
neg dst_lenq
; from now on, the register that held the line size is used as the offset into data arrays
%define offsetq dst_lenq
- movu m0, [fact_yq]
- SPLATPD m1, m0, 1 ; fact y + 1 -> m1
- SPLATPD m2, m0, 2 ; fact y + 2 -> m2
- SPLATPD m3, m0, 3 ; fact y + 3 -> m3
- SPLATPD m0, m0, 0 ; fact y + 0 -> m0
+ ; load the y interpolation factors
+ SPLATPD m0, [fact_yq + fact_y_strideq * 0], 0 ; fact y + 0 -> m0
+ SPLATPD m1, [fact_yq + fact_y_strideq * 1], 0 ; fact y + 1 -> m1
+ SPLATPD m2, [fact_yq + fact_y_strideq * 2], 0 ; fact y + 2 -> m2
+ add fact_yq, fact_y_strideq
+ SPLATPD m3, [fact_yq + fact_y_strideq * 2], 0 ; fact y + 3 -> m3
+
+ ; reuse the now unneeded fact_y[_stride] registers
+ %define fact_x1q fact_yq
+ %define fact_x2q fact_y_strideq
+ lea fact_x1q, [fact_xq + fact_x_strideq]
+ lea fact_x2q, [fact_xq + fact_x_strideq * 2]
+ lea fact_x3q, [fact_x1q + fact_x_strideq * 2]
.loop:
- mov idx_x_valq, [idx_xq + offsetq]
- shl idx_x_valq, 3
+ ; load the x interpolation factors
+ movu m4, [fact_xq + offsetq] ; fact x + 0 -> m4
+ movu m5, [fact_x1q + offsetq] ; fact x + 1 -> m5
+ movu m6, [fact_x2q + offsetq] ; fact x + 2 -> m6
+ movu m7, [fact_x3q + offsetq] ; fact x + 3 -> m7
+
+ ; load the x indices of the source values
+ movu m15, [idx_xq + offsetq]
+ psllq m15, m15, 3
+
+ movq idx_x_valq, xm15
+ movu m8, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 1
+ movq idx_x_valq, xm13
+ movu m9, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 2
+ movq idx_x_valq, xm13
+ movu m10, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 3
+ movq idx_x_valq, xm13
+ movu m11, [srcq + idx_x_valq]
+
+ TRANSPOSE4x4 m8, m9, m10, m11, m12, m13
+
+ mulpd m14, m8, m4
+ vfmadd231pd m14, m9, m5
+ vfmadd231pd m14, m10, m6
+ vfmadd231pd m14, m11, m7
+ mulpd m14, m0
+
+ movq xm13, src_strideq
+ vbroadcastsd m13, xm13
+ paddq m15, m13
+
+ movq idx_x_valq, xm15
+ movu m8, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 1
+ movq idx_x_valq, xm13
+ movu m9, [srcq + idx_x_valq]
- xorpd m4, m4
+ vpermq m13, m15, 2
+ movq idx_x_valq, xm13
+ movu m10, [srcq + idx_x_valq]
- movu m5, [fact_xq + 4 * offsetq]
+ vpermq m13, m15, 3
+ movq idx_x_valq, xm13
+ movu m11, [srcq + idx_x_valq]
- mulpd m6, m5, [srcq + idx_x_valq]
- mulpd m6, m0
+ TRANSPOSE4x4 m8, m9, m10, m11, m12, m13
- add idx_x_valq, src_strideq
- mulpd m7, m5, [srcq + idx_x_valq]
- vfmadd231pd m6, m7, m1
+ mulpd m13, m8, m4
+ vfmadd231pd m13, m9, m5
+ vfmadd231pd m13, m10, m6
+ vfmadd231pd m13, m11, m7
+ vfmadd231pd m14, m13, m1
- add idx_x_valq, src_strideq
- mulpd m7, m5, [srcq + idx_x_valq]
- vfmadd231pd m6, m7, m2
+ movq xm13, src_strideq
+ vbroadcastsd m13, xm13
+ paddq m15, m13
- add idx_x_valq, src_strideq
- mulpd m7, m5, [srcq + idx_x_valq]
- vfmadd231pd m6, m7, m3
+ movq idx_x_valq, xm15
+ movu m8, [srcq + idx_x_valq]
- haddpd m6, m6
- vpermq m6, m6, 00001000b
- haddpd m6, m6
+ vpermq m13, m15, 1
+ movq idx_x_valq, xm13
+ movu m9, [srcq + idx_x_valq]
- movq [dstq + offsetq], xm6
- add offsetq, 8
+ vpermq m13, m15, 2
+ movq idx_x_valq, xm13
+ movu m10, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 3
+ movq idx_x_valq, xm13
+ movu m11, [srcq + idx_x_valq]
+
+ TRANSPOSE4x4 m8, m9, m10, m11, m12, m13
+
+ mulpd m13, m8, m4
+ vfmadd231pd m13, m9, m5
+ vfmadd231pd m13, m10, m6
+ vfmadd231pd m13, m11, m7
+ vfmadd231pd m14, m13, m2
+
+ movq xm13, src_strideq
+ vbroadcastsd m13, xm13
+ paddq m15, m13
+
+ movq idx_x_valq, xm15
+ movu m8, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 1
+ movq idx_x_valq, xm13
+ movu m9, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 2
+ movq idx_x_valq, xm13
+ movu m10, [srcq + idx_x_valq]
+
+ vpermq m13, m15, 3
+ movq idx_x_valq, xm13
+ movu m11, [srcq + idx_x_valq]
+
+ TRANSPOSE4x4 m8, m9, m10, m11, m12, m13
+
+ mulpd m13, m8, m4
+ vfmadd231pd m13, m9, m5
+ vfmadd231pd m13, m10, m6
+ vfmadd231pd m13, m11, m7
+ vfmadd231pd m14, m13, m3
+
+ add offsetq, mmsize
+ jg .store_partial
+ movu [dstq + offsetq - mmsize], m14
js .loop
+ jmp .finish
+
+.store_partial:
+ sub offsetq, ELEM_SIZE
+ jz .store3
+ sub offsetq, ELEM_SIZE
+ jz .store2
+
+.store1:
+ ; offsetq is now mmsize - 2*ELEM_SIZE bytes after the write position
+ movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm14
+ jmp .finish
+.store2:
+ ; offsetq is now mmsize - 2*ELEM_SIZE bytes after the write position
+ movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm14
+ jmp .finish
+.store3:
+ ; offsetq is now mmsize - 1*ELEM_SIZE bytes after the write position
+ movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm14
+ vextractf128 xm14, m14, 1
+ movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm14
+.finish:
RET
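
For readers who do not want to trace the register allocation: each iteration of the rewritten loop computes four consecutive destination values as a 4x4 tensor-product interpolation, gathering four source rows per y tap, transposing them so that like x taps line up in one register, and accumulating with FMA; a line's 1 to 3 leftover values are written by the .store_partial path. A scalar C rendering of what one iteration computes, with strides in elements rather than bytes (a reference sketch, not code from the repository):

#include <stddef.h>

static void interp4_scalar(double *dst, const double *src, ptrdiff_t src_stride,
                           const ptrdiff_t *idx_x,
                           const double *fact_x, ptrdiff_t fact_x_stride,
                           const double *fact_y, ptrdiff_t fact_y_stride,
                           ptrdiff_t x)
{
    for (int k = 0; k < 4; k++) {            /* four destination points   */
        double val = 0.0;
        for (int iy = 0; iy < 4; iy++) {     /* four source rows (y taps) */
            double tmp = 0.0;
            for (int ix = 0; ix < 4; ix++)   /* four x taps               */
                tmp += src[iy * src_stride + idx_x[x + k] + ix] *
                       fact_x[ix * fact_x_stride + x + k];
            val += tmp * fact_y[iy * fact_y_stride];
        }
        dst[x + k] = val;
    }
}

In the assembly, src already points at the row selected by the y index in the caller, the four fact_x loads per group are single movu instructions thanks to the new layout, and m14 carries val for all four points at once.
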
diff --git a/transfer_interp_template.c b/transfer_interp_template.c
index 65ae98b..ddd270f 100644
--- a/transfer_interp_template.c
+++ b/transfer_interp_template.c
@@ -29,7 +29,9 @@
static void
FUNC(interp_transfer_line)(double *dst, ptrdiff_t dst_len,
const double *src, ptrdiff_t src_stride,
- const ptrdiff_t *idx_x, const double *fact_x, const double *fact_y
+ const ptrdiff_t *idx_x,
+ const double *fact_x, ptrdiff_t fact_x_stride,
+ const double *fact_y, ptrdiff_t fact_y_stride
#if !CONTIGUOUS
, ptrdiff_t dst_stride0, ptrdiff_t src_stride0
# define SSTRIDE1 src_stride0
@@ -48,8 +50,8 @@ FUNC(interp_transfer_line)(double *dst, ptrdiff_t dst_len,
for (ptrdiff_t idx0 = 0; idx0 < STENCIL; idx0++) {
double tmp = 0.0;
for (ptrdiff_t idx1 = 0; idx1 < STENCIL; idx1++)
- tmp += src_data[idx0 * src_stride + idx1 ] * fact_x[STENCIL * x + idx1];
- val += tmp * fact_y[idx0];
+ tmp += src_data[idx0 * src_stride + idx1 ] * fact_x[idx1 * fact_x_stride + x];
+ val += tmp * fact_y[idx0 * fact_y_stride];
}
dst[x * DSTRIDE1] = val;
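
A closing note on the padding that ties the C side to the assembly: when the AVX2 kernel is selected, the factor and index arrays are over-allocated by pad = 3 destination points per line, the extra weights are zeroed and the extra source indices clamped to the last valid one, so the vector loop can always read a full group of four without touching unallocated memory or out-of-range source rows; only the valid lanes are stored back via .store_partial. A condensed sketch equivalent to the pad loop added to transfer_factors_lagrange() (an illustrative wrapper, not code from the repository):

#include <stddef.h>

/* Zero the trailing `pad` columns of every tap plane and clamp their
 * source indices, so over-reading by the 4-wide kernel is harmless. */
static void pad_factors(double *fact, ptrdiff_t *idx,
                        ptrdiff_t fact_stride, ptrdiff_t dst_size,
                        int stencil, int pad)
{
    for (int i = 0; i < pad; i++) {
        for (int j = 0; j < stencil; j++)
            fact[j * fact_stride + dst_size + i] = 0.0;
        idx[dst_size + i] = idx[dst_size - 1];
    }
}
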