From 7b19e76aeb0ace57b99aaef156bbfe592e43e65e Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Tue, 15 Aug 2017 20:12:32 +0200 Subject: avfilter/vf_transpose: rewrite for x86 SIMD Transpose first in chunks of 8x8 blocks. 15% faster overall. --- libavfilter/vf_transpose.c | 182 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 141 insertions(+), 41 deletions(-) (limited to 'libavfilter/vf_transpose.c') diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c index 75b4dda41f..982fb0c8ca 100644 --- a/libavfilter/vf_transpose.c +++ b/libavfilter/vf_transpose.c @@ -58,6 +58,12 @@ typedef struct TransContext { int passthrough; ///< PassthroughType, landscape passthrough mode enabled int dir; ///< TransposeDir + + void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize); + void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h); } TransContext; static int query_formats(AVFilterContext *ctx) @@ -79,6 +85,109 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, pix_fmts); } +static inline void transpose_block_8_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h) +{ + int x, y; + for (y = 0; y < h; y++, dst += dst_linesize, src++) + for (x = 0; x < w; x++) + dst[x] = src[x*src_linesize]; +} + +static void transpose_8x8_8_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize) +{ + transpose_block_8_c(src, src_linesize, dst, dst_linesize, 8, 8); +} + +static inline void transpose_block_16_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h) +{ + int x, y; + for (y = 0; y < h; y++, dst += dst_linesize, src += 2) + for (x = 0; x < w; x++) + *((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src + x*src_linesize)); +} + +static void transpose_8x8_16_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize) +{ + transpose_block_16_c(src, src_linesize, dst, dst_linesize, 8, 8); +} + +static inline void transpose_block_24_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h) +{ + int x, y; + for (y = 0; y < h; y++, dst += dst_linesize) { + for (x = 0; x < w; x++) { + int32_t v = AV_RB24(src + x*src_linesize + y*3); + AV_WB24(dst + 3*x, v); + } + } +} + +static void transpose_8x8_24_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize) +{ + transpose_block_24_c(src, src_linesize, dst, dst_linesize, 8, 8); +} + +static inline void transpose_block_32_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h) +{ + int x, y; + for (y = 0; y < h; y++, dst += dst_linesize, src += 4) { + for (x = 0; x < w; x++) + *((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src + x*src_linesize)); + } +} + +static void transpose_8x8_32_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize) +{ + transpose_block_32_c(src, src_linesize, dst, dst_linesize, 8, 8); +} + +static inline void transpose_block_48_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h) +{ + int x, y; + for (y = 0; y < h; y++, dst += dst_linesize, src += 6) { + for (x = 0; x < w; x++) { + int64_t v = AV_RB48(src + x*src_linesize); + AV_WB48(dst + 6*x, v); + } + } +} + +static void transpose_8x8_48_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize) +{ + transpose_block_48_c(src, src_linesize, dst, dst_linesize, 8, 8); +} + +static inline void transpose_block_64_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize, + int w, int h) +{ + int x, y; + for (y = 0; y < h; y++, dst += dst_linesize, src += 8) + for (x = 0; x < w; x++) + *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x*src_linesize)); +} + +static void transpose_8x8_64_c(uint8_t *src, ptrdiff_t src_linesize, + uint8_t *dst, ptrdiff_t dst_linesize) +{ + transpose_block_64_c(src, src_linesize, dst, dst_linesize, 8, 8); +} + static int config_props_output(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; @@ -118,6 +227,21 @@ static int config_props_output(AVFilterLink *outlink) else outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; + switch (s->pixsteps[0]) { + case 1: s->transpose_block = transpose_block_8_c; + s->transpose_8x8 = transpose_8x8_8_c; break; + case 2: s->transpose_block = transpose_block_16_c; + s->transpose_8x8 = transpose_8x8_16_c; break; + case 3: s->transpose_block = transpose_block_24_c; + s->transpose_8x8 = transpose_8x8_24_c; break; + case 4: s->transpose_block = transpose_block_32_c; + s->transpose_8x8 = transpose_8x8_32_c; break; + case 6: s->transpose_block = transpose_block_48_c; + s->transpose_8x8 = transpose_8x8_48_c; break; + case 8: s->transpose_block = transpose_block_64_c; + s->transpose_8x8 = transpose_8x8_64_c; break; + } + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n", inlink->w, inlink->h, s->dir, outlink->w, outlink->h, @@ -176,49 +300,25 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, dstlinesize *= -1; } - switch (pixstep) { - case 1: - for (y = start; y < end; y++, dst += dstlinesize) - for (x = 0; x < outw; x++) - dst[x] = src[x * srclinesize + y]; - break; - case 2: - for (y = start; y < end; y++, dst += dstlinesize) { - for (x = 0; x < outw; x++) - *((uint16_t *)(dst + 2 * x)) = - *((uint16_t *)(src + x * srclinesize + y * 2)); - } - break; - case 3: - for (y = start; y < end; y++, dst += dstlinesize) { - for (x = 0; x < outw; x++) { - int32_t v = AV_RB24(src + x * srclinesize + y * 3); - AV_WB24(dst + 3 * x, v); - } - } - break; - case 4: - for (y = start; y < end; y++, dst += dstlinesize) { - for (x = 0; x < outw; x++) - *((uint32_t *)(dst + 4 * x)) = - *((uint32_t *)(src + x * srclinesize + y * 4)); + for (y = start; y < end - 7; y += 8) { + for (x = 0; x < outw - 7; x += 8) { + s->transpose_8x8(src + x * srclinesize + y * pixstep, + srclinesize, + dst + (y - start) * dstlinesize + x * pixstep, + dstlinesize); } - break; - case 6: - for (y = start; y < end; y++, dst += dstlinesize) { - for (x = 0; x < outw; x++) { - int64_t v = AV_RB48(src + x * srclinesize + y*6); - AV_WB48(dst + 6*x, v); - } - } - break; - case 8: - for (y = start; y < end; y++, dst += dstlinesize) { - for (x = 0; x < outw; x++) - *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x * srclinesize + y*8)); - } - break; + if (outw - x > 0 && end - y > 0) + s->transpose_block(src + x * srclinesize + y * pixstep, + srclinesize, + dst + (y - start) * dstlinesize + x * pixstep, + dstlinesize, outw - x, end - y); } + + if (end - y > 0) + s->transpose_block(src + 0 * srclinesize + y * pixstep, + srclinesize, + dst + (y - start) * dstlinesize + 0 * pixstep, + dstlinesize, outw, end - y); } return 0; -- cgit v1.2.3