author    Lauri Kasanen <cand@gmx.com>    2019-03-24 13:45:55 +0200
committer Lauri Kasanen <cand@gmx.com>    2019-03-31 12:41:32 +0300
commit    a6a31ca3d9af907f6d10211af60d0762ee85284e (patch)
tree      dc038a442385ca7c5d7dd1db09f0eab20fbc3014 /libswscale
parent    4e8cbbf70e7a4ca3bb157f31c2f28e2365322b45 (diff)
swscale/ppc: VSX-optimize yuv2422_1
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
    -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
    -cpuflags 0 -v error -

15.3x speedup:

yuyv422
  14513 UNITS in yuv2packed1, 32768 runs, 0 skips
    949 UNITS in yuv2packed1, 32767 runs, 1 skips
yvyu422
  14516 UNITS in yuv2packed1, 32767 runs, 1 skips
    943 UNITS in yuv2packed1, 32767 runs, 1 skips
uyvy422
  14530 UNITS in yuv2packed1, 32767 runs, 1 skips
    941 UNITS in yuv2packed1, 32766 runs, 2 skips
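For orientation, the patch vectorizes swscale's single-input packed-4:2:2 output path. Below is a minimal scalar sketch of the unblended (uvalpha < 2048) YUYV case, reconstructed from the arithmetic visible in the diff rather than copied from FFmpeg's C template; the function name and clip helper are illustrative only. Samples arrive as 7-bit fractional int16 values, so adding 64 and shifting right by 7 rounds them to 8 bits; the blended branch instead sums two chroma rows and rounds with a +128 bias and a shift by 8.

#include <stdint.h>

static inline uint8_t clip_u8(int v) /* mirrors the saturation of vec_packsu */
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Illustrative scalar model: two luma samples and one U/V pair per
 * output macropixel, rounded from 7-bit fractional to 8 bits. */
static void yuyv422_1_scalar(const int16_t *buf0, const int16_t *ubuf0,
                             const int16_t *vbuf0, uint8_t *dest, int dstW)
{
    for (int i = 0; i < (dstW + 1) >> 1; i++) {
        dest[i * 4 + 0] = clip_u8((buf0[i * 2]     + 64) >> 7); /* Y0 */
        dest[i * 4 + 1] = clip_u8((ubuf0[i]        + 64) >> 7); /* U  */
        dest[i * 4 + 2] = clip_u8((buf0[i * 2 + 1] + 64) >> 7); /* Y1 */
        dest[i * 4 + 3] = clip_u8((vbuf0[i]        + 64) >> 7); /* V  */
    }
}

The VSX version below does the same work sixteen luma samples at a time, then interleaves the packed bytes with vec_perm.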
Diffstat (limited to 'libswscale')
-rw-r--r--  libswscale/ppc/swscale_vsx.c | 149
1 file changed, 149 insertions, 0 deletions
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 062ab0dc70..0bb82ac742 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -664,6 +664,143 @@ YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
+static av_always_inline void
+write422(const vector int16_t vy1, const vector int16_t vy2,
+ const vector int16_t vu, const vector int16_t vv,
+ uint8_t *dest, const enum AVPixelFormat target)
+{
+ vector uint8_t vd1, vd2, tmp;
+ const vector uint8_t yuyv1 = (vector uint8_t) {
+ 0x0, 0x10, 0x1, 0x18,
+ 0x2, 0x11, 0x3, 0x19,
+ 0x4, 0x12, 0x5, 0x1a,
+ 0x6, 0x13, 0x7, 0x1b };
+ const vector uint8_t yuyv2 = (vector uint8_t) {
+ 0x8, 0x14, 0x9, 0x1c,
+ 0xa, 0x15, 0xb, 0x1d,
+ 0xc, 0x16, 0xd, 0x1e,
+ 0xe, 0x17, 0xf, 0x1f };
+ const vector uint8_t yvyu1 = (vector uint8_t) {
+ 0x0, 0x18, 0x1, 0x10,
+ 0x2, 0x19, 0x3, 0x11,
+ 0x4, 0x1a, 0x5, 0x12,
+ 0x6, 0x1b, 0x7, 0x13 };
+ const vector uint8_t yvyu2 = (vector uint8_t) {
+ 0x8, 0x1c, 0x9, 0x14,
+ 0xa, 0x1d, 0xb, 0x15,
+ 0xc, 0x1e, 0xd, 0x16,
+ 0xe, 0x1f, 0xf, 0x17 };
+ const vector uint8_t uyvy1 = (vector uint8_t) {
+ 0x10, 0x0, 0x18, 0x1,
+ 0x11, 0x2, 0x19, 0x3,
+ 0x12, 0x4, 0x1a, 0x5,
+ 0x13, 0x6, 0x1b, 0x7 };
+ const vector uint8_t uyvy2 = (vector uint8_t) {
+ 0x14, 0x8, 0x1c, 0x9,
+ 0x15, 0xa, 0x1d, 0xb,
+ 0x16, 0xc, 0x1e, 0xd,
+ 0x17, 0xe, 0x1f, 0xf };
+
+ vd1 = vec_packsu(vy1, vy2);
+ vd2 = vec_packsu(vu, vv);
+
+ switch (target) {
+ case AV_PIX_FMT_YUYV422:
+ tmp = vec_perm(vd1, vd2, yuyv1);
+ vec_st(tmp, 0, dest);
+ tmp = vec_perm(vd1, vd2, yuyv2);
+ vec_st(tmp, 16, dest);
+ break;
+ case AV_PIX_FMT_YVYU422:
+ tmp = vec_perm(vd1, vd2, yvyu1);
+ vec_st(tmp, 0, dest);
+ tmp = vec_perm(vd1, vd2, yvyu2);
+ vec_st(tmp, 16, dest);
+ break;
+ case AV_PIX_FMT_UYVY422:
+ tmp = vec_perm(vd1, vd2, uyvy1);
+ vec_st(tmp, 0, dest);
+ tmp = vec_perm(vd1, vd2, uyvy2);
+ vec_st(tmp, 16, dest);
+ break;
+ }
+}
+
+static av_always_inline void
+yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target)
+{
+ const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+ vector int16_t vy1, vy2, vu, vv, tmp;
+ const vector int16_t add64 = vec_splats((int16_t) 64);
+ const vector int16_t add128 = vec_splats((int16_t) 128);
+ const vector uint16_t shift7 = vec_splat_u16(7);
+ const vector uint16_t shift8 = vec_splat_u16(8);
+ int i;
+
+ if (uvalpha < 2048) {
+ for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
+ vy1 = vec_ld(0, &buf0[i * 2]);
+ vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
+ vu = vec_ld(0, &ubuf0[i]);
+ vv = vec_ld(0, &vbuf0[i]);
+
+ vy1 = vec_add(vy1, add64);
+ vy2 = vec_add(vy2, add64);
+ vu = vec_add(vu, add64);
+ vv = vec_add(vv, add64);
+
+ vy1 = vec_sra(vy1, shift7);
+ vy2 = vec_sra(vy2, shift7);
+ vu = vec_sra(vu, shift7);
+ vv = vec_sra(vv, shift7);
+
+ write422(vy1, vy2, vu, vv, &dest[i * 4], target);
+ }
+ } else {
+ const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+ for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
+ vy1 = vec_ld(0, &buf0[i * 2]);
+ vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
+ vu = vec_ld(0, &ubuf0[i]);
+ tmp = vec_ld(0, &ubuf1[i]);
+ vu = vec_adds(vu, tmp);
+ vv = vec_ld(0, &vbuf0[i]);
+ tmp = vec_ld(0, &vbuf1[i]);
+ vv = vec_adds(vv, tmp);
+
+ vy1 = vec_add(vy1, add64);
+ vy2 = vec_add(vy2, add64);
+ vu = vec_adds(vu, add128);
+ vv = vec_adds(vv, add128);
+
+ vy1 = vec_sra(vy1, shift7);
+ vy2 = vec_sra(vy2, shift7);
+ vu = vec_sra(vu, shift8);
+ vv = vec_sra(vv, shift8);
+
+ write422(vy1, vy2, vu, vv, &dest[i * 4], target);
+ }
+ }
+}
+
+#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
+static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
+ const int16_t *ubuf[2], const int16_t *vbuf[2], \
+ const int16_t *abuf0, uint8_t *dest, int dstW, \
+ int uvalpha, int y) \
+{ \
+ name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
+ abuf0, dest, dstW, uvalpha, \
+ y, fmt); \
+}
+
+YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
+YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
+YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
+
#endif /* !HAVE_BIGENDIAN */
#endif /* HAVE_VSX */
@@ -768,6 +905,18 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
}
break;
}
+ } else { /* !SWS_FULL_CHR_H_INT */
+ switch (dstFormat) {
+ case AV_PIX_FMT_YUYV422:
+ c->yuv2packed1 = yuv2yuyv422_1_vsx;
+ break;
+ case AV_PIX_FMT_YVYU422:
+ c->yuv2packed1 = yuv2yvyu422_1_vsx;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->yuv2packed1 = yuv2uyvy422_1_vsx;
+ break;
+ }
}
#endif /* !HAVE_BIGENDIAN */
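
The heart of write422() is the pair of vec_perm control vectors per format: the two vec_packsu() calls narrow the sixteen luma samples into vd1 and the eight U plus eight V samples into vd2, and each permute map selects bytes from the 32-byte concatenation of the two (indices 0x00-0x0f address vd1, 0x10-0x1f address vd2) to emit the interleaved macropixels in two 16-byte stores. Here is a standalone scalar model of that selection using the yuyv1 map from the patch; it shows the textbook big-endian vec_perm semantics and glosses over the little-endian element-order details the real build (guarded by !HAVE_BIGENDIAN) has to respect.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vec_perm: out[i] = concat(a, b)[map[i]], where
 * indices 0x00-0x0f select from a and 0x10-0x1f select from b. */
static void perm_model(const uint8_t a[16], const uint8_t b[16],
                       const uint8_t map[16], uint8_t out[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = map[i] < 0x10 ? a[map[i]] : b[map[i] - 0x10];
}

int main(void)
{
    /* vd1 = packed luma Y0..Y15, vd2 = packed chroma U0..U7 V0..V7,
     * as produced by the two vec_packsu() calls in write422(). */
    uint8_t vd1[16], vd2[16], out[16];
    const uint8_t yuyv1[16] = { 0x0, 0x10, 0x1, 0x18, 0x2, 0x11, 0x3, 0x19,
                                0x4, 0x12, 0x5, 0x1a, 0x6, 0x13, 0x7, 0x1b };

    for (int i = 0; i < 16; i++) {
        vd1[i] = 0x00 + i;          /* stand-ins for Y0..Y15 */
        vd2[i] = 0x40 + i;          /* stand-ins for U0..U7, V0..V7 */
    }
    perm_model(vd1, vd2, yuyv1, out);
    for (int i = 0; i < 16; i++)    /* prints 00 40 01 48 02 41 03 49 ...
                                     * i.e. Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... */
        printf("%02x ", out[i]);
    printf("\n");
    return 0;
}

yuyv2 covers the other eight pixels of the group, and the yvyu/uyvy maps merely reorder the same source bytes, which is why a single write422() body serves all three output formats.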