From d0e4d0429ed2841283e31d5f2cb6b1c56210382c Mon Sep 17 00:00:00 2001
From: Lauri Kasanen
Date: Tue, 23 Apr 2019 14:54:46 +0300
Subject: swscale/ppc: VSX-optimize hscale_fast

./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags fast_bilinear \
        -s 2400x720 -f rawvideo -vframes 5 -pix_fmt abgr -nostats test.raw

4.27 speedup for hyscale_fast:
  24796 UNITS in hyscale_fast,    4096 runs,      0 skips
   5797 UNITS in hyscale_fast,    4096 runs,      0 skips

4.48 speedup for hcscale_fast:
  19911 UNITS in hcscale_fast,    4095 runs,      1 skips
   4437 UNITS in hcscale_fast,    4096 runs,      0 skips
---
 libswscale/ppc/swscale_vsx.c | 247 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 247 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index ba00791dcb..2e20ab388a 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -1661,6 +1661,247 @@ YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
 YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
 
+/**
+ * Fast bilinear horizontal luma scaler, 16 output samples per iteration.
+ *
+ * With xx = xpos >> 16 and alpha = (xpos & 0xffff) >> 9, where xpos is a
+ * 16.16 fixed-point position advancing by xInc per sample, each output is
+ *     src[xx] * (128 - alpha) + src[xx + 1] * alpha
+ * which equals (src[xx] << 7) + (src[xx + 1] - src[xx]) * alpha. Both
+ * products are unsigned 8x8 -> 16 bit, so neighbours differing by more
+ * than 127 are handled; a signed 8-bit difference would wrap for those.
+ *
+ * The 16 samples of one iteration are gathered by a byte permute from one
+ * 16-byte load, so every source index must lie within 15 of xx, i.e.
+ * xInc <= 65536. ff_sws_init_swscale_vsx() installs this function only
+ * when dstW >= srcW and chrDstW >= chrSrcW.
+ *
+ * NOTE(review): the loop reads 16 bytes at src[xx] and src[xx + 1] and
+ * stores whole groups of 16 samples with vec_st, so it presumably relies
+ * on padded source lines and a 16-byte aligned, padded dst - confirm
+ * against the callers.
+ *
+ * @param c        scaler context, unused here
+ * @param dst      output samples
+ * @param dstWidth number of output samples
+ * @param src      8-bit input samples
+ * @param srcW     number of input samples
+ * @param xInc     16.16 fixed-point source step per output sample
+ */
+static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
+                             const uint8_t *src, int srcW, int xInc)
+{
+    int i;
+    unsigned int xpos = 0, xx;
+    vector uint8_t vin, vin2, vperm;
+    vector uint8_t valpha, vbeta;
+    vector uint16_t vtmp, vtmp2;
+    vector uint16_t vd_l, vd_r, vcoord16[2];
+    vector uint32_t vcoord[4];
+    const vector uint8_t vfull = vec_splats((uint8_t) 128);
+    const vector uint32_t vadd = (vector uint32_t) {
+        0,
+        xInc * 1,
+        xInc * 2,
+        xInc * 3,
+    };
+    const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
+        0,
+        xInc * 1,
+        xInc * 2,
+        xInc * 3,
+        xInc * 4,
+        xInc * 5,
+        xInc * 6,
+        xInc * 7,
+    };
+    const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
+    const vector uint16_t vshift9 = vec_splat_u16(9);
+
+    for (i = 0; i < dstWidth; i += 16) {
+        // Fractional weights: ((xpos + k * xInc) & 0xffff) >> 9 for k = 0..15
+        vcoord16[0] = vec_splats((uint16_t) xpos);
+        vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
+
+        vcoord16[0] = vec_add(vcoord16[0], vadd16);
+        vcoord16[1] = vec_add(vcoord16[1], vadd16);
+
+        vcoord16[0] = vec_sr(vcoord16[0], vshift9);
+        vcoord16[1] = vec_sr(vcoord16[1], vshift9);
+        valpha = vec_pack(vcoord16[0], vcoord16[1]);
+        vbeta = vec_sub(vfull, valpha);
+
+        xx = xpos >> 16;
+        vin = vec_vsx_ld(0, &src[xx]);
+
+        // Integer source offsets relative to xx, used as permute indices
+        vcoord[0] = vec_splats(xpos & 0xffff);
+        vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
+        vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
+        vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
+
+        vcoord[0] = vec_add(vcoord[0], vadd);
+        vcoord[1] = vec_add(vcoord[1], vadd);
+        vcoord[2] = vec_add(vcoord[2], vadd);
+        vcoord[3] = vec_add(vcoord[3], vadd);
+
+        vcoord[0] = vec_sr(vcoord[0], vshift16);
+        vcoord[1] = vec_sr(vcoord[1], vshift16);
+        vcoord[2] = vec_sr(vcoord[2], vshift16);
+        vcoord[3] = vec_sr(vcoord[3], vshift16);
+
+        vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
+        vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
+        vperm = vec_pack(vcoord16[0], vcoord16[1]);
+
+        vin = vec_perm(vin, vin, vperm);
+
+        vin2 = vec_vsx_ld(1, &src[xx]);
+        vin2 = vec_perm(vin2, vin2, vperm);
+
+        // src[xx] * (128 - alpha); even/odd products are re-interleaved
+        vtmp = vec_mule(vin, vbeta);
+        vtmp2 = vec_mulo(vin, vbeta);
+        vd_l = vec_mergeh(vtmp, vtmp2);
+        vd_r = vec_mergel(vtmp, vtmp2);
+
+        // ... + src[xx + 1] * alpha; the sum is at most 255 * 128
+        vtmp = vec_mule(vin2, valpha);
+        vtmp2 = vec_mulo(vin2, valpha);
+        vd_l = vec_add(vd_l, vec_mergeh(vtmp, vtmp2));
+        vd_r = vec_add(vd_r, vec_mergel(vtmp, vtmp2));
+
+        vec_st((vector int16_t) vd_l, 0, &dst[i]);
+        vec_st((vector int16_t) vd_r, 0, &dst[i + 8]);
+
+        xpos += xInc * 16;
+    }
+    // Outputs that map at or past the last source sample replicate it
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+        dst[i] = src[srcW-1]*128;
+}
+
+/*
+ * Blend 16 samples of one plane: in[xx] * (alpha ^ 127) + in[xx + 1] * alpha.
+ * Expands inside hcscale_fast_vsx() and uses its locals (xx, i, vperm, ...).
+ */
+#define HCSCALE(in, out) \
+    vin = vec_vsx_ld(0, &in[xx]); \
+    vin = vec_perm(vin, vin, vperm); \
+\
+    vin2 = vec_vsx_ld(1, &in[xx]); \
+    vin2 = vec_perm(vin2, vin2, vperm); \
+\
+    vtmp = vec_mule(vin, valphaxor); \
+    vtmp2 = vec_mulo(vin, valphaxor); \
+    vtmp3 = vec_mergeh(vtmp, vtmp2); \
+    vtmp4 = vec_mergel(vtmp, vtmp2); \
+\
+    vtmp = vec_mule(vin2, valpha); \
+    vtmp2 = vec_mulo(vin2, valpha); \
+    vd_l = vec_mergeh(vtmp, vtmp2); \
+    vd_r = vec_mergel(vtmp, vtmp2); \
+\
+    vd_l = vec_add(vd_l, vtmp3); \
+    vd_r = vec_add(vd_r, vtmp4); \
+\
+    vec_st((vector int16_t) vd_l, 0, &out[i]); \
+    vec_st((vector int16_t) vd_r, 0, &out[i + 8])
+
+/**
+ * Fast bilinear horizontal chroma scaler for two planes, 16 output samples
+ * per plane and iteration.
+ *
+ * Coordinates are derived as in hyscale_fast_vsx(); each output is
+ *     src[xx] * (alpha ^ 127) + src[xx + 1] * alpha
+ * with xx = xpos >> 16 and alpha = (xpos & 0xffff) >> 9.
+ *
+ * @param c        scaler context, unused here
+ * @param dst1     output samples for src1
+ * @param dst2     output samples for src2
+ * @param dstWidth number of output samples per plane
+ * @param src1     first 8-bit input plane
+ * @param src2     second 8-bit input plane
+ * @param srcW     number of input samples per plane
+ * @param xInc     16.16 fixed-point source step per output sample
+ */
+static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
+                             int dstWidth, const uint8_t *src1,
+                             const uint8_t *src2, int srcW, int xInc)
+{
+    int i;
+    unsigned int xpos = 0, xx;
+    vector uint8_t vin, vin2, vperm;
+    vector uint8_t valpha, valphaxor;
+    vector uint16_t vtmp, vtmp2, vtmp3, vtmp4;
+    vector uint16_t vd_l, vd_r, vcoord16[2];
+    vector uint32_t vcoord[4];
+    const vector uint8_t vxor = vec_splats((uint8_t) 127);
+    const vector uint32_t vadd = (vector uint32_t) {
+        0,
+        xInc * 1,
+        xInc * 2,
+        xInc * 3,
+    };
+    const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
+        0,
+        xInc * 1,
+        xInc * 2,
+        xInc * 3,
+        xInc * 4,
+        xInc * 5,
+        xInc * 6,
+        xInc * 7,
+    };
+    const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
+    const vector uint16_t vshift9 = vec_splat_u16(9);
+
+    for (i = 0; i < dstWidth; i += 16) {
+        vcoord16[0] = vec_splats((uint16_t) xpos);
+        vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
+
+        vcoord16[0] = vec_add(vcoord16[0], vadd16);
+        vcoord16[1] = vec_add(vcoord16[1], vadd16);
+
+        vcoord16[0] = vec_sr(vcoord16[0], vshift9);
+        vcoord16[1] = vec_sr(vcoord16[1], vshift9);
+        valpha = vec_pack(vcoord16[0], vcoord16[1]);
+        valphaxor = vec_xor(valpha, vxor);
+
+        xx = xpos >> 16;
+
+        vcoord[0] = vec_splats(xpos & 0xffff);
+        vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
+        vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
+        vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
+
+        vcoord[0] = vec_add(vcoord[0], vadd);
+        vcoord[1] = vec_add(vcoord[1], vadd);
+        vcoord[2] = vec_add(vcoord[2], vadd);
+        vcoord[3] = vec_add(vcoord[3], vadd);
+
+        vcoord[0] = vec_sr(vcoord[0], vshift16);
+        vcoord[1] = vec_sr(vcoord[1], vshift16);
+        vcoord[2] = vec_sr(vcoord[2], vshift16);
+        vcoord[3] = vec_sr(vcoord[3], vshift16);
+
+        vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
+        vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
+        vperm = vec_pack(vcoord16[0], vcoord16[1]);
+
+        HCSCALE(src1, dst1);
+        HCSCALE(src2, dst2);
+
+        xpos += xInc * 16;
+    }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+        dst1[i] = src1[srcW-1]*128;
+        dst2[i] = src2[srcW-1]*128;
+    }
+}
+
+#undef HCSCALE
+
 #endif /* !HAVE_BIGENDIAN */
 
 #endif /* HAVE_VSX */
@@ -1677,6 +1918,12 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 #if !HAVE_BIGENDIAN
     if (c->srcBpc == 8 && c->dstBpc <= 14) {
         c->hyScale = c->hcScale = hScale_real_vsx;
+        // The fast paths gather 16 outputs from a single 16-byte load, which
+        // only covers the needed source samples when not downscaling.
+        if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
+            c->hyscale_fast = hyscale_fast_vsx;
+            c->hcscale_fast = hcscale_fast_vsx;
+        }
     }
     if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
         dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
-- 
cgit v1.2.3