summaryrefslogtreecommitdiff
path: root/libswscale/x86/swscale.c
diff options
context:
space:
mode:
authorMark Reid <mindmark@gmail.com>2021-11-24 13:15:20 -0800
committerJames Almer <jamrial@gmail.com>2022-01-11 16:33:17 -0300
commit9e445a5be2dca30a1f1103c73440648ccf5af9b1 (patch)
tree400817a50debefa499ca5b7445c4596dc0724444 /libswscale/x86/swscale.c
parent4b053b8db18eb0610b62a2c40d0327ae53f2387c (diff)
swscale/x86/output.asm: add x86-optimized planar gbr yuv2anyX functions
changes since v2: * fixed label changes since v1: * remove vex instruction on sse4 path * some load/pack macros use fewer instructions * fixed some typos yuv2gbrp_full_X_4_512_c: 12757.6 yuv2gbrp_full_X_4_512_sse2: 8946.6 yuv2gbrp_full_X_4_512_sse4: 5138.6 yuv2gbrp_full_X_4_512_avx2: 3889.6 yuv2gbrap_full_X_4_512_c: 15368.6 yuv2gbrap_full_X_4_512_sse2: 11916.1 yuv2gbrap_full_X_4_512_sse4: 6294.6 yuv2gbrap_full_X_4_512_avx2: 3477.1 yuv2gbrp9be_full_X_4_512_c: 14381.6 yuv2gbrp9be_full_X_4_512_sse2: 9139.1 yuv2gbrp9be_full_X_4_512_sse4: 5150.1 yuv2gbrp9be_full_X_4_512_avx2: 2834.6 yuv2gbrp9le_full_X_4_512_c: 12990.1 yuv2gbrp9le_full_X_4_512_sse2: 9118.1 yuv2gbrp9le_full_X_4_512_sse4: 5132.1 yuv2gbrp9le_full_X_4_512_avx2: 2833.1 yuv2gbrp10be_full_X_4_512_c: 14401.6 yuv2gbrp10be_full_X_4_512_sse2: 9133.1 yuv2gbrp10be_full_X_4_512_sse4: 5126.1 yuv2gbrp10be_full_X_4_512_avx2: 2837.6 yuv2gbrp10le_full_X_4_512_c: 12718.1 yuv2gbrp10le_full_X_4_512_sse2: 9106.1 yuv2gbrp10le_full_X_4_512_sse4: 5120.1 yuv2gbrp10le_full_X_4_512_avx2: 2826.1 yuv2gbrap10be_full_X_4_512_c: 18535.6 yuv2gbrap10be_full_X_4_512_sse2: 33617.6 yuv2gbrap10be_full_X_4_512_sse4: 6264.1 yuv2gbrap10be_full_X_4_512_avx2: 3422.1 yuv2gbrap10le_full_X_4_512_c: 16724.1 yuv2gbrap10le_full_X_4_512_sse2: 11787.1 yuv2gbrap10le_full_X_4_512_sse4: 6282.1 yuv2gbrap10le_full_X_4_512_avx2: 3441.6 yuv2gbrp12be_full_X_4_512_c: 13723.6 yuv2gbrp12be_full_X_4_512_sse2: 9128.1 yuv2gbrp12be_full_X_4_512_sse4: 7997.6 yuv2gbrp12be_full_X_4_512_avx2: 2844.1 yuv2gbrp12le_full_X_4_512_c: 12257.1 yuv2gbrp12le_full_X_4_512_sse2: 9107.6 yuv2gbrp12le_full_X_4_512_sse4: 5142.6 yuv2gbrp12le_full_X_4_512_avx2: 2837.6 yuv2gbrap12be_full_X_4_512_c: 18511.1 yuv2gbrap12be_full_X_4_512_sse2: 12156.6 yuv2gbrap12be_full_X_4_512_sse4: 6251.1 yuv2gbrap12be_full_X_4_512_avx2: 3444.6 yuv2gbrap12le_full_X_4_512_c: 16687.1 yuv2gbrap12le_full_X_4_512_sse2: 11785.1 yuv2gbrap12le_full_X_4_512_sse4: 6243.6 yuv2gbrap12le_full_X_4_512_avx2: 3446.1 
yuv2gbrp14be_full_X_4_512_c: 13690.6 yuv2gbrp14be_full_X_4_512_sse2: 9120.6 yuv2gbrp14be_full_X_4_512_sse4: 5138.1 yuv2gbrp14be_full_X_4_512_avx2: 2843.1 yuv2gbrp14le_full_X_4_512_c: 14995.6 yuv2gbrp14le_full_X_4_512_sse2: 9119.1 yuv2gbrp14le_full_X_4_512_sse4: 5126.1 yuv2gbrp14le_full_X_4_512_avx2: 2843.1 yuv2gbrp16be_full_X_4_512_c: 12367.1 yuv2gbrp16be_full_X_4_512_sse2: 8233.6 yuv2gbrp16be_full_X_4_512_sse4: 4820.1 yuv2gbrp16be_full_X_4_512_avx2: 2666.6 yuv2gbrp16le_full_X_4_512_c: 10904.1 yuv2gbrp16le_full_X_4_512_sse2: 8214.1 yuv2gbrp16le_full_X_4_512_sse4: 4824.1 yuv2gbrp16le_full_X_4_512_avx2: 2629.1 yuv2gbrap16be_full_X_4_512_c: 26569.6 yuv2gbrap16be_full_X_4_512_sse2: 10884.1 yuv2gbrap16be_full_X_4_512_sse4: 5488.1 yuv2gbrap16be_full_X_4_512_avx2: 3272.1 yuv2gbrap16le_full_X_4_512_c: 14010.1 yuv2gbrap16le_full_X_4_512_sse2: 10562.1 yuv2gbrap16le_full_X_4_512_sse4: 5463.6 yuv2gbrap16le_full_X_4_512_avx2: 3255.1 yuv2gbrpf32be_full_X_4_512_c: 14524.1 yuv2gbrpf32be_full_X_4_512_sse2: 8552.6 yuv2gbrpf32be_full_X_4_512_sse4: 4636.1 yuv2gbrpf32be_full_X_4_512_avx2: 2474.6 yuv2gbrpf32le_full_X_4_512_c: 13060.6 yuv2gbrpf32le_full_X_4_512_sse2: 9682.6 yuv2gbrpf32le_full_X_4_512_sse4: 4298.1 yuv2gbrpf32le_full_X_4_512_avx2: 2453.1 yuv2gbrapf32be_full_X_4_512_c: 18629.6 yuv2gbrapf32be_full_X_4_512_sse2: 11363.1 yuv2gbrapf32be_full_X_4_512_sse4: 15201.6 yuv2gbrapf32be_full_X_4_512_avx2: 3727.1 yuv2gbrapf32le_full_X_4_512_c: 16677.6 yuv2gbrapf32le_full_X_4_512_sse2: 10221.6 yuv2gbrapf32le_full_X_4_512_sse4: 5693.6 yuv2gbrapf32le_full_X_4_512_avx2: 3656.6 Reviewed-by: Paul B Mahol <onemda@gmail.com> Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libswscale/x86/swscale.c')
-rw-r--r--libswscale/x86/swscale.c98
1 files changed, 98 insertions, 0 deletions
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index fdc93866a6..d5a467da0e 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -353,6 +353,43 @@ void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dith
YUV2NV_DECL(nv12, avx2);
YUV2NV_DECL(nv21, avx2);
+/* Expands to the prototype (no body) of ff_yuv2<fmt>_full_X_<opt>; the commit subject places the implementations in x86/output.asm. */
+#define YUV2GBRP_FN_DECL(fmt, opt) \
+void ff_yuv2##fmt##_full_X_ ##opt(SwsContext *c, const int16_t *lumFilter, \
+ const int16_t **lumSrcx, int lumFilterSize, \
+ const int16_t *chrFilter, const int16_t **chrUSrcx, \
+ const int16_t **chrVSrcx, int chrFilterSize, \
+ const int16_t **alpSrcx, uint8_t **dest, \
+ int dstW, int y)
+/* Expands to one YUV2GBRP_FN_DECL prototype per gbrp/gbrap format variant listed below, all sharing the instruction-set suffix `opt`. */
+#define YUV2GBRP_DECL(opt) \
+YUV2GBRP_FN_DECL(gbrp, opt); \
+YUV2GBRP_FN_DECL(gbrap, opt); \
+YUV2GBRP_FN_DECL(gbrp9le, opt); \
+YUV2GBRP_FN_DECL(gbrp10le, opt); \
+YUV2GBRP_FN_DECL(gbrap10le, opt); \
+YUV2GBRP_FN_DECL(gbrp12le, opt); \
+YUV2GBRP_FN_DECL(gbrap12le, opt); \
+YUV2GBRP_FN_DECL(gbrp14le, opt); \
+YUV2GBRP_FN_DECL(gbrp16le, opt); \
+YUV2GBRP_FN_DECL(gbrap16le, opt); \
+YUV2GBRP_FN_DECL(gbrpf32le, opt); \
+YUV2GBRP_FN_DECL(gbrapf32le, opt); \
+YUV2GBRP_FN_DECL(gbrp9be, opt); \
+YUV2GBRP_FN_DECL(gbrp10be, opt); \
+YUV2GBRP_FN_DECL(gbrap10be, opt); \
+YUV2GBRP_FN_DECL(gbrp12be, opt); \
+YUV2GBRP_FN_DECL(gbrap12be, opt); \
+YUV2GBRP_FN_DECL(gbrp14be, opt); \
+YUV2GBRP_FN_DECL(gbrp16be, opt); \
+YUV2GBRP_FN_DECL(gbrpf32be, opt); \
+YUV2GBRP_FN_DECL(gbrapf32be, opt); /* NOTE(review): this trailing ';' plus the ';' written at each use leaves a stray ';' at file scope - confirm it is intended */
+/* Emit the full set of prototypes once for each of the sse2, sse4 and avx2 suffixes. */
+YUV2GBRP_DECL(sse2);
+YUV2GBRP_DECL(sse4);
+YUV2GBRP_DECL(avx2);
+
#endif
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
@@ -601,5 +638,66 @@ switch(c->dstBpc){ \
break;
}
}
+
+ if(c->flags & SWS_FULL_CHR_H_INT) {
+
+ /* yuv2gbrp uses the SwsContext for yuv coefficients;
+ if struct offsets change, the asm needs to be updated too */
+ av_assert0(offsetof(SwsContext, yuv2rgb_y_offset) == 40292);
+/* Expands to a switch case for `fmt` that stores ff_yuv2<name>_full_X_<opt> in c->yuv2anyX and then breaks. */
+#define YUV2ANYX_FUNC_CASE(fmt, name, opt) \
+ case fmt: \
+ c->yuv2anyX = ff_yuv2##name##_full_X_##opt; \
+ break;
+/* One YUV2ANYX_FUNC_CASE per format variant for suffix `opt`; despite the GBRAP name, the list also covers the GBRP (no "A") formats. */
+#define YUV2ANYX_GBRAP_CASES(opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP, gbrp, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP, gbrap, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP9LE, gbrp9le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP10LE, gbrp10le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP10LE, gbrap10le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP12LE, gbrp12le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP12LE, gbrap12le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP14LE, gbrp14le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP16LE, gbrp16le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP16LE, gbrap16le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRPF32LE, gbrpf32le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAPF32LE, gbrapf32le, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP9BE, gbrp9be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP10BE, gbrp10be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP10BE, gbrap10be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP12BE, gbrp12be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP12BE, gbrap12be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP14BE, gbrp14be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRP16BE, gbrp16be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAP16BE, gbrap16be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRPF32BE, gbrpf32be, opt) \
+ YUV2ANYX_FUNC_CASE(AV_PIX_FMT_GBRAPF32BE, gbrapf32be, opt)
+/* The checks run in order sse2, sse4, avx2; each one that passes overwrites c->yuv2anyX for a listed dstFormat, so the last passing check wins. */
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ switch (c->dstFormat) {
+ YUV2ANYX_GBRAP_CASES(sse2)
+ default:
+ break;
+ }
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags)) {
+ switch (c->dstFormat) {
+ YUV2ANYX_GBRAP_CASES(sse4)
+ default:
+ break;
+ }
+ }
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ switch (c->dstFormat) {
+ YUV2ANYX_GBRAP_CASES(avx2)
+ default:
+ break;
+ }
+ }
+ }
+
#endif
}