From 2170a0e6add6bb0a6fbdf689b82361c21d9b72be Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 8 Jan 2012 16:37:43 -0800 Subject: swscale: convert yuy2/uyvy/nv12/nv21ToY/UV from inline asm to yasm. Also implement SSE2/AVX variants. --- libswscale/x86/input.asm | 242 ++++++++++++++++++++++++++++++++++++++ libswscale/x86/swscale_mmx.c | 83 +++++++++++++ libswscale/x86/swscale_template.c | 163 ------------------------- 3 files changed, 325 insertions(+), 163 deletions(-) create mode 100644 libswscale/x86/input.asm (limited to 'libswscale/x86') diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm new file mode 100644 index 0000000000..4bdb575765 --- /dev/null +++ b/libswscale/x86/input.asm @@ -0,0 +1,242 @@ +;****************************************************************************** +;* x86-optimized input routines; does shuffling of packed +;* YUV formats into individual planes, and converts RGB +;* into YUV planes also. +;* Copyright (c) 2012 Ronald S. Bultje +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +SECTION .text + +;----------------------------------------------------------------------------- +; YUYV/UYVY/NV12/NV21 packed pixel shuffling. +; +; void ToY_(uint8_t *dst, const uint8_t *src, int w); +; and +; void toUV_(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, +; const uint8_t *unused, int w); +;----------------------------------------------------------------------------- + +; %1 = a (aligned) or u (unaligned) +; %2 = yuyv or uyvy +%macro LOOP_YUYV_TO_Y 2 +.loop_%1: + mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... } + mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } +%ifidn %2, yuyv + pand m0, m2 ; (word) { Y0, Y1, ..., Y7 } + pand m1, m2 ; (word) { Y8, Y9, ..., Y15 } +%else ; uyvy + psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 } + psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 } +%endif ; yuyv/uyvy + packuswb m0, m1 ; (byte) { Y0, ..., Y15 } + mova [dstq+wq], m0 + add wq, mmsize + jl .loop_%1 + REP_RET +%endmacro + +; %1 = nr. of XMM registers +; %2 = yuyv or uyvy +; %3 = if specified, it means that unaligned and aligned code in loop +; will be the same (i.e. YUYV+AVX), and thus we don't need to +; split the loop in an aligned and unaligned case +%macro YUYV_TO_Y_FN 2-3 +cglobal %2ToY, 3, 3, %1, dst, src, w +%ifdef ARCH_X86_64 + movsxd wq, wd +%endif + add dstq, wq +%if mmsize == 16 + test srcq, 15 +%endif + lea srcq, [srcq+wq*2] +%ifidn %2, yuyv + pcmpeqb m2, m2 ; (byte) { 0xff } x 16 + psrlw m2, 8 ; (word) { 0x00ff } x 8 +%endif ; yuyv +%if mmsize == 16 + jnz .loop_u_start + neg wq + LOOP_YUYV_TO_Y a, %2 +.loop_u_start: + neg wq + LOOP_YUYV_TO_Y u, %2 +%else ; mmsize == 8 + neg wq + LOOP_YUYV_TO_Y a, %2 +%endif ; mmsize == 8/16 +%endmacro + +; %1 = a (aligned) or u (unaligned) +; %2 = yuyv or uyvy +%macro LOOP_YUYV_TO_UV 2 +.loop_%1: +%ifidn %2, yuyv + mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } + mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } + psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 } + psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 } +%else ; uyvy +%if cpuflag(avx) + vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 } + vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 } +%else + mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... } + mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... } + pand m0, m2 ; (word) { U0, V0, ..., U3, V3 } + pand m1, m2 ; (word) { U4, V4, ..., U7, V7 } +%endif +%endif ; yuyv/uyvy + packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 } + pand m1, m0, m2 ; (word) { U0, U1, ..., U7 } + psrlw m0, 8 ; (word) { V0, V1, ..., V7 } +%if mmsize == 16 + packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 } + movh [dstUq+wq], m1 + movhps [dstVq+wq], m1 +%else ; mmsize == 8 + packuswb m1, m1 ; (byte) { U0, ... U3 } + packuswb m0, m0 ; (byte) { V0, ... V3 } + movh [dstUq+wq], m1 + movh [dstVq+wq], m0 +%endif ; mmsize == 8/16 + add wq, mmsize / 2 + jl .loop_%1 + REP_RET +%endmacro + +; %1 = nr. of XMM registers +; %2 = yuyv or uyvy +; %3 = if specified, it means that unaligned and aligned code in loop +; will be the same (i.e. UYVY+AVX), and thus we don't need to +; split the loop in an aligned and unaligned case +%macro YUYV_TO_UV_FN 2-3 +cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +%ifdef ARCH_X86_64 + movsxd wq, r4m +%else ; x86-32 + mov wq, r4m +%endif + add dstUq, wq + add dstVq, wq +%if mmsize == 16 && %0 == 2 + test srcq, 15 +%endif + lea srcq, [srcq+wq*4] + pcmpeqb m2, m2 ; (byte) { 0xff } x 16 + psrlw m2, 8 ; (word) { 0x00ff } x 8 + ; NOTE: if uyvy+avx, u/a are identical +%if mmsize == 16 && %0 == 2 + jnz .loop_u_start + neg wq + LOOP_YUYV_TO_UV a, %2 +.loop_u_start: + neg wq + LOOP_YUYV_TO_UV u, %2 +%else ; mmsize == 8 + neg wq + LOOP_YUYV_TO_UV a, %2 +%endif ; mmsize == 8/16 +%endmacro + +; %1 = a (aligned) or u (unaligned) +; %2 = nv12 or nv21 +%macro LOOP_NVXX_TO_UV 2 +.loop_%1: + mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } + mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... } + pand m2, m0, m4 ; (word) { U0, U1, ..., U7 } + pand m3, m1, m4 ; (word) { U8, U9, ..., U15 } + psrlw m0, 8 ; (word) { V0, V1, ..., V7 } + psrlw m1, 8 ; (word) { V8, V9, ..., V15 } + packuswb m2, m3 ; (byte) { U0, ..., U15 } + packuswb m0, m1 ; (byte) { V0, ..., V15 } +%ifidn %2, nv12 + mova [dstUq+wq], m2 + mova [dstVq+wq], m0 +%else ; nv21 + mova [dstVq+wq], m2 + mova [dstUq+wq], m0 +%endif ; nv12/21 + add wq, mmsize + jl .loop_%1 + REP_RET +%endmacro + +; %1 = nr. of XMM registers +; %2 = nv12 or nv21 +%macro NVXX_TO_UV_FN 2 +cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +%ifdef ARCH_X86_64 + movsxd wq, r4m +%else ; x86-32 + mov wq, r4m +%endif + add dstUq, wq + add dstVq, wq +%if mmsize == 16 + test srcq, 15 +%endif + lea srcq, [srcq+wq*2] + pcmpeqb m4, m4 ; (byte) { 0xff } x 16 + psrlw m4, 8 ; (word) { 0x00ff } x 8 +%if mmsize == 16 + jnz .loop_u_start + neg wq + LOOP_NVXX_TO_UV a, %2 +.loop_u_start: + neg wq + LOOP_NVXX_TO_UV u, %2 +%else ; mmsize == 8 + neg wq + LOOP_NVXX_TO_UV a, %2 +%endif ; mmsize == 8/16 +%endmacro + +%ifdef ARCH_X86_32 +INIT_MMX mmx +YUYV_TO_Y_FN 0, yuyv +YUYV_TO_Y_FN 0, uyvy +YUYV_TO_UV_FN 0, yuyv +YUYV_TO_UV_FN 0, uyvy +NVXX_TO_UV_FN 0, nv12 +NVXX_TO_UV_FN 0, nv21 +%endif + +INIT_XMM sse2 +YUYV_TO_Y_FN 3, yuyv +YUYV_TO_Y_FN 2, uyvy +YUYV_TO_UV_FN 3, yuyv +YUYV_TO_UV_FN 3, uyvy +NVXX_TO_UV_FN 5, nv12 +NVXX_TO_UV_FN 5, nv21 + +INIT_XMM avx +; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but +; that's not faster in practice +YUYV_TO_UV_FN 3, yuyv +YUYV_TO_UV_FN 3, uyvy, 1 +NVXX_TO_UV_FN 5, nv12 +NVXX_TO_UV_FN 5, nv21 diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c index 4305cef41d..867a9f1244 100644 --- a/libswscale/x86/swscale_mmx.c +++ b/libswscale/x86/swscale_mmx.c @@ -244,6 +244,26 @@ VSCALE_FUNCS(sse2, sse2); VSCALE_FUNC(16, sse4); VSCALE_FUNCS(avx, avx); +#define INPUT_UV_FUNC(fmt, opt) \ +extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ + const uint8_t *src, const uint8_t *unused1, \ + int w, uint32_t *unused2) +#define INPUT_FUNC(fmt, opt) \ +extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \ + int w, uint32_t *unused); \ + INPUT_UV_FUNC(fmt, opt) +#define INPUT_FUNCS(opt) \ + INPUT_FUNC(uyvy, opt); \ + INPUT_FUNC(yuyv, opt); \ + INPUT_UV_FUNC(nv12, opt); \ + INPUT_UV_FUNC(nv21, opt) + +#if ARCH_X86_32 +INPUT_FUNCS(mmx); +#endif +INPUT_FUNCS(sse2); +INPUT_FUNCS(avx); + void ff_sws_init_swScale_mmx(SwsContext *c) { int cpu_flags = av_get_cpu_flags(); @@ -296,6 +316,30 @@ switch(c->dstBpc){ \ ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2); + + switch (c->srcFormat) { + case PIX_FMT_Y400A: + c->lumToYV12 = ff_yuyvToY_mmx; + if (c->alpPixBuf) + c->alpToYV12 = ff_uyvyToY_mmx; + break; + case PIX_FMT_YUYV422: + c->lumToYV12 = ff_yuyvToY_mmx; + c->chrToYV12 = ff_yuyvToUV_mmx; + break; + case PIX_FMT_UYVY422: + c->lumToYV12 = ff_uyvyToY_mmx; + c->chrToYV12 = ff_uyvyToUV_mmx; + break; + case PIX_FMT_NV12: + c->chrToYV12 = ff_nv12ToUV_mmx; + break; + case PIX_FMT_NV21: + c->chrToYV12 = ff_nv21ToUV_mmx; + break; + default: + break; + } } if (cpu_flags & AV_CPU_FLAG_MMX2) { ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2,); @@ -314,6 +358,28 @@ switch(c->dstBpc){ \ ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2,); ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1); + + switch (c->srcFormat) { + case PIX_FMT_Y400A: + c->lumToYV12 = ff_yuyvToY_sse2; + if (c->alpPixBuf) + c->alpToYV12 = ff_uyvyToY_sse2; + break; + case PIX_FMT_YUYV422: + c->lumToYV12 = ff_yuyvToY_sse2; + c->chrToYV12 = ff_yuyvToUV_sse2; + break; + case PIX_FMT_UYVY422: + c->lumToYV12 = ff_uyvyToY_sse2; + c->chrToYV12 = ff_uyvyToUV_sse2; + break; + case PIX_FMT_NV12: + c->chrToYV12 = ff_nv12ToUV_sse2; + break; + case PIX_FMT_NV21: + c->chrToYV12 = ff_nv21ToUV_sse2; + break; + } } if (cpu_flags & AV_CPU_FLAG_SSSE3) { ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3); @@ -332,6 +398,23 @@ switch(c->dstBpc){ \ if (cpu_flags & AV_CPU_FLAG_AVX) { ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx,); ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1); + + switch (c->srcFormat) { + case PIX_FMT_YUYV422: + c->chrToYV12 = ff_yuyvToUV_avx; + break; + case PIX_FMT_UYVY422: + c->chrToYV12 = ff_uyvyToUV_avx; + break; + case PIX_FMT_NV12: + c->chrToYV12 = ff_nv12ToUV_avx; + break; + case PIX_FMT_NV21: + c->chrToYV12 = ff_nv21ToUV_avx; + break; + default: + break; + } } #endif } diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 5e7df5c4a0..b3d7336588 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1361,147 +1361,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, } } -#if !COMPILE_TEMPLATE_MMX2 -//FIXME yuy2* can read up to 7 samples too much - -static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, - int width, uint32_t *unused) -{ - __asm__ volatile( - "movq "MANGLE(bm01010101)", %%mm2 \n\t" - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",2), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" - "pand %%mm2, %%mm0 \n\t" - "pand %%mm2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) - : "%"REG_a - ); -} - -static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, - int width, uint32_t *unused) -{ - __asm__ volatile( - "movq "MANGLE(bm01010101)", %%mm4 \n\t" - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",4), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "pand %%mm4, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "movd %%mm0, (%3, %%"REG_a") \n\t" - "movd %%mm1, (%2, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) - : "%"REG_a - ); - assert(src1 == src2); -} - -/* This is almost identical to the previous, end exists only because - * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ -static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, - int width, uint32_t *unused) -{ - __asm__ volatile( - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",2), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) - : "%"REG_a - ); -} - -static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, - int width, uint32_t *unused) -{ - __asm__ volatile( - "movq "MANGLE(bm01010101)", %%mm4 \n\t" - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",4), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" - "pand %%mm4, %%mm0 \n\t" - "pand %%mm4, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "pand %%mm4, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "movd %%mm0, (%3, %%"REG_a") \n\t" - "movd %%mm1, (%2, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) - : "%"REG_a - ); - assert(src1 == src2); -} - -static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2, - const uint8_t *src, int width) -{ - __asm__ volatile( - "movq "MANGLE(bm01010101)", %%mm4 \n\t" - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",2), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - "pand %%mm4, %%mm0 \n\t" - "pand %%mm4, %%mm1 \n\t" - "psrlw $8, %%mm2 \n\t" - "psrlw $8, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "movq %%mm2, (%3, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width) - : "%"REG_a - ); -} - -static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, - int width, uint32_t *unused) -{ - RENAME(nvXXtoUV)(dstU, dstV, src1, width); -} - -static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src1, const uint8_t *src2, - int width, uint32_t *unused) -{ - RENAME(nvXXtoUV)(dstV, dstU, src1, width); -} -#endif /* !COMPILE_TEMPLATE_MMX2 */ - static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, int width, enum PixelFormat srcFormat) { @@ -1856,15 +1715,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) #endif /* COMPILE_TEMPLATE_MMX2 */ } -#if !COMPILE_TEMPLATE_MMX2 - switch(srcFormat) { - case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break; - case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break; - case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break; - case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break; - default: break; - } -#endif /* !COMPILE_TEMPLATE_MMX2 */ if (!c->chrSrcHSubSample) { switch(srcFormat) { case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break; @@ -1874,21 +1724,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } switch (srcFormat) { -#if !COMPILE_TEMPLATE_MMX2 - case PIX_FMT_YUYV422 : - case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break; - case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break; -#endif /* !COMPILE_TEMPLATE_MMX2 */ case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break; case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break; default: break; } -#if !COMPILE_TEMPLATE_MMX2 - if (c->alpPixBuf) { - switch (srcFormat) { - case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break; - default: break; - } - } -#endif /* !COMPILE_TEMPLATE_MMX2 */ } -- cgit v1.2.3