diff options
Diffstat (limited to 'libswscale/x86')
-rw-r--r-- | libswscale/x86/Makefile | 2 | ||||
-rw-r--r-- | libswscale/x86/input.asm | 206 | ||||
-rw-r--r-- | libswscale/x86/output.asm | 12 | ||||
-rw-r--r-- | libswscale/x86/rgb2rgb.c | 13 | ||||
-rw-r--r-- | libswscale/x86/rgb2rgb_template.c | 162 | ||||
-rw-r--r-- | libswscale/x86/scale.asm | 12 | ||||
-rw-r--r-- | libswscale/x86/swscale.c | 137 | ||||
-rw-r--r-- | libswscale/x86/swscale_template.c | 130 | ||||
-rw-r--r-- | libswscale/x86/w64xmmtest.c | 8 | ||||
-rw-r--r-- | libswscale/x86/yuv2rgb.c | 13 | ||||
-rw-r--r-- | libswscale/x86/yuv2rgb_template.c | 35 |
11 files changed, 465 insertions, 265 deletions
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index b94b14abbb..e767a5c420 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -1,3 +1,5 @@ +$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) + OBJS += x86/rgb2rgb.o \ x86/swscale.o \ x86/yuv2rgb.o \ diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 6f5677e1fd..0c4f30e6e0 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -4,20 +4,20 @@ ;* into YUV planes also. ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -35,33 +35,59 @@ SECTION_RODATA %define GV 0xD0E3 %define BV 0xF6E4 -rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15 -rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15 -bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY -bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY -rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY -rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY -bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU -bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU -rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU -rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU -bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV -bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV -rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV -rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV - -rgba_Ycoeff_rb: times 4 dw RY, BY -rgba_Ycoeff_br: times 4 dw BY, RY -rgba_Ycoeff_ga: times 4 dw GY, 0 -rgba_Ycoeff_ag: times 4 dw 0, GY -rgba_Ucoeff_rb: times 4 dw RU, BU -rgba_Ucoeff_br: times 4 dw BU, RU -rgba_Ucoeff_ga: times 4 dw GU, 0 -rgba_Ucoeff_ag: times 4 dw 0, GU -rgba_Vcoeff_rb: times 4 dw RV, BV -rgba_Vcoeff_br: times 4 dw BV, RV -rgba_Vcoeff_ga: times 4 dw GV, 0 -rgba_Vcoeff_ag: times 4 dw 0, GV +rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15 +rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15 +%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq +%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq +%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq +%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq +%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq +%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq +%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq +%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq +%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq +%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq +%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq +%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq + +%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq +%define rgba_Ycoeff_br 16*4 + 16*13 + tableq +%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq +%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq +%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq +%define rgba_Ucoeff_br 16*4 + 16*17 + tableq +%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq +%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq +%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq +%define rgba_Vcoeff_br 16*4 + 16*21 + tableq +%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq +%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq + +; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY +; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY +; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY +; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY +; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU +; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU +; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU +; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU +; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV +; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV +; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV +; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV + +; rgba_Ycoeff_rb: times 4 dw RY, BY +; rgba_Ycoeff_br: times 4 dw BY, RY +; rgba_Ycoeff_ga: times 4 dw GY, 0 +; rgba_Ycoeff_ag: times 4 dw 0, GY +; rgba_Ucoeff_rb: times 4 dw RU, BU +; rgba_Ucoeff_br: times 4 dw BU, RU +; rgba_Ucoeff_ga: times 4 dw GU, 0 +; rgba_Ucoeff_ag: times 4 dw 0, GU +; rgba_Vcoeff_rb: times 4 dw RV, BV +; rgba_Vcoeff_br: times 4 dw BV, RV +; rgba_Vcoeff_ga: times 4 dw GV, 0 +; rgba_Vcoeff_ag: times 4 dw 0, GV shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 @@ -82,7 +108,7 @@ SECTION .text ; %1 = nr. of XMM registers ; %2 = rgb or bgr %macro RGB24_TO_Y_FN 2-3 -cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w +cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table %if mmsize == 8 mova m5, [%2_Ycoeff_12x4] mova m6, [%2_Ycoeff_3x56] @@ -114,6 +140,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w %if ARCH_X86_64 movsxd wq, wd %endif + add wq, wq add dstq, wq neg wq %if notcpuflag(ssse3) @@ -157,12 +184,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7] paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] } paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] } - psrad m0, 15 - psrad m2, 15 + psrad m0, 9 + psrad m2, 9 packssdw m0, m2 ; (word) { Y[0-7] } - packuswb m0, m0 ; (byte) { Y[0-7] } - movh [dstq+wq], m0 - add wq, mmsize / 2 + mova [dstq+wq], m0 + add wq, mmsize jl .loop REP_RET %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 @@ -171,7 +197,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w ; %1 = nr. of XMM registers ; %2 = rgb or bgr %macro RGB24_TO_UV_FN 2-3 -cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table %if ARCH_X86_64 mova m8, [%2_Ucoeff_12x4] mova m9, [%2_Ucoeff_3x56] @@ -202,10 +228,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w %endif ; x86-32/64 %endif ; cpuflag(ssse3) %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif + add wq, wq add dstUq, wq add dstVq, wq neg wq @@ -263,23 +290,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] } paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] } paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] } - psrad m0, 15 - psrad m2, 15 - psrad m1, 15 - psrad m4, 15 + psrad m0, 9 + psrad m2, 9 + psrad m1, 9 + psrad m4, 9 packssdw m0, m1 ; (word) { U[0-7] } packssdw m2, m4 ; (word) { V[0-7] } %if mmsize == 8 - packuswb m0, m0 ; (byte) { U[0-3] } - packuswb m2, m2 ; (byte) { V[0-3] } - movh [dstUq+wq], m0 - movh [dstVq+wq], m2 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %else ; mmsize == 16 - packuswb m0, m2 ; (byte) { U[0-7], V[0-7] } - movh [dstUq+wq], m0 - movhps [dstVq+wq], m0 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %endif ; mmsize == 8/16 - add wq, mmsize / 2 + add wq, mmsize jl .loop REP_RET %endif ; ARCH_X86_64 && %0 == 3 @@ -305,13 +329,15 @@ RGB24_FUNCS 10, 12 INIT_XMM ssse3 RGB24_FUNCS 11, 13 +%if HAVE_AVX_EXTERNAL INIT_XMM avx RGB24_FUNCS 11, 13 +%endif ; %1 = nr. of XMM registers ; %2-5 = rgba, bgra, argb or abgr (in individual characters) %macro RGB32_TO_Y_FN 5-6 -cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w +cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table mova m5, [rgba_Ycoeff_%2%4] mova m6, [rgba_Ycoeff_%3%5] %if %0 == 6 @@ -322,6 +348,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w movsxd wq, wd %endif lea srcq, [srcq+wq*4] + add wq, wq add dstq, wq neg wq mova m4, [rgb_Yrnd] @@ -329,8 +356,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w psrlw m7, 8 ; (word) { 0x00ff } x4 .loop: ; FIXME check alignment and use mova - movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] @@ -340,12 +367,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w paddd m2, m4 ; += rgb_Yrnd paddd m0, m1 ; (dword) { Y[0-3] } paddd m2, m3 ; (dword) { Y[4-7] } - psrad m0, 15 - psrad m2, 15 + psrad m0, 9 + psrad m2, 9 packssdw m0, m2 ; (word) { Y[0-7] } - packuswb m0, m0 ; (byte) { Y[0-7] } - movh [dstq+wq], m0 - add wq, mmsize / 2 + mova [dstq+wq], m0 + add wq, mmsize jl .loop REP_RET %endif ; %0 == 3 @@ -354,7 +380,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w ; %1 = nr. of XMM registers ; %2-5 = rgba, bgra, argb or abgr (in individual characters) %macro RGB32_TO_UV_FN 5-6 -cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table %if ARCH_X86_64 mova m8, [rgba_Ucoeff_%2%4] mova m9, [rgba_Ucoeff_%3%5] @@ -375,21 +401,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w %else ; ARCH_X86_64 && %0 == 6 .body: %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif + add wq, wq add dstUq, wq add dstVq, wq - lea srcq, [srcq+wq*4] + lea srcq, [srcq+wq*2] neg wq pcmpeqb m7, m7 psrlw m7, 8 ; (word) { 0x00ff } x4 mova m6, [rgb_UVrnd] .loop: ; FIXME check alignment and use mova - movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] @@ -405,25 +432,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] paddd m3, m6 ; += rgb_UVrnd paddd m5, m6 ; += rgb_UVrnd - psrad m0, 15 + psrad m0, 9 paddd m1, m3 ; (dword) { V[4-7] } paddd m4, m5 ; (dword) { U[4-7] } - psrad m2, 15 - psrad m4, 15 - psrad m1, 15 + psrad m2, 9 + psrad m4, 9 + psrad m1, 9 packssdw m0, m4 ; (word) { U[0-7] } packssdw m2, m1 ; (word) { V[0-7] } %if mmsize == 8 - packuswb m0, m0 ; (byte) { U[0-7] } - packuswb m2, m2 ; (byte) { V[0-7] } - movh [dstUq+wq], m0 - movh [dstVq+wq], m2 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %else ; mmsize == 16 - packuswb m0, m2 ; (byte) { U[0-7], V[0-7] } - movh [dstUq+wq], m0 - movhps [dstVq+wq], m0 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %endif ; mmsize == 8/16 - add wq, mmsize / 2 + add wq, mmsize jl .loop REP_RET %endif ; ARCH_X86_64 && %0 == 3 @@ -451,8 +475,10 @@ RGB32_FUNCS 0, 0 INIT_XMM sse2 RGB32_FUNCS 8, 12 +%if HAVE_AVX_EXTERNAL INIT_XMM avx RGB32_FUNCS 8, 12 +%endif ;----------------------------------------------------------------------------- ; YUYV/UYVY/NV12/NV21 packed pixel shuffling. @@ -489,7 +515,7 @@ RGB32_FUNCS 8, 12 ; will be the same (i.e. YUYV+AVX), and thus we don't need to ; split the loop in an aligned and unaligned case %macro YUYV_TO_Y_FN 2-3 -cglobal %2ToY, 3, 3, %1, dst, src, w +cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w %if ARCH_X86_64 movsxd wq, wd %endif @@ -559,11 +585,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w ; will be the same (i.e. UYVY+AVX), and thus we don't need to ; split the loop in an aligned and unaligned case %macro YUYV_TO_UV_FN 2-3 -cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif add dstUq, wq add dstVq, wq @@ -593,8 +619,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w .loop_%1: mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... } - pand m2, m0, m4 ; (word) { U0, U1, ..., U7 } - pand m3, m1, m4 ; (word) { U8, U9, ..., U15 } + pand m2, m0, m5 ; (word) { U0, U1, ..., U7 } + pand m3, m1, m5 ; (word) { U8, U9, ..., U15 } psrlw m0, 8 ; (word) { V0, V1, ..., V7 } psrlw m1, 8 ; (word) { V8, V9, ..., V15 } packuswb m2, m3 ; (byte) { U0, ..., U15 } @@ -614,11 +640,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w ; %1 = nr. of XMM registers ; %2 = nv12 or nv21 %macro NVXX_TO_UV_FN 2 -cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif add dstUq, wq add dstVq, wq @@ -626,8 +652,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w test srcq, 15 %endif lea srcq, [srcq+wq*2] - pcmpeqb m4, m4 ; (byte) { 0xff } x 16 - psrlw m4, 8 ; (word) { 0x00ff } x 8 + pcmpeqb m5, m5 ; (byte) { 0xff } x 16 + psrlw m5, 8 ; (word) { 0x00ff } x 8 %if mmsize == 16 jnz .loop_u_start neg wq @@ -659,6 +685,7 @@ YUYV_TO_UV_FN 3, uyvy NVXX_TO_UV_FN 5, nv12 NVXX_TO_UV_FN 5, nv21 +%if HAVE_AVX_EXTERNAL INIT_XMM avx ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but ; that's not faster in practice @@ -666,3 +693,4 @@ YUYV_TO_UV_FN 3, yuyv YUYV_TO_UV_FN 3, uyvy, 1 NVXX_TO_UV_FN 5, nv12 NVXX_TO_UV_FN 5, nv21 +%endif diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index e1ceded756..9ea4af9535 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* Kieran Kunhya <kieran@kunhya.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -264,10 +264,12 @@ yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 yuv2planeX_fn 16, 8, 5 +%if HAVE_AVX_EXTERNAL INIT_XMM avx yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 +%endif ; %1=outout-bpc, %2=alignment (u/a) %macro yuv2plane1_mainloop 2 @@ -402,8 +404,10 @@ yuv2plane1_fn 16, 6, 3 INIT_XMM sse4 yuv2plane1_fn 16, 5, 3 +%if HAVE_AVX_EXTERNAL INIT_XMM avx yuv2plane1_fn 8, 5, 5 yuv2plane1_fn 9, 5, 3 yuv2plane1_fn 10, 5, 3 yuv2plane1_fn 16, 5, 3 +%endif diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 9cfe831e3c..8cc99c6c58 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -6,20 +6,20 @@ * Written by Nick Kurshev. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -72,8 +72,10 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; +DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; -#define RGB2YUV_SHIFT 8 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) @@ -125,6 +127,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; #undef COMPILE_TEMPLATE_AMD3DNOW #define COMPILE_TEMPLATE_MMXEXT 0 #define COMPILE_TEMPLATE_SSE2 0 +#define COMPILE_TEMPLATE_AVX 0 #define COMPILE_TEMPLATE_AMD3DNOW 1 #define RENAME(a) a ## _3dnow #include "rgb2rgb_template.c" diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c index dc3c694a9e..455e7c25a8 100644 --- a/libswscale/x86/rgb2rgb_template.c +++ b/libswscale/x86/rgb2rgb_template.c @@ -7,20 +7,20 @@ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * lot of big-endian byte order fixes by Alex Beregszaszi * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -129,14 +129,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr "movq %%mm4, %%mm3 \n\t" \ "psllq $48, %%mm2 \n\t" \ "psllq $32, %%mm3 \n\t" \ - "pand "MANGLE(mask24hh)", %%mm2\n\t" \ - "pand "MANGLE(mask24hhh)", %%mm3\n\t" \ "por %%mm2, %%mm0 \n\t" \ "psrlq $16, %%mm1 \n\t" \ "psrlq $32, %%mm4 \n\t" \ "psllq $16, %%mm5 \n\t" \ "por %%mm3, %%mm1 \n\t" \ - "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \ "por %%mm5, %%mm4 \n\t" \ \ MOVNTQ" %%mm0, (%0) \n\t" \ @@ -715,27 +712,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s } } -/* - I use less accurate approximation here by simply left-shifting the input - value and filling the low order bits with zeroes. This method improves PNG - compression but this scheme cannot reproduce white exactly, since it does - not generate an all-ones maximum value; the net effect is to darken the - image slightly. - - The better method should be "left bit replication": - - 4 3 2 1 0 - --------- - 1 1 0 1 1 - - 7 6 5 4 3 2 1 0 - ---------------- - 1 1 0 1 1 1 1 0 - |=======| |===| - | leftmost bits repeated to fill open bits - | - original bits -*/ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) { const uint16_t *end; @@ -754,9 +730,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -784,9 +761,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -832,9 +810,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); } } @@ -856,9 +834,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -886,9 +866,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -933,9 +915,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); } } @@ -978,11 +960,12 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw %5, %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) + ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) :"memory"); d += 16; s += 4; @@ -992,9 +975,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); *d++ = 255; } } @@ -1019,11 +1002,13 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r) + ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) :"memory"); d += 16; s += 4; @@ -1033,9 +1018,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); *d++ = 255; } } @@ -1627,10 +1612,15 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t * others are ignored in the C version. * FIXME: Write HQ version. */ +#if HAVE_7REGS static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, - int lumStride, int chromStride, int srcStride) + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv) { +#define BGR2Y_IDX "16*4+16*32" +#define BGR2U_IDX "16*4+16*33" +#define BGR2V_IDX "16*4+16*34" int y; const x86_reg chromWidth= width>>1; for (y=0; y<height-2; y+=2) { @@ -1638,7 +1628,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ for (i=0; i<2; i++) { __asm__ volatile( "mov %2, %%"REG_a" \n\t" - "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" + "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" "movq "MANGLE(ff_w1111)", %%mm5 \n\t" "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" @@ -1657,12 +1647,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "pmaddwd %%mm6, %%mm1 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm0 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm1, %%mm0 \n\t" "packssdw %%mm3, %%mm2 \n\t" "pmaddwd %%mm5, %%mm0 \n\t" @@ -1682,12 +1670,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "pmaddwd %%mm6, %%mm1 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm4 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm1, %%mm4 \n\t" "packssdw %%mm3, %%mm2 \n\t" "pmaddwd %%mm5, %%mm4 \n\t" @@ -1702,7 +1688,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" "add $8, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) + : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) : "%"REG_a, "%"REG_d ); ydst += lumStride; @@ -1712,7 +1698,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ __asm__ volatile( "mov %4, %%"REG_a" \n\t" "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" + "movq "BGR2U_IDX"(%5), %%mm6 \n\t" "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" "add %%"REG_d", %%"REG_d" \n\t" @@ -1761,19 +1747,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm2 \n\t" #endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + "movq "BGR2V_IDX"(%5), %%mm1 \n\t" + "movq "BGR2V_IDX"(%5), %%mm3 \n\t" "pmaddwd %%mm0, %%mm1 \n\t" "pmaddwd %%mm2, %%mm3 \n\t" "pmaddwd %%mm6, %%mm0 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm0 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm2, %%mm0 \n\t" "packssdw %%mm3, %%mm1 \n\t" "pmaddwd %%mm5, %%mm0 \n\t" @@ -1823,19 +1807,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm2 \n\t" #endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + "movq "BGR2V_IDX"(%5), %%mm1 \n\t" + "movq "BGR2V_IDX"(%5), %%mm3 \n\t" "pmaddwd %%mm4, %%mm1 \n\t" "pmaddwd %%mm2, %%mm3 \n\t" "pmaddwd %%mm6, %%mm4 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm4 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm2, %%mm4 \n\t" "packssdw %%mm3, %%mm1 \n\t" "pmaddwd %%mm5, %%mm4 \n\t" @@ -1854,7 +1836,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "movd %%mm0, (%3, %%"REG_a") \n\t" "add $4, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) + : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) : "%"REG_a, "%"REG_d ); @@ -1867,8 +1849,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ SFENCE" \n\t" :::"memory"); - rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride); + ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); } +#endif /* HAVE_7REGS */ #endif /* !COMPILE_TEMPLATE_SSE2 */ #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX @@ -1943,9 +1926,13 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui } #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ +#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src, const uint8_t *unused, int w, + const uint8_t *unused, + const uint8_t *src1, + const uint8_t *src2, + int w, uint32_t *unused2); static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, @@ -1954,7 +1941,7 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t int h; for (h = 0; h < height; h++) { - RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL); + RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL); src += srcStride; dst1 += dst1Stride; dst2 += dst2Stride; @@ -1966,6 +1953,7 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t ); } #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ +#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */ #if !COMPILE_TEMPLATE_SSE2 #if !COMPILE_TEMPLATE_AMD3DNOW @@ -2395,7 +2383,7 @@ static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src, ydst, width); @@ -2421,7 +2409,7 @@ static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src, ydst, width); @@ -2445,7 +2433,7 @@ static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src+1, ydst, width); @@ -2471,7 +2459,7 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src+1, ydst, width); @@ -2527,7 +2515,9 @@ static av_cold void RENAME(rgb2rgb_init)(void) #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW planar2x = RENAME(planar2x); #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */ - rgb24toyv12 = RENAME(rgb24toyv12); +#if HAVE_7REGS + ff_rgb24toyv12 = RENAME(rgb24toyv12); +#endif /* HAVE_7REGS */ yuyvtoyuv420 = RENAME(yuyvtoyuv420); uyvytoyuv420 = RENAME(uyvytoyuv420); @@ -2536,7 +2526,9 @@ static av_cold void RENAME(rgb2rgb_init)(void) #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX interleaveBytes = RENAME(interleaveBytes); #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ +#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM deinterleaveBytes = RENAME(deinterleaveBytes); #endif +#endif } diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 440a27b0ba..940f35744e 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -2,20 +2,20 @@ ;* x86-optimized horizontal line scaling functions ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3 SCALE_FUNCS 8, 15, %1 SCALE_FUNCS 9, 15, %2 SCALE_FUNCS 10, 15, %2 +SCALE_FUNCS 12, 15, %2 +SCALE_FUNCS 14, 15, %2 SCALE_FUNCS 16, 15, %3 %endif ; !sse4 SCALE_FUNCS 8, 19, %1 SCALE_FUNCS 9, 19, %2 SCALE_FUNCS 10, 19, %2 +SCALE_FUNCS 12, 19, %2 +SCALE_FUNCS 14, 19, %2 SCALE_FUNCS 16, 19, %3 %endmacro diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index a2bce48339..d9294ce783 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,6 +23,7 @@ #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/intreadwrite.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -57,19 +58,15 @@ DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL; DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL; DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL; -#ifdef FAST_BGR2YV12 -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL; -#else DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; -#endif /* FAST_BGR2YV12 */ + DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; + //MMX versions #if HAVE_MMX_INLINE #undef RENAME @@ -117,9 +114,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI c->greenDither= ff_dither4[dstY&1]; c->redDither= ff_dither8[(dstY+1)&1]; if (dstY < dstH - 2) { - const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; - const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; - const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; + const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; + const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; int i; if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { @@ -186,7 +183,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i]; lumMmxFilter[4*i+2]= lumMmxFilter[4*i+3]= - ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U; if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i]; alpMmxFilter[4*i+2]= @@ -197,12 +194,83 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i]; chrMmxFilter[4*i+2]= chrMmxFilter[4*i+3]= - ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U; } } } } +#if HAVE_MMXEXT +static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + if(((int)dest) & 15){ + return yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); + } + if (offset) { + __asm__ volatile("movq (%0), %%xmm3\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrlq $24, %%xmm3\n\t" + "psllq $40, %%xmm4\n\t" + "por %%xmm4, %%xmm3\n\t" + :: "r"(dither) + ); + } else { + __asm__ volatile("movq (%0), %%xmm3\n\t" + :: "r"(dither) + ); + } + filterSize--; + __asm__ volatile( + "pxor %%xmm0, %%xmm0\n\t" + "punpcklbw %%xmm0, %%xmm3\n\t" + "movd %0, %%xmm1\n\t" + "punpcklwd %%xmm1, %%xmm1\n\t" + "punpckldq %%xmm1, %%xmm1\n\t" + "punpcklqdq %%xmm1, %%xmm1\n\t" + "psllw $3, %%xmm1\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "psraw $4, %%xmm3\n\t" + ::"m"(filterSize) + ); + __asm__ volatile( + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\ + "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\ + "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%xmm0, %%xmm2 \n\t"\ + "pmulhw %%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm2, %%xmm3 \n\t"\ + "paddw %%xmm5, %%xmm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%xmm3 \n\t"\ + "psraw $3, %%xmm4 \n\t"\ + "packuswb %%xmm4, %%xmm3 \n\t" + "movntdq %%xmm3, (%1, %%"REG_c")\n\t" + "add $16, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movdqa %%xmm7, %%xmm3\n\t" + "movdqa %%xmm7, %%xmm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} +#endif + #endif /* HAVE_INLINE_ASM */ #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ @@ -216,10 +284,14 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ SCALE_FUNC(filter_n, 8, 15, opt); \ SCALE_FUNC(filter_n, 9, 15, opt); \ SCALE_FUNC(filter_n, 10, 15, opt); \ + SCALE_FUNC(filter_n, 12, 15, opt); \ + SCALE_FUNC(filter_n, 14, 15, opt); \ SCALE_FUNC(filter_n, 16, 15, opt); \ SCALE_FUNC(filter_n, 8, 19, opt); \ SCALE_FUNC(filter_n, 9, 19, opt); \ SCALE_FUNC(filter_n, 10, 19, opt); \ + SCALE_FUNC(filter_n, 12, 19, opt); \ + SCALE_FUNC(filter_n, 14, 19, opt); \ SCALE_FUNC(filter_n, 16, 19, opt) #define SCALE_FUNCS_MMX(opt) \ @@ -275,11 +347,14 @@ VSCALE_FUNCS(avx, avx); #define INPUT_Y_FUNC(fmt, opt) \ void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \ + const uint8_t *unused1, const uint8_t *unused2, \ int w, uint32_t *unused) #define INPUT_UV_FUNC(fmt, opt) \ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ - const uint8_t *src, const uint8_t *unused1, \ - int w, uint32_t *unused2) + const uint8_t *unused0, \ + const uint8_t *src1, \ + const uint8_t *src2, \ + int w, uint32_t *unused) #define INPUT_FUNC(fmt, opt) \ INPUT_Y_FUNC(fmt, opt); \ INPUT_UV_FUNC(fmt, opt) @@ -313,20 +388,31 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) #if HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) sws_init_swscale_mmxext(c); + if (cpu_flags & AV_CPU_FLAG_SSE3){ + if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) + c->yuv2planeX = yuv2yuvX_sse3; + } #endif #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ if (c->srcBpc == 8) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ ff_hscale8to19_ ## filtersize ## _ ## opt1; \ } else if (c->srcBpc == 9) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ ff_hscale9to19_ ## filtersize ## _ ## opt1; \ } else if (c->srcBpc == 10) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ ff_hscale10to19_ ## filtersize ## _ ## opt1; \ - } else /* c->srcBpc == 16 */ { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ + } else if (c->srcBpc == 12) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale12to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale14to19_ ## filtersize ## _ ## opt1; \ + } else { /* c->srcBpc == 16 */ \ + av_assert0(c->srcBpc == 16);\ + hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ ff_hscale16to19_ ## filtersize ## _ ## opt1; \ } \ } while (0) @@ -341,14 +427,15 @@ switch(c->dstBpc){ \ case 16: do_16_case; break; \ case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \ case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \ - default: if (condition_8bit) vscalefn = ff_yuv2planeX_8_ ## opt; break; \ + default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \ } #define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \ switch(c->dstBpc){ \ case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ - default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + default: av_assert0(c->dstBpc>8); \ } #define case_rgb(x, X, opt) \ case AV_PIX_FMT_ ## X: \ diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 80a3ad9a3d..c7a1bb46d9 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1,25 +1,26 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #undef REAL_MOVNTQ #undef MOVNTQ +#undef MOVNTQ2 #undef PREFETCH #if COMPILE_TEMPLATE_MMXEXT @@ -30,11 +31,95 @@ #if COMPILE_TEMPLATE_MMXEXT #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" +#define MOVNTQ2 "movntq " #else #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" +#define MOVNTQ2 "movq " #endif #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) +#if !COMPILE_TEMPLATE_MMXEXT +static av_always_inline void +dither_8to16(const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } +} +#endif + +static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + dither_8to16(dither, offset); + filterSize--; + __asm__ volatile( + "movd %0, %%mm1\n\t" + "punpcklwd %%mm1, %%mm1\n\t" + "punpckldq %%mm1, %%mm1\n\t" + "psllw $3, %%mm1\n\t" + "paddw %%mm1, %%mm3\n\t" + "paddw %%mm1, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + ::"m"(filterSize) + ); + + __asm__ volatile(\ + "movq %%mm3, %%mm6\n\t" + "movq %%mm4, %%mm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ + "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t" + MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" + "add $8, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movq %%mm6, %%mm3\n\t" + "movq %%mm7, %%mm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} + #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ "xor %%"REG_a", %%"REG_a" \n\t"\ @@ -260,7 +345,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -293,7 +378,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -350,7 +435,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -374,7 +459,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -427,7 +512,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -451,7 +536,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -584,7 +669,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -608,7 +693,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -649,7 +734,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -670,7 +755,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -786,8 +871,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], : "%r8" ); #else - *(const uint16_t **)(&c->u_temp)=abuf0; - *(const uint16_t **)(&c->v_temp)=abuf1; + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -1559,9 +1644,9 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) { enum AVPixelFormat dstFormat = c->dstFormat; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && - dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) { - if (!(c->flags & SWS_BITEXACT)) { + c->use_mmx_vfilter= 0; + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 + && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { @@ -1574,6 +1659,8 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } else { + c->use_mmx_vfilter= 1; + c->yuv2planeX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; @@ -1585,7 +1672,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } - } if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: @@ -1614,7 +1700,7 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } - if (c->srcBpc == 8 && c->dstBpc <= 10) { + if (c->srcBpc == 8 && c->dstBpc <= 14) { // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). #if COMPILE_TEMPLATE_MMXEXT if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c index dd9a2a4378..88143d9687 100644 --- a/libswscale/x86/w64xmmtest.c +++ b/libswscale/x86/w64xmmtest.c @@ -2,20 +2,20 @@ * check XMM registers for clobbers on Win64 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c index bacc87f6c7..a3370eec61 100644 --- a/libswscale/x86/yuv2rgb.c +++ b/libswscale/x86/yuv2rgb.c @@ -7,27 +7,26 @@ * 1,4,8bpp support and context / deglobalize stuff * by Michael Niedermayer (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdio.h> #include <stdlib.h> #include <inttypes.h> -#include <assert.h> #include "config.h" #include "libswscale/rgb2rgb.h" @@ -75,10 +74,6 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) #if HAVE_MMX_INLINE int cpu_flags = av_get_cpu_flags(); - if (c->srcFormat != AV_PIX_FMT_YUV420P && - c->srcFormat != AV_PIX_FMT_YUVA420P) - return NULL; - #if HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) { switch (c->dstFormat) { diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c index b028e93152..c879102cc4 100644 --- a/libswscale/x86/yuv2rgb_template.c +++ b/libswscale/x86/yuv2rgb_template.c @@ -4,20 +4,20 @@ * Copyright (C) 2001-2007 Michael Niedermayer * (c) 2010 Konstantin Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -43,17 +43,14 @@ if (h_size * depth > FFABS(dstStride[0])) \ h_size -= 8; \ \ - if (c->srcFormat == AV_PIX_FMT_YUV422P) { \ - srcStride[1] *= 2; \ - srcStride[2] *= 2; \ - } \ + vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \ \ __asm__ volatile ("pxor %mm4, %mm4\n\t"); \ for (y = 0; y < srcSliceH; y++) { \ uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \ const uint8_t *py = src[0] + y * srcStride[0]; \ - const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \ - const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \ + const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \ + const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \ x86_reg index = -h_size / 2; \ #define YUV2RGB_INITIAL_LOAD \ @@ -141,6 +138,7 @@ : "+r" (index), "+r" (image) \ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ "r" (py - 2*index) \ + : "memory" \ ); \ } \ @@ -148,6 +146,7 @@ : "+r" (index), "+r" (image) \ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ "r" (py - 2*index), "r" (pa - 2*index) \ + : "memory" \ ); \ } \ @@ -188,7 +187,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(2) @@ -216,7 +215,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(2) @@ -306,7 +305,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(3) @@ -324,7 +323,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(3) @@ -368,7 +367,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -389,7 +388,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -411,7 +410,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -432,7 +431,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) |