summary | refs | log | tree | commit | diff
path: root/libswscale/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libswscale/x86')
-rw-r--r-- libswscale/x86/Makefile 2
-rw-r--r-- libswscale/x86/input.asm 206
-rw-r--r-- libswscale/x86/output.asm 12
-rw-r--r-- libswscale/x86/rgb2rgb.c 13
-rw-r--r-- libswscale/x86/rgb2rgb_template.c 162
-rw-r--r-- libswscale/x86/scale.asm 12
-rw-r--r-- libswscale/x86/swscale.c 137
-rw-r--r-- libswscale/x86/swscale_template.c 130
-rw-r--r-- libswscale/x86/w64xmmtest.c 8
-rw-r--r-- libswscale/x86/yuv2rgb.c 13
-rw-r--r-- libswscale/x86/yuv2rgb_template.c 35
11 files changed, 465 insertions, 265 deletions
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index b94b14abbb..e767a5c420 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,3 +1,5 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 6f5677e1fd..0c4f30e6e0 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -4,20 +4,20 @@
;* into YUV planes also.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -35,33 +35,59 @@ SECTION_RODATA
%define GV 0xD0E3
%define BV 0xF6E4
-rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15
-rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15
-bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
-bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
-rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
-rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
-bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
-bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
-rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
-rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
-bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
-bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
-rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
-rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
-
-rgba_Ycoeff_rb: times 4 dw RY, BY
-rgba_Ycoeff_br: times 4 dw BY, RY
-rgba_Ycoeff_ga: times 4 dw GY, 0
-rgba_Ycoeff_ag: times 4 dw 0, GY
-rgba_Ucoeff_rb: times 4 dw RU, BU
-rgba_Ucoeff_br: times 4 dw BU, RU
-rgba_Ucoeff_ga: times 4 dw GU, 0
-rgba_Ucoeff_ag: times 4 dw 0, GU
-rgba_Vcoeff_rb: times 4 dw RV, BV
-rgba_Vcoeff_br: times 4 dw BV, RV
-rgba_Vcoeff_ga: times 4 dw GV, 0
-rgba_Vcoeff_ag: times 4 dw 0, GV
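+rgb_Yrnd: times 4 dd 0x80100 ; (16 << 15) + (1 << 8), i.e. offset 16 plus rounding for the >> 9
+rgb_UVrnd: times 4 dd 0x400100 ; (128 << 15) + (1 << 8), i.e. offset 128 plus rounding for the >> 9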
+%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
+%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
+%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
+%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
+%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
+%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
+%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
+%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
+%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
+%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
+%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
+%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
+
+%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
+%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
+%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
+%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
+%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
+%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
+%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
+%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
+%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
+%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
+%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
+%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
+
+; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
+; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
+; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
+; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
+; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
+; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
+; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
+; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
+; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
+; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
+; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
+; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
+
+; rgba_Ycoeff_rb: times 4 dw RY, BY
+; rgba_Ycoeff_br: times 4 dw BY, RY
+; rgba_Ycoeff_ga: times 4 dw GY, 0
+; rgba_Ycoeff_ag: times 4 dw 0, GY
+; rgba_Ucoeff_rb: times 4 dw RU, BU
+; rgba_Ucoeff_br: times 4 dw BU, RU
+; rgba_Ucoeff_ga: times 4 dw GU, 0
+; rgba_Ucoeff_ag: times 4 dw 0, GU
+; rgba_Vcoeff_rb: times 4 dw RV, BV
+; rgba_Vcoeff_br: times 4 dw BV, RV
+; rgba_Vcoeff_ga: times 4 dw GV, 0
+; rgba_Vcoeff_ag: times 4 dw 0, GV
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
@@ -82,7 +108,7 @@ SECTION .text
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
@@ -114,6 +140,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
+ add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
@@ -157,12 +184,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
- psrad m0, 15
- psrad m2, 15
+ psrad m0, 9
+ psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
- packuswb m0, m0 ; (byte) { Y[0-7] }
- movh [dstq+wq], m0
- add wq, mmsize / 2
+ mova [dstq+wq], m0
+ add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -171,7 +197,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
@@ -202,10 +228,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
+ add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
@@ -263,23 +290,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
- psrad m0, 15
- psrad m2, 15
- psrad m1, 15
- psrad m4, 15
+ psrad m0, 9
+ psrad m2, 9
+ psrad m1, 9
+ psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
- packuswb m0, m0 ; (byte) { U[0-3] }
- packuswb m2, m2 ; (byte) { V[0-3] }
- movh [dstUq+wq], m0
- movh [dstVq+wq], m2
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%else ; mmsize == 16
- packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
- movh [dstUq+wq], m0
- movhps [dstVq+wq], m0
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%endif ; mmsize == 8/16
- add wq, mmsize / 2
+ add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
@@ -305,13 +329,15 @@ RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
+%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
@@ -322,6 +348,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
movsxd wq, wd
%endif
lea srcq, [srcq+wq*4]
+ add wq, wq
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
@@ -329,8 +356,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
- movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
@@ -340,12 +367,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
- psrad m0, 15
- psrad m2, 15
+ psrad m0, 9
+ psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
- packuswb m0, m0 ; (byte) { Y[0-7] }
- movh [dstq+wq], m0
- add wq, mmsize / 2
+ mova [dstq+wq], m0
+ add wq, mmsize
jl .loop
REP_RET
%endif ; %0 == 3
@@ -354,7 +380,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
@@ -375,21 +401,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
+ add wq, wq
add dstUq, wq
add dstVq, wq
- lea srcq, [srcq+wq*4]
+ lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
- movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
@@ -405,25 +432,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
- psrad m0, 15
+ psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
- psrad m2, 15
- psrad m4, 15
- psrad m1, 15
+ psrad m2, 9
+ psrad m4, 9
+ psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
- packuswb m0, m0 ; (byte) { U[0-7] }
- packuswb m2, m2 ; (byte) { V[0-7] }
- movh [dstUq+wq], m0
- movh [dstVq+wq], m2
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%else ; mmsize == 16
- packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
- movh [dstUq+wq], m0
- movhps [dstVq+wq], m0
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%endif ; mmsize == 8/16
- add wq, mmsize / 2
+ add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
@@ -451,8 +475,10 @@ RGB32_FUNCS 0, 0
INIT_XMM sse2
RGB32_FUNCS 8, 12
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
+%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -489,7 +515,7 @@ RGB32_FUNCS 8, 12
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
@@ -559,11 +585,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
@@ -593,8 +619,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
- pand m2, m0, m4 ; (word) { U0, U1, ..., U7 }
- pand m3, m1, m4 ; (word) { U8, U9, ..., U15 }
+ pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
+ pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
@@ -614,11 +640,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
@@ -626,8 +652,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
- pcmpeqb m4, m4 ; (byte) { 0xff } x 16
- psrlw m4, 8 ; (word) { 0x00ff } x 8
+ pcmpeqb m5, m5 ; (byte) { 0xff } x 16
+ psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
@@ -659,6 +685,7 @@ YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
@@ -666,3 +693,4 @@ YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
+%endif
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index e1ceded756..9ea4af9535 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -264,10 +264,12 @@ yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
+%endif
; %1=outout-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
@@ -402,8 +404,10 @@ yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
+%endif
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 9cfe831e3c..8cc99c6c58 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -6,20 +6,20 @@
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -72,8 +72,10 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
-#define RGB2YUV_SHIFT 8
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
@@ -125,6 +127,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AVX 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3dnow
#include "rgb2rgb_template.c"
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index dc3c694a9e..455e7c25a8 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -7,20 +7,20 @@
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -129,14 +129,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr
"movq %%mm4, %%mm3 \n\t" \
"psllq $48, %%mm2 \n\t" \
"psllq $32, %%mm3 \n\t" \
- "pand "MANGLE(mask24hh)", %%mm2\n\t" \
- "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
"por %%mm2, %%mm0 \n\t" \
"psrlq $16, %%mm1 \n\t" \
"psrlq $32, %%mm4 \n\t" \
"psllq $16, %%mm5 \n\t" \
"por %%mm3, %%mm1 \n\t" \
- "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
"por %%mm5, %%mm4 \n\t" \
\
MOVNTQ" %%mm0, (%0) \n\t" \
@@ -715,27 +712,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s
}
}
-/*
- I use less accurate approximation here by simply left-shifting the input
- value and filling the low order bits with zeroes. This method improves PNG
- compression but this scheme cannot reproduce white exactly, since it does
- not generate an all-ones maximum value; the net effect is to darken the
- image slightly.
-
- The better method should be "left bit replication":
-
- 4 3 2 1 0
- ---------
- 1 1 0 1 1
-
- 7 6 5 4 3 2 1 0
- ----------------
- 1 1 0 1 1 1 1 0
- |=======| |===|
- | leftmost bits repeated to fill open bits
- |
- original bits
-*/
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
@@ -754,9 +730,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -784,9 +761,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -832,9 +810,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x3E0)>>2;
- *d++ = (bgr&0x7C00)>>7;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
@@ -856,9 +834,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -886,9 +866,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -933,9 +915,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x7E0)>>3;
- *d++ = (bgr&0xF800)>>8;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
@@ -978,11 +960,12 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw %5, %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+ ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
:"memory");
d += 16;
s += 4;
@@ -992,9 +975,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x3E0)>>2;
- *d++ = (bgr&0x7C00)>>7;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
}
}
@@ -1019,11 +1002,13 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
+ ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
:"memory");
d += 16;
s += 4;
@@ -1033,9 +1018,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x7E0)>>3;
- *d++ = (bgr&0xF800)>>8;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
}
}
@@ -1627,10 +1612,15 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
+#if HAVE_7REGS
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
- int lumStride, int chromStride, int srcStride)
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv)
{
+#define BGR2Y_IDX "16*4+16*32"
+#define BGR2U_IDX "16*4+16*33"
+#define BGR2V_IDX "16*4+16*34"
int y;
const x86_reg chromWidth= width>>1;
for (y=0; y<height-2; y+=2) {
@@ -1638,7 +1628,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
for (i=0; i<2; i++) {
__asm__ volatile(
"mov %2, %%"REG_a" \n\t"
- "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
+ "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
@@ -1657,12 +1647,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
@@ -1682,12 +1670,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
@@ -1702,7 +1688,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
- : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
+ : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
: "%"REG_a, "%"REG_d
);
ydst += lumStride;
@@ -1712,7 +1698,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
__asm__ volatile(
"mov %4, %%"REG_a" \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
+ "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
"add %%"REG_d", %%"REG_d" \n\t"
@@ -1761,19 +1747,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
@@ -1823,19 +1807,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
@@ -1854,7 +1836,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"movd %%mm0, (%3, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
" js 1b \n\t"
- : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
+ : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
: "%"REG_a, "%"REG_d
);
@@ -1867,8 +1849,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
SFENCE" \n\t"
:::"memory");
- rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
+ ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
+#endif /* HAVE_7REGS */
#endif /* !COMPILE_TEMPLATE_SSE2 */
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
@@ -1943,9 +1926,13 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
+#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src, const uint8_t *unused, int w,
+ const uint8_t *unused,
+ const uint8_t *src1,
+ const uint8_t *src2,
+ int w,
uint32_t *unused2);
static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
@@ -1954,7 +1941,7 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t
int h;
for (h = 0; h < height; h++) {
- RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL);
+ RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
src += srcStride;
dst1 += dst1Stride;
dst2 += dst2Stride;
@@ -1966,6 +1953,7 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
@@ -2395,7 +2383,7 @@ static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
int lumStride, int chromStride, int srcStride)
{
int y;
- const int chromWidth= -((-width)>>1);
+ const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
RENAME(extract_even)(src, ydst, width);
@@ -2421,7 +2409,7 @@ static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
int lumStride, int chromStride, int srcStride)
{
int y;
- const int chromWidth= -((-width)>>1);
+ const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
RENAME(extract_even)(src, ydst, width);
@@ -2445,7 +2433,7 @@ static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
int lumStride, int chromStride, int srcStride)
{
int y;
- const int chromWidth= -((-width)>>1);
+ const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
RENAME(extract_even)(src+1, ydst, width);
@@ -2471,7 +2459,7 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
int lumStride, int chromStride, int srcStride)
{
int y;
- const int chromWidth= -((-width)>>1);
+ const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
RENAME(extract_even)(src+1, ydst, width);
@@ -2527,7 +2515,9 @@ static av_cold void RENAME(rgb2rgb_init)(void)
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
planar2x = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
- rgb24toyv12 = RENAME(rgb24toyv12);
+#if HAVE_7REGS
+ ff_rgb24toyv12 = RENAME(rgb24toyv12);
+#endif /* HAVE_7REGS */
yuyvtoyuv420 = RENAME(yuyvtoyuv420);
uyvytoyuv420 = RENAME(uyvytoyuv420);
@@ -2536,7 +2526,9 @@ static av_cold void RENAME(rgb2rgb_init)(void)
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
interleaveBytes = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
+#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
deinterleaveBytes = RENAME(deinterleaveBytes);
#endif
+#endif
}
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 440a27b0ba..940f35744e 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -2,20 +2,20 @@
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index a2bce48339..d9294ce783 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -1,20 +1,20 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,6 +23,7 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -57,19 +58,15 @@ DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
-#ifdef FAST_BGR2YV12
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL;
-#else
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
-#endif /* FAST_BGR2YV12 */
+
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
+
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
@@ -117,9 +114,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
- const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
- const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
- const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+ const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
@@ -186,7 +183,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
- ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
+ ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
@@ -197,12 +194,83 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
- ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
+ ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
+#if HAVE_MMXEXT
+/* Vertical filter to 8-bit output, 16 pixels per pass (SSE3 movddup). The table at `filter`
+ * is walked as 16-byte entries {source row pointer, coefficient at +8}, ended by a NULL pointer;
+ * `src` is only forwarded to the fallback. NOTE(review): no xmm register is declared clobbered. */
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+                          const int16_t **src, uint8_t *dest, int dstW,
+                          const uint8_t *dither, int offset)
+{
+    if (((uintptr_t)dest) & 15) { /* movntdq/movdqa path needs 16-byte alignment; test the full pointer, not an int */
+        yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
+        return; /* a void function must not "return expr;" */
+    }
+    if (offset) {
+        __asm__ volatile("movq (%0), %%xmm3\n\t"      /* 8 dither bytes */
+                         "movdqa %%xmm3, %%xmm4\n\t"
+                         "psrlq $24, %%xmm3\n\t"      /* rotate them right by 24 bits */
+                         "psllq $40, %%xmm4\n\t"
+                         "por %%xmm4, %%xmm3\n\t"
+                         :: "r"(dither));
+    } else {
+        __asm__ volatile("movq (%0), %%xmm3\n\t"
+                         :: "r"(dither));
+    }
+    filterSize--; /* xmm3 must survive from each asm statement to the next one */
+    __asm__ volatile(
+        "pxor %%xmm0, %%xmm0\n\t"
+        "punpcklbw %%xmm0, %%xmm3\n\t"   /* dither bytes -> 8 words */
+        "movd %0, %%xmm1\n\t"
+        "punpcklwd %%xmm1, %%xmm1\n\t"   /* broadcast the low word of (filterSize - 1) ... */
+        "punpckldq %%xmm1, %%xmm1\n\t"
+        "punpcklqdq %%xmm1, %%xmm1\n\t"  /* ... to all 8 words */
+        "psllw $3, %%xmm1\n\t"
+        "paddw %%xmm1, %%xmm3\n\t"
+        "psraw $4, %%xmm3\n\t"           /* bias = (dither + 8 * (filterSize - 1)) >> 4 */
+        ::"m"(filterSize));
+    __asm__ volatile(
+        "movdqa %%xmm3, %%xmm4\n\t"
+        "movdqa %%xmm3, %%xmm7\n\t"      /* xmm7 keeps the bias for the following passes */
+        "movl %3, %%ecx\n\t"             /* pixel index starts at offset */
+        "mov %0, %%"REG_d" \n\t"
+        "mov (%%"REG_d"), %%"REG_S" \n\t"
+        ".p2align 4 \n\t" /* FIXME Unroll? */
+        "1: \n\t"
+        "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */
+        "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */
+        "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */
+        "add $16, %%"REG_d" \n\t"
+        "mov (%%"REG_d"), %%"REG_S" \n\t"
+        "test %%"REG_S", %%"REG_S" \n\t"   /* NULL row pointer ends the tap list */
+        "pmulhw %%xmm0, %%xmm2 \n\t"
+        "pmulhw %%xmm0, %%xmm5 \n\t"
+        "paddw %%xmm2, %%xmm3 \n\t"
+        "paddw %%xmm5, %%xmm4 \n\t"
+        " jnz 1b \n\t"
+        "psraw $3, %%xmm3 \n\t"
+        "psraw $3, %%xmm4 \n\t"
+        "packuswb %%xmm4, %%xmm3 \n\t"     /* saturate 16 words to 16 bytes */
+        "movntdq %%xmm3, (%1, %%"REG_c")\n\t"
+        "add $16, %%"REG_c" \n\t"
+        "cmp %2, %%"REG_c" \n\t"           /* flags are consumed by the jb below */
+        "movdqa %%xmm7, %%xmm3\n\t"
+        "movdqa %%xmm7, %%xmm4\n\t"
+        "mov %0, %%"REG_d" \n\t"
+        "mov (%%"REG_d"), %%"REG_S" \n\t"
+        "jb 1b \n\t"
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c, "memory"); /* "memory": stores to dest, reads the source rows */
+}
+#endif
+
#endif /* HAVE_INLINE_ASM */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
@@ -216,10 +284,14 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
+ SCALE_FUNC(filter_n, 12, 15, opt); \
+ SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
+ SCALE_FUNC(filter_n, 12, 19, opt); \
+ SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
@@ -275,11 +347,14 @@ VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+ const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
- const uint8_t *src, const uint8_t *unused1, \
- int w, uint32_t *unused2)
+ const uint8_t *unused0, \
+ const uint8_t *src1, \
+ const uint8_t *src2, \
+ int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
@@ -313,20 +388,31 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
+ if (cpu_flags & AV_CPU_FLAG_SSE3){
+ if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+ c->yuv2planeX = yuv2yuvX_sse3;
+ }
#endif
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
- } else /* c->srcBpc == 16 */ { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+ } else if (c->srcBpc == 12) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+ } else { /* c->srcBpc == 16 */ \
+ av_assert0(c->srcBpc == 16);\
+ hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
@@ -341,14 +427,15 @@ switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
- default: if (condition_8bit) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
+ default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
- default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 80a3ad9a3d..c7a1bb46d9 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,25 +1,26 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#undef REAL_MOVNTQ
#undef MOVNTQ
+#undef MOVNTQ2
#undef PREFETCH
#if COMPILE_TEMPLATE_MMXEXT
@@ -30,11 +31,95 @@
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
+#if !COMPILE_TEMPLATE_MMXEXT /* defined only in the non-MMXEXT expansion; presumably reused by the MMXEXT one - TODO confirm */
+static av_always_inline void /* Load 8 dither bytes and widen them to 16-bit words held in MMX registers. */
+dither_8to16(const uint8_t *srcDither, int rot) /* srcDither: 8 bytes are read; rot != 0 rotates them before widening */
+{ /* Result: mm3 = bytes 0-3 as words, mm4 = bytes 4-7 as words, mm0 = 0; nothing is returned or declared clobbered. */
+ if (rot) {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t" /* mm0 = 0, used as the high byte when unpacking */
+ "movq (%0), %%mm3\n\t" /* mm3 = the 8 dither bytes */
+ "movq %%mm3, %%mm4\n\t"
+ "psrlq $24, %%mm3\n\t" /* with the next two lines: rotate the 64-bit value right by 24 bits */
+ "psllq $40, %%mm4\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t" /* copy the rotated value so both halves can be unpacked */
+ "punpcklbw %%mm0, %%mm3\n\t" /* low 4 bytes -> 4 zero-extended words */
+ "punpckhbw %%mm0, %%mm4\n\t" /* high 4 bytes -> 4 zero-extended words */
+ :: "r"(srcDither)
+ );
+ } else {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t" /* same widening as above, without the rotation */
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ }
+}
+#endif
+
+/* Vertical filter to 8-bit output, 8 pixels per pass. `filter` is walked as 16-byte entries
+ * {source row pointer, coefficient at +8} ended by a NULL pointer; `src` is not read here. */
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    dither_8to16(dither, offset); /* leaves the widened dither in mm3/mm4 for the asm below */
+    filterSize--;                 /* mm3/mm4 must survive between the asm statements */
+    __asm__ volatile(
+        "movd %0, %%mm1\n\t"
+        "punpcklwd %%mm1, %%mm1\n\t"   /* broadcast the low word of (filterSize - 1) ... */
+        "punpckldq %%mm1, %%mm1\n\t"   /* ... to all 4 words */
+        "psllw $3, %%mm1\n\t"
+        "paddw %%mm1, %%mm3\n\t"
+        "paddw %%mm1, %%mm4\n\t"
+        "psraw $4, %%mm3\n\t"          /* bias = (dither + 8 * (filterSize - 1)) >> 4 */
+        "psraw $4, %%mm4\n\t"
+        ::"m"(filterSize));
+
+    __asm__ volatile(
+        "movq %%mm3, %%mm6\n\t"        /* mm6/mm7 keep the bias for the following passes */
+        "movq %%mm4, %%mm7\n\t"
+        "movl %3, %%ecx\n\t"           /* pixel index starts at offset */
+        "mov %0, %%"REG_d" \n\t"
+        "mov (%%"REG_d"), %%"REG_S" \n\t"
+        ".p2align 4 \n\t" /* FIXME Unroll? */
+        "1: \n\t"
+        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */
+        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */
+        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */
+        "add $16, %%"REG_d" \n\t"
+        "mov (%%"REG_d"), %%"REG_S" \n\t"
+        "test %%"REG_S", %%"REG_S" \n\t" /* NULL row pointer ends the tap list */
+        "pmulhw %%mm0, %%mm2 \n\t"
+        "pmulhw %%mm0, %%mm5 \n\t"
+        "paddw %%mm2, %%mm3 \n\t"
+        "paddw %%mm5, %%mm4 \n\t"
+        " jnz 1b \n\t"
+        "psraw $3, %%mm3 \n\t"
+        "psraw $3, %%mm4 \n\t"
+        "packuswb %%mm4, %%mm3 \n\t"     /* saturate 8 words to 8 bytes */
+        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
+        "add $8, %%"REG_c" \n\t"
+        "cmp %2, %%"REG_c" \n\t"         /* flags are consumed by the jb below */
+        "movq %%mm6, %%mm3\n\t"
+        "movq %%mm7, %%mm4\n\t"
+        "mov %0, %%"REG_d" \n\t"
+        "mov (%%"REG_d"), %%"REG_S" \n\t"
+        "jb 1b \n\t"
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c, "memory"); /* "memory": stores to dest, reads the source rows */
+}
+
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
@@ -260,7 +345,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE
@@ -293,7 +378,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
@@ -350,7 +435,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -374,7 +459,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -427,7 +512,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -451,7 +536,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -584,7 +669,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -608,7 +693,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -649,7 +734,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -670,7 +755,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -786,8 +871,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
: "%r8"
);
#else
- *(const uint16_t **)(&c->u_temp)=abuf0;
- *(const uint16_t **)(&c->v_temp)=abuf1;
+ c->u_temp=(intptr_t)abuf0;
+ c->v_temp=(intptr_t)abuf1;
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
@@ -1559,9 +1644,9 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
enum AVPixelFormat dstFormat = c->dstFormat;
- if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
- dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) {
- if (!(c->flags & SWS_BITEXACT)) {
+ c->use_mmx_vfilter= 0;
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
+ && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
@@ -1574,6 +1659,8 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
} else {
+ c->use_mmx_vfilter= 1;
+ c->yuv2planeX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
@@ -1585,7 +1672,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
}
- }
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
@@ -1614,7 +1700,7 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
- if (c->srcBpc == 8 && c->dstBpc <= 10) {
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
// Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c
index dd9a2a4378..88143d9687 100644
--- a/libswscale/x86/w64xmmtest.c
+++ b/libswscale/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index bacc87f6c7..a3370eec61 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -7,27 +7,26 @@
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
-#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
@@ -75,10 +74,6 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
- if (c->srcFormat != AV_PIX_FMT_YUV420P &&
- c->srcFormat != AV_PIX_FMT_YUVA420P)
- return NULL;
-
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
switch (c->dstFormat) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index b028e93152..c879102cc4 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -4,20 +4,20 @@
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -43,17 +43,14 @@
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
- if (c->srcFormat == AV_PIX_FMT_YUV422P) { \
- srcStride[1] *= 2; \
- srcStride[2] *= 2; \
- } \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
- const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
- const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
@@ -141,6 +138,7 @@
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
+ : "memory" \
); \
} \
@@ -148,6 +146,7 @@
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
+ : "memory" \
); \
} \
@@ -188,7 +187,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(2)
@@ -216,7 +215,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(2)
@@ -306,7 +305,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(3)
@@ -324,7 +323,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(3)
@@ -368,7 +367,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -389,7 +388,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -411,7 +410,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -432,7 +431,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)