Diffstat (limited to 'libswscale/x86')
-rw-r--r--  libswscale/x86/Makefile               2
-rw-r--r--  libswscale/x86/input.asm            122
-rw-r--r--  libswscale/x86/output.asm             4
-rw-r--r--  libswscale/x86/rgb2rgb.c             11
-rw-r--r--  libswscale/x86/rgb2rgb_template.c   105
-rw-r--r--  libswscale/x86/scale.asm              4
-rw-r--r--  libswscale/x86/swscale.c            120
-rw-r--r--  libswscale/x86/swscale_template.c   119
-rw-r--r--  libswscale/x86/yuv2rgb.c             12
-rw-r--r--  libswscale/x86/yuv2rgb_template.c    35
10 files changed, 343 insertions, 191 deletions
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 5416d48a4c..7d219b458b 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,3 +1,5 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
MMX-OBJS += x86/rgb2rgb.o \
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index a8d5a5a187..327b9f0bc5 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -35,8 +35,8 @@ SECTION_RODATA
%define GV 0xD0E3
%define BV 0xF6E4
-rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15
-rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15
+rgb_Yrnd: times 4 dd 0x80100 ; 16 << 15 + 0.5 << 9
+rgb_UVrnd: times 4 dd 0x400100 ; 128 << 15 + 0.5 << 9
bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
@@ -82,7 +82,7 @@ SECTION .text
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
@@ -114,6 +114,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
+ add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
@@ -157,12 +158,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
- psrad m0, 15
- psrad m2, 15
+ psrad m0, 9
+ psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
- packuswb m0, m0 ; (byte) { Y[0-7] }
- movh [dstq+wq], m0
- add wq, mmsize / 2
+ mova [dstq+wq], m0
+ add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -171,7 +171,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
@@ -202,10 +202,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
+ add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
@@ -263,23 +264,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
- psrad m0, 15
- psrad m2, 15
- psrad m1, 15
- psrad m4, 15
+ psrad m0, 9
+ psrad m2, 9
+ psrad m1, 9
+ psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
- packuswb m0, m0 ; (byte) { U[0-3] }
- packuswb m2, m2 ; (byte) { V[0-3] }
- movh [dstUq+wq], m0
- movh [dstVq+wq], m2
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%else ; mmsize == 16
- packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
- movh [dstUq+wq], m0
- movhps [dstVq+wq], m0
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%endif ; mmsize == 8/16
- add wq, mmsize / 2
+ add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
@@ -305,13 +303,15 @@ RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
+%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
@@ -322,6 +322,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
movsxd wq, wd
%endif
lea srcq, [srcq+wq*4]
+ add wq, wq
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
@@ -329,8 +330,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
- movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
@@ -340,12 +341,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
- psrad m0, 15
- psrad m2, 15
+ psrad m0, 9
+ psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
- packuswb m0, m0 ; (byte) { Y[0-7] }
- movh [dstq+wq], m0
- add wq, mmsize / 2
+ mova [dstq+wq], m0
+ add wq, mmsize
jl .loop
REP_RET
%endif ; %0 == 3
@@ -354,7 +354,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
@@ -375,21 +375,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
+ add wq, wq
add dstUq, wq
add dstVq, wq
- lea srcq, [srcq+wq*4]
+ lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
- movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
@@ -405,25 +406,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
- psrad m0, 15
+ psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
- psrad m2, 15
- psrad m4, 15
- psrad m1, 15
+ psrad m2, 9
+ psrad m4, 9
+ psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
- packuswb m0, m0 ; (byte) { U[0-7] }
- packuswb m2, m2 ; (byte) { V[0-7] }
- movh [dstUq+wq], m0
- movh [dstVq+wq], m2
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%else ; mmsize == 16
- packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
- movh [dstUq+wq], m0
- movhps [dstVq+wq], m0
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%endif ; mmsize == 8/16
- add wq, mmsize / 2
+ add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
@@ -451,8 +449,10 @@ RGB32_FUNCS 0, 0
INIT_XMM sse2
RGB32_FUNCS 8, 12
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
+%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -489,7 +489,7 @@ RGB32_FUNCS 8, 12
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
@@ -559,11 +559,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
@@ -593,8 +593,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
- pand m2, m0, m4 ; (word) { U0, U1, ..., U7 }
- pand m3, m1, m4 ; (word) { U8, U9, ..., U15 }
+ pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
+ pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
@@ -614,11 +614,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
@@ -626,8 +626,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
- pcmpeqb m4, m4 ; (byte) { 0xff } x 16
- psrlw m4, 8 ; (word) { 0x00ff } x 8
+ pcmpeqb m5, m5 ; (byte) { 0xff } x 16
+ psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
@@ -659,6 +659,7 @@ YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
@@ -666,3 +667,4 @@ YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
+%endif
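
The input.asm hunks above switch the RGB-to-YUV converters from emitting packed 8-bit bytes to 15-bit words: psrad 15 becomes psrad 9, the packuswb/movh stores become full-width mova word stores, and the extra "add wq, wq" doubles the byte stride to match. A scalar sketch of the new luma path (ry/gy/by stand for the RY/GY/BY Q15 coefficients defined at the top of input.asm; the function name is illustrative, not part of the patch):

    #include <stdint.h>

    static int16_t rgb_to_y15(int r, int g, int b, int ry, int gy, int by)
    {
        /* 0x80100 == (16 << 15) + (1 << 8): the +16 luma offset in Q15
         * plus half an LSB of the final 9-bit shift, for round-to-nearest */
        int32_t sum = r * ry + g * gy + b * by + 0x80100;
        return (int16_t)(sum >> 9); /* Y scaled by 2^6: the 15-bit intermediate */
    }
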
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index cf0dec3843..ec253ba5eb 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -266,10 +266,12 @@ yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
+%endif
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
@@ -404,8 +406,10 @@ yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
+%endif
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 486f436702..f7288d3e44 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -6,20 +6,20 @@
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -72,6 +72,9 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
#define RGB2YUV_SHIFT 8
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
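
The three new constants drive pmulhw-based "left bit replication" in rgb2rgb_template.c. pmulhw keeps the high 16 bits of a signed 16x16 multiply, so with each channel pre-positioned as in the template the constants act as fractional multipliers: 8.25 for the 5-bit channels (exactly (x<<3)|(x>>2)) and 4.0625 for 6-bit green (exactly (x<<2)|(x>>4)). A standalone check of that arithmetic (a sketch, not part of the patch):

    #include <assert.h>

    int main(void)
    {
        for (int x = 0; x < 32; x++) { /* 5-bit channels */
            /* mul15_mid on a channel at bits 5..9 */
            assert((((x <<  5) * 0x4200) >> 16) == ((x << 3) | (x >> 2)));
            /* mul15_hi on a channel at bits 10..14 */
            assert((((x << 10) * 0x0210) >> 16) == ((x << 3) | (x >> 2)));
        }
        for (int x = 0; x < 64; x++) /* 6-bit green at bits 5..10 */
            assert((((x <<  5) * 0x2080) >> 16) == ((x << 2) | (x >> 4)));
        return 0;
    }

The psrlq $1 that precedes mul15_hi in the rgb16 paths moves red from bits 11..15 down to 10..14, keeping the word positive for the signed multiply.
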
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index 205b749244..d802ab4227 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -7,20 +7,20 @@
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -127,14 +127,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr
"movq %%mm4, %%mm3 \n\t" \
"psllq $48, %%mm2 \n\t" \
"psllq $32, %%mm3 \n\t" \
- "pand "MANGLE(mask24hh)", %%mm2\n\t" \
- "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
"por %%mm2, %%mm0 \n\t" \
"psrlq $16, %%mm1 \n\t" \
"psrlq $32, %%mm4 \n\t" \
"psllq $16, %%mm5 \n\t" \
"por %%mm3, %%mm1 \n\t" \
- "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
"por %%mm5, %%mm4 \n\t" \
\
MOVNTQ" %%mm0, (%0) \n\t" \
@@ -713,27 +710,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s
}
}
-/*
- I use less accurate approximation here by simply left-shifting the input
- value and filling the low order bits with zeroes. This method improves PNG
- compression but this scheme cannot reproduce white exactly, since it does
- not generate an all-ones maximum value; the net effect is to darken the
- image slightly.
-
- The better method should be "left bit replication":
-
- 4 3 2 1 0
- ---------
- 1 1 0 1 1
-
- 7 6 5 4 3 2 1 0
- ----------------
- 1 1 0 1 1 1 1 0
- |=======| |===|
- | leftmost bits repeated to fill open bits
- |
- original bits
-*/
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
@@ -752,9 +728,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -782,9 +759,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -830,9 +808,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x3E0)>>2;
- *d++ = (bgr&0x7C00)>>7;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
@@ -854,9 +832,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -884,9 +864,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -931,9 +913,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x7E0)>>3;
- *d++ = (bgr&0xF800)>>8;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
@@ -976,11 +958,12 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw %5, %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+ ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
:"memory");
d += 16;
s += 4;
@@ -990,9 +973,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x3E0)>>2;
- *d++ = (bgr&0x7C00)>>7;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
}
}
@@ -1017,11 +1000,13 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
+ ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
:"memory");
d += 16;
s += 4;
@@ -1031,9 +1016,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x7E0)>>3;
- *d++ = (bgr&0xF800)>>8;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
}
}
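
The rewritten scalar tails replicate each channel's top bits into the low output bits, so full-scale input reaches 255 again; the comment block removed above described exactly this flaw of the old plain shifts. For rgb555 white, as a worked example:

    uint16_t bgr = 0x7FFF;                                    /* B = G = R = 31 */
    uint8_t  b   = ((bgr & 0x1F) << 3) | ((bgr & 0x1F) >> 2); /* 248 | 7 == 255 */

The old "(bgr&0x1F)<<3" mapped 31 to 248, darkening the image slightly.
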
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 440a27b0ba..c6dafdebc3 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index fc74d97201..e82666bec9 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -1,20 +1,20 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,6 +23,7 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -70,6 +71,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
+
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
@@ -117,9 +119,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
- const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
- const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
- const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+ const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
@@ -186,7 +188,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
- ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
+ ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
@@ -197,12 +199,73 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
- ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
+ ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
+#if HAVE_MMXEXT
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ if(((int)dest) & 15){
+ return yuv2yuvX_MMX2(filter, filterSize, src, dest, dstW, dither, offset);
+ }
+ if (offset) {
+ __asm__ volatile("movq (%0), %%xmm3\n\t"
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "psrlq $24, %%xmm3\n\t"
+ "psllq $40, %%xmm4\n\t"
+ "por %%xmm4, %%xmm3\n\t"
+ :: "r"(dither)
+ );
+ } else {
+ __asm__ volatile("movq (%0), %%xmm3\n\t"
+ :: "r"(dither)
+ );
+ }
+ __asm__ volatile(
+ "pxor %%xmm0, %%xmm0\n\t"
+ "punpcklbw %%xmm0, %%xmm3\n\t"
+ "psraw $4, %%xmm3\n\t"
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "movl %3, %%ecx\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
+ "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
+ "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "pmulhw %%xmm0, %%xmm2 \n\t"\
+ "pmulhw %%xmm0, %%xmm5 \n\t"\
+ "paddw %%xmm2, %%xmm3 \n\t"\
+ "paddw %%xmm5, %%xmm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%xmm3 \n\t"\
+ "psraw $3, %%xmm4 \n\t"\
+ "packuswb %%xmm4, %%xmm3 \n\t"
+ "movntdq %%xmm3, (%1, %%"REG_c")\n\t"
+ "add $16, %%"REG_c" \n\t"\
+ "cmp %2, %%"REG_c" \n\t"\
+ "movdqa %%xmm7, %%xmm3\n\t"
+ "movdqa %%xmm7, %%xmm4\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "jb 1b \n\t"\
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+ : "%"REG_d, "%"REG_S, "%"REG_c
+ );
+}
+#endif
+
#endif /* HAVE_INLINE_ASM */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
@@ -216,10 +279,14 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt(
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
+ SCALE_FUNC(filter_n, 12, 15, opt); \
+ SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
+ SCALE_FUNC(filter_n, 12, 19, opt); \
+ SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
@@ -275,11 +342,14 @@ VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+ const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
- const uint8_t *src, const uint8_t *unused1, \
- int w, uint32_t *unused2)
+ const uint8_t *unused0, \
+ const uint8_t *src1, \
+ const uint8_t *src2, \
+ int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
@@ -312,21 +382,32 @@ av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
#if HAVE_MMXEXT_INLINE
if (cpu_flags & AV_CPU_FLAG_MMXEXT)
sws_init_swScale_MMX2(c);
+ if (cpu_flags & AV_CPU_FLAG_SSE3){
+ if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+ c->yuv2planeX = yuv2yuvX_sse3;
+ }
#endif
#endif /* HAVE_INLINE_ASM */
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
- } else /* c->srcBpc == 16 */ { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+ } else if (c->srcBpc == 12) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+ } else { /* c->srcBpc == 16 */ \
+ av_assert0(c->srcBpc == 16);\
+ hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
@@ -341,14 +422,15 @@ switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
- default: if (condition_8bit) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
+ default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
- default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
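
The new vertical scalers (yuv2yuvX_sse3 above and the MMX RENAME(yuv2yuvX) in swscale_template.c below) accumulate pmulhw products on top of a dither seed. A simplified scalar model (a sketch: the real code walks the {source pointer, coefficient} list built by updateMMXDitherTables and rotates the 8-byte dither line by "offset"; the names here are illustrative):

    #include <stdint.h>

    static void yuv2yuvX_model(const int16_t **src, const int16_t *coeff,
                               int filterSize, uint8_t *dest, int dstW,
                               const uint8_t *dither)
    {
        for (int i = 0; i < dstW; i++) {
            int acc = dither[i & 7] >> 4;                  /* the psraw $4 seed */
            for (int j = 0; j < filterSize; j++)
                acc += (src[j][i] * coeff[j]) >> 16;       /* pmulhw per tap */
            acc >>= 3;                                     /* final psraw $3 */
            dest[i] = acc < 0 ? 0 : acc > 255 ? 255 : acc; /* packuswb clip */
        }
    }
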
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index d89a26f6a5..62265db30f 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,25 +1,26 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#undef REAL_MOVNTQ
#undef MOVNTQ
+#undef MOVNTQ2
#undef PREFETCH
#if COMPILE_TEMPLATE_MMXEXT
@@ -30,11 +31,84 @@
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
+#if !COMPILE_TEMPLATE_MMXEXT
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{
+ if (rot) {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "psrlq $24, %%mm3\n\t"
+ "psllq $40, %%mm4\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ } else {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ }
+}
+#endif
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ dither_8to16(dither, offset);
+ __asm__ volatile(\
+ "psraw $4, %%mm3\n\t"
+ "psraw $4, %%mm4\n\t"
+ "movq %%mm3, %%mm6\n\t"
+ "movq %%mm4, %%mm7\n\t"
+ "movl %3, %%ecx\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
+ "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
+ "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "pmulhw %%mm0, %%mm2 \n\t"\
+ "pmulhw %%mm0, %%mm5 \n\t"\
+ "paddw %%mm2, %%mm3 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%mm3 \n\t"\
+ "psraw $3, %%mm4 \n\t"\
+ "packuswb %%mm4, %%mm3 \n\t"
+ MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
+ "add $8, %%"REG_c" \n\t"\
+ "cmp %2, %%"REG_c" \n\t"\
+ "movq %%mm6, %%mm3\n\t"
+ "movq %%mm7, %%mm4\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "jb 1b \n\t"\
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+ : "%"REG_d, "%"REG_S, "%"REG_c
+ );
+}
+
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
@@ -260,7 +334,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE
@@ -293,7 +367,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
@@ -350,7 +424,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -374,7 +448,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -427,7 +501,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -451,7 +525,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -584,7 +658,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -608,7 +682,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -649,7 +723,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -670,7 +744,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -786,8 +860,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
: "%r8"
);
#else
- *(const uint16_t **)(&c->u_temp)=abuf0;
- *(const uint16_t **)(&c->v_temp)=abuf1;
+ c->u_temp=(intptr_t)abuf0;
+ c->v_temp=(intptr_t)abuf1;
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
@@ -1559,9 +1633,9 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
enum AVPixelFormat dstFormat = c->dstFormat;
- if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
- dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) {
- if (!(c->flags & SWS_BITEXACT)) {
+ c->use_mmx_vfilter= 0;
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
+ && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
@@ -1574,6 +1648,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
} else {
+ c->use_mmx_vfilter= 1;
+ c->yuv2planeX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
@@ -1585,7 +1661,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
}
- }
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
@@ -1614,7 +1689,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
- if (c->srcBpc == 8 && c->dstBpc <= 10) {
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
// Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index 17ac3e2ffe..ea52e3c71b 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -7,20 +7,20 @@
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -74,10 +74,6 @@ av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
- if (c->srcFormat != AV_PIX_FMT_YUV420P &&
- c->srcFormat != AV_PIX_FMT_YUVA420P)
- return NULL;
-
#if HAVE_MMXEXT_INLINE
if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
switch (c->dstFormat) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index b028e93152..c879102cc4 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -4,20 +4,20 @@
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -43,17 +43,14 @@
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
- if (c->srcFormat == AV_PIX_FMT_YUV422P) { \
- srcStride[1] *= 2; \
- srcStride[2] *= 2; \
- } \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
- const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
- const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
@@ -141,6 +138,7 @@
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
+ : "memory" \
); \
} \
@@ -148,6 +146,7 @@
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
+ : "memory" \
); \
} \
@@ -188,7 +187,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(2)
@@ -216,7 +215,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(2)
@@ -306,7 +305,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(3)
@@ -324,7 +323,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(3)
@@ -368,7 +367,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -389,7 +388,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -411,7 +410,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -432,7 +431,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
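
The vshift changes replace the old trick of doubling srcStride[1]/srcStride[2] for YUV422P with explicit chroma-row addressing, which is what lets yuv2rgb.c drop its YUV420P-only early return above. The effect in two lines (paraphrasing the macro):

    int vshift = srcFormat != AV_PIX_FMT_YUV422P;              /* 0 for 4:2:2, 1 for 4:2:0 */
    const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; /* chroma row for luma row y */
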