Diffstat (limited to 'libswscale')
-rw-r--r--  libswscale/x86/input.asm            82
-rw-r--r--  libswscale/x86/output.asm           30
-rw-r--r--  libswscale/x86/scale.asm            45
-rw-r--r--  libswscale/x86/swscale.c            83
-rw-r--r--  libswscale/x86/swscale_template.c   30
5 files changed, 38 insertions, 232 deletions
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index fcdfe2fcd8..6de6733faa 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -133,23 +133,18 @@ SECTION .text
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
-%if mmsize == 8
- mova m5, [%2_Ycoeff_12x4]
- mova m6, [%2_Ycoeff_3x56]
-%define coeff1 m5
-%define coeff2 m6
-%elif ARCH_X86_64
+%if ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
-%else ; x86-32 && mmsize == 16
+%else ; x86-32
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
-%endif ; x86-32/64 && mmsize == 8/16
-%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+%endif ; x86-32/64
+%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
-%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
@@ -184,7 +179,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
@@ -193,7 +187,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
@@ -215,7 +208,7 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
add wq, mmsize
jl .loop
REP_RET
-%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%endif ; ARCH_X86_64 && %0 == 3
%endmacro
; %1 = nr. of XMM registers
@@ -275,12 +268,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
-%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
@@ -294,12 +285,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
-%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
@@ -320,13 +309,8 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
-%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
-%else ; mmsize == 16
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
@@ -342,11 +326,6 @@ RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB24_FUNCS 0, 0
-%endif
-
INIT_XMM sse2
RGB24_FUNCS 10, 12
@@ -483,13 +462,8 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
-%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
-%else ; mmsize == 16
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
sub wq, mmsize - 1
@@ -535,11 +509,6 @@ RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB32_FUNCS 0, 0
-%endif
-
INIT_XMM sse2
RGB32_FUNCS 8, 12
@@ -588,25 +557,18 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
movsxd wq, wd
%endif
add dstq, wq
-%if mmsize == 16
test srcq, 15
-%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
-%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_YUYV_TO_Y a, %2
-%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
@@ -632,16 +594,9 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
-%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
-%else ; mmsize == 8
- packuswb m1, m1 ; (byte) { U0, ... U3 }
- packuswb m0, m0 ; (byte) { V0, ... V3 }
- movh [dstUq+wq], m1
- movh [dstVq+wq], m0
-%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
@@ -661,24 +616,24 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%endif
add dstUq, wq
add dstVq, wq
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
-%else ; mmsize == 8
+%else
neg wq
LOOP_YUYV_TO_UV a, %2
-%endif ; mmsize == 8/16
+%endif
%endmacro
; %1 = a (aligned) or u (unaligned)
@@ -716,35 +671,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%endif
add dstUq, wq
add dstVq, wq
-%if mmsize == 16
test srcq, 15
-%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
-%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_NVXX_TO_UV a, %2
-%endif ; mmsize == 8/16
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-YUYV_TO_Y_FN 0, yuyv
-YUYV_TO_Y_FN 0, uyvy
-YUYV_TO_UV_FN 0, yuyv
-YUYV_TO_UV_FN 0, uyvy
-NVXX_TO_UV_FN 0, nv12
-NVXX_TO_UV_FN 0, nv21
-%endif
-
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 1e498fddf6..84e94baaf6 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -312,11 +312,9 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; %1 == 8/9/10/16
%endmacro
-%if ARCH_X86_32
+%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
-yuv2planeX_fn 9, 0, 5
-yuv2planeX_fn 10, 0, 5
%endif
INIT_XMM sse2
@@ -407,19 +405,11 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
movq m3, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
-%if mmsize == 16
punpcklqdq m3, m3
-%endif ; mmsize == 16
PALIGNR m3, m3, 3, m2
.no_rot:
-%if mmsize == 8
- mova m2, m3
- punpckhbw m3, m4 ; byte->word
- punpcklbw m2, m4 ; byte->word
-%else
punpcklbw m3, m4
mova m2, m3
-%endif
%elif %1 == 9
pxor m4, m4
mova m3, [pw_512]
@@ -431,36 +421,22 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
mova m4, [pd_4]
-%else ; mmx/sse2
+%else ; sse2
mova m4, [pd_4min0x40000]
mova m5, [minshort]
-%endif ; mmx/sse2/sse4/avx
+%endif ; sse2/sse4/avx
%endif ; %1 == ..
; actual pixel scaling
-%if mmsize == 8
- yuv2plane1_mainloop %1, a
-%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2plane1_mainloop %1, a
REP_RET
.unaligned:
yuv2plane1_mainloop %1, u
-%endif ; mmsize == 8/16
REP_RET
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-yuv2plane1_fn 8, 0, 5
-yuv2plane1_fn 16, 0, 3
-
-INIT_MMX mmxext
-yuv2plane1_fn 9, 0, 3
-yuv2plane1_fn 10, 0, 3
-%endif
-
INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 83cabff722..c62ae3dcc2 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -61,13 +61,11 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%define mov32 mov
%endif ; x86-64
%if %2 == 19
-%if mmsize == 8 ; mmx
- mova m2, [max_19bit_int]
-%elif cpuflag(sse4)
+%if cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
@@ -144,12 +142,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
-%if mmsize == 8 ; mmx
- movq m4, m0
- punpckldq m0, m1
- punpckhdq m4, m1
- paddd m0, m4
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
@@ -159,7 +152,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
@@ -197,14 +190,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
-%if mmsize == 8
- paddd m0, m1
- paddd m4, m5
- movq m1, m0
- punpckldq m0, m4
- punpckhdq m1, m4
- paddd m0, m1
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
@@ -233,7 +219,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
@@ -274,7 +260,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
mov srcq, srcmemmp
.innerloop:
- ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+ ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
@@ -319,12 +305,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
lea filterq, [filterq+(fltsizeq+dlt)*2]
-%if mmsize == 8 ; mmx
- movq m0, m4
- punpckldq m4, m5
- punpckhdq m0, m5
- paddd m0, m4
-%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
@@ -344,7 +324,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
-%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
@@ -372,7 +351,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
- add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+ add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels
; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
add wq, 2
@@ -385,12 +364,8 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
-%if mmsize == 8
-SCALE_FUNC %1, %2, X, X, 7, %3
-%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
-%endif
%endmacro
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
@@ -411,10 +386,6 @@ SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-SCALE_FUNCS2 0, 0, 0
-%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 73869355b8..97bbc4f2d0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -54,14 +54,6 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
-//MMX versions
-#if HAVE_MMX_INLINE
-#undef RENAME
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define RENAME(a) a ## _mmx
-#include "swscale_template.c"
-#endif
-
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
@@ -269,9 +261,6 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)
-#if ARCH_X86_32
-SCALE_FUNCS_MMX(mmx);
-#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
@@ -288,9 +277,7 @@ void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)
-#if ARCH_X86_32
-VSCALEX_FUNCS(mmxext);
-#endif
+VSCALEX_FUNC(8, mmxext);
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
@@ -305,9 +292,6 @@ void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int ds
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)
-#if ARCH_X86_32
-VSCALE_FUNCS(mmx, mmxext);
-#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
@@ -337,9 +321,6 @@ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
-#if ARCH_X86_32
-INPUT_FUNCS(mmx);
-#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
@@ -470,19 +451,11 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_MMX_INLINE
- if (INLINE_MMX(cpu_flags))
- sws_init_swscale_mmx(c);
-#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
#endif
if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) {
-#if HAVE_MMX_EXTERNAL
- if (EXTERNAL_MMX(cpu_flags))
- c->yuv2planeX = yuv2yuvX_mmx;
-#endif
#if HAVE_MMXEXT_EXTERNAL
if (EXTERNAL_MMXEXT(cpu_flags))
c->yuv2planeX = yuv2yuvX_mmxext;
@@ -496,6 +469,14 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
c->yuv2planeX = yuv2yuvX_avx2;
#endif
}
+#if ARCH_X86_32 && !HAVE_ALIGNED_STACK
+ // The better yuv2planeX_8 functions need aligned stack on x86-32,
+ // so we use MMXEXT in this case if they are not available.
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ if (c->dstBpc == 8 && !c->use_mmx_vfilter)
+ c->yuv2planeX = ff_yuv2planeX_8_mmxext;
+ }
+#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
@@ -519,12 +500,6 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
-#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
- switch (filtersize) { \
- case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
- case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
- default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
- }
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
@@ -546,46 +521,6 @@ switch(c->dstBpc){ \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags)) {
- ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
- ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
- ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
-
- switch (c->srcFormat) {
- case AV_PIX_FMT_YA8:
- c->lumToYV12 = ff_yuyvToY_mmx;
- if (c->needAlpha)
- c->alpToYV12 = ff_uyvyToY_mmx;
- break;
- case AV_PIX_FMT_YUYV422:
- c->lumToYV12 = ff_yuyvToY_mmx;
- c->chrToYV12 = ff_yuyvToUV_mmx;
- break;
- case AV_PIX_FMT_UYVY422:
- c->lumToYV12 = ff_uyvyToY_mmx;
- c->chrToYV12 = ff_uyvyToUV_mmx;
- break;
- case AV_PIX_FMT_NV12:
- c->chrToYV12 = ff_nv12ToUV_mmx;
- break;
- case AV_PIX_FMT_NV21:
- c->chrToYV12 = ff_nv21ToUV_mmx;
- break;
- case_rgb(rgb24, RGB24, mmx);
- case_rgb(bgr24, BGR24, mmx);
- case_rgb(bgra, BGRA, mmx);
- case_rgb(rgba, RGBA, mmx);
- case_rgb(abgr, ABGR, mmx);
- case_rgb(argb, ARGB, mmx);
- default:
- break;
- }
- }
- if (EXTERNAL_MMXEXT(cpu_flags)) {
- ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
- }
-#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 97d8cae613..6190fcb4fe 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -29,13 +29,8 @@
#undef PREFETCH
-#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
-#else
-#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
-#define MOVNTQ2 "movq "
-#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
#define YSCALEYUV2PACKEDX_UV \
@@ -600,13 +595,8 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
-#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
-#else
-#undef WRITEBGR24
-#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
-#endif
#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
@@ -1478,17 +1468,13 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
if (c->srcBpc == 8 && c->dstBpc <= 14) {
- // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
-#if COMPILE_TEMPLATE_MMXEXT
- if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
- c->hyscale_fast = ff_hyscale_fast_mmxext;
- c->hcscale_fast = ff_hcscale_fast_mmxext;
- } else {
-#endif /* COMPILE_TEMPLATE_MMXEXT */
- c->hyscale_fast = NULL;
- c->hcscale_fast = NULL;
-#if COMPILE_TEMPLATE_MMXEXT
- }
-#endif /* COMPILE_TEMPLATE_MMXEXT */
+ // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
+ if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
+ c->hyscale_fast = ff_hyscale_fast_mmxext;
+ c->hcscale_fast = ff_hcscale_fast_mmxext;
+ } else {
+ c->hyscale_fast = NULL;
+ c->hcscale_fast = NULL;
+ }
}
}