summaryrefslogtreecommitdiff
path: root/libswscale/x86/input.asm
diff options
context:
space:
mode:
authorAndreas Rheinhardt <andreas.rheinhardt@outlook.com>2022-06-09 16:57:34 +0200
committerAndreas Rheinhardt <andreas.rheinhardt@outlook.com>2022-06-22 13:36:04 +0200
commita05f22eaf393177b94432431c145cbc5ba10390a (patch)
tree31a2ae01b520b2578477ea4b6b2febf1d0e6c1e6 /libswscale/x86/input.asm
parent2831837182fe26f0a19a4d366f3f0553311f1291 (diff)
swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2 and this means that some functions for MMX, MMXEXT, SSE and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2). So given that the only systems that benefit from these functions are truely ancient 32bit x86s they are removed. Moreover, some of the removed code was buggy/not bitexact and lead to failures involving the f32le and f32be versions of gray, gbrp and gbrap on x86-32 when SSE2 was not disabled. See e.g. https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx Notice that yuv2yuvX_mmx is not removed, because it is used by SSE3 and AVX2 as fallback in case of unaligned data and also for tail processing. I don't know why yuv2yuvX_mmxext isn't being used for this; an earlier version [1] of 554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f used it, but the version that was eventually applied does not. [1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
Diffstat (limited to 'libswscale/x86/input.asm')
-rw-r--r--libswscale/x86/input.asm82
1 files changed, 10 insertions, 72 deletions
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index fcdfe2fcd8..6de6733faa 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -133,23 +133,18 @@ SECTION .text
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
-%if mmsize == 8
- mova m5, [%2_Ycoeff_12x4]
- mova m6, [%2_Ycoeff_3x56]
-%define coeff1 m5
-%define coeff2 m6
-%elif ARCH_X86_64
+%if ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
-%else ; x86-32 && mmsize == 16
+%else ; x86-32
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
-%endif ; x86-32/64 && mmsize == 8/16
-%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
+%endif ; x86-32/64
+%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
-%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
@@ -184,7 +179,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
@@ -193,7 +187,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
@@ -215,7 +208,7 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
add wq, mmsize
jl .loop
REP_RET
-%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
+%endif ; ARCH_X86_64 && %0 == 3
%endmacro
; %1 = nr. of XMM registers
@@ -275,12 +268,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
-%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
-%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
@@ -294,12 +285,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
-%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
-%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
@@ -320,13 +309,8 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
-%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
-%else ; mmsize == 16
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
@@ -342,11 +326,6 @@ RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB24_FUNCS 0, 0
-%endif
-
INIT_XMM sse2
RGB24_FUNCS 10, 12
@@ -483,13 +462,8 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
-%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
-%else ; mmsize == 16
- mova [dstUq+wq], m0
- mova [dstVq+wq], m2
-%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
sub wq, mmsize - 1
@@ -535,11 +509,6 @@ RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-RGB32_FUNCS 0, 0
-%endif
-
INIT_XMM sse2
RGB32_FUNCS 8, 12
@@ -588,25 +557,18 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
movsxd wq, wd
%endif
add dstq, wq
-%if mmsize == 16
test srcq, 15
-%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
-%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_YUYV_TO_Y a, %2
-%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
@@ -632,16 +594,9 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
-%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
-%else ; mmsize == 8
- packuswb m1, m1 ; (byte) { U0, ... U3 }
- packuswb m0, m0 ; (byte) { V0, ... V3 }
- movh [dstUq+wq], m1
- movh [dstVq+wq], m0
-%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
@@ -661,24 +616,24 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%endif
add dstUq, wq
add dstVq, wq
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
-%if mmsize == 16 && %0 == 2
+%if %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
-%else ; mmsize == 8
+%else
neg wq
LOOP_YUYV_TO_UV a, %2
-%endif ; mmsize == 8/16
+%endif
%endmacro
; %1 = a (aligned) or u (unaligned)
@@ -716,35 +671,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%endif
add dstUq, wq
add dstVq, wq
-%if mmsize == 16
test srcq, 15
-%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
-%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
-%else ; mmsize == 8
- neg wq
- LOOP_NVXX_TO_UV a, %2
-%endif ; mmsize == 8/16
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-YUYV_TO_Y_FN 0, yuyv
-YUYV_TO_Y_FN 0, uyvy
-YUYV_TO_UV_FN 0, yuyv
-YUYV_TO_UV_FN 0, uyvy
-NVXX_TO_UV_FN 0, nv12
-NVXX_TO_UV_FN 0, nv21
-%endif
-
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy